├── elit ├── server │ ├── __init__.py │ ├── parser_config.py │ ├── en_parser.py │ ├── en_util.py │ ├── format.py │ ├── service_parser.py │ ├── service_tokenizer.py │ └── en.py ├── components │ ├── parsers │ │ ├── udify │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── biaffine │ │ │ ├── __init__.py │ │ │ ├── mlp.py │ │ │ └── biaffine.py │ │ ├── constituency │ │ │ └── __init__.py │ │ └── second_order │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── treecrf_decoder.py │ ├── amr │ │ ├── seq2seq │ │ │ ├── __init__.py │ │ │ ├── dataset │ │ │ │ ├── __init__.py │ │ │ │ ├── IO.py │ │ │ │ ├── penman.py │ │ │ │ └── dataset.py │ │ │ └── evaluation.py │ │ ├── __init__.py │ │ └── amr_parser │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── amrio.py │ │ │ ├── customized_bart.py │ │ │ └── utils.py │ ├── srl │ │ ├── span_rank │ │ │ ├── util.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── eos │ │ └── __init__.py │ ├── mtl │ │ ├── __init__.py │ │ ├── tasks │ │ │ ├── ner │ │ │ │ └── __init__.py │ │ │ └── srl │ │ │ │ └── __init__.py │ │ └── loss_balancer.py │ ├── ner │ │ ├── __init__.py │ │ ├── biaffine_ner │ │ │ └── __init__.py │ │ └── rnn_ner.py │ ├── taggers │ │ ├── __init__.py │ │ ├── transformers │ │ │ └── __init__.py │ │ └── util.py │ ├── classifiers │ │ └── __init__.py │ ├── coref │ │ ├── __init__.py │ │ └── util.py │ ├── distillation │ │ ├── __init__.py │ │ ├── distillable_component.py │ │ └── schedulers.py │ ├── __init__.py │ ├── lambda_wrapper.py │ └── lemmatizer.py ├── pretrained │ ├── amr.py │ ├── dep.py │ ├── __init__.py │ ├── mtl.py │ └── coref.py ├── datasets │ ├── __init__.py │ ├── coref │ │ └── __init__.py │ ├── eos │ │ └── __init__.py │ ├── ner │ │ ├── __init__.py │ │ └── conll03.py │ ├── srl │ │ ├── __init__.py │ │ └── ontonotes5 │ │ │ └── __init__.py │ ├── parsing │ │ ├── __init__.py │ │ ├── ptb.py │ │ ├── semeval15.py │ │ └── conll_dataset.py │ └── lm │ │ └── __init__.py ├── layers │ ├── __init__.py │ ├── crf │ │ └── __init__.py │ ├── embeddings │ │ ├── 
__init__.py │ │ ├── util.py │ │ ├── fast_text.py │ │ └── char_rnn.py │ ├── pass_through_encoder.py │ ├── feed_forward.py │ ├── transformers │ │ ├── __init__.py │ │ └── pt_imports.py │ └── context_layer.py ├── metrics │ ├── __init__.py │ ├── amr │ │ └── __init__.py │ ├── parsing │ │ ├── __init__.py │ │ ├── attachmentscore.py │ │ ├── conllx_eval.py │ │ ├── span.py │ │ └── labeled_f1.py │ ├── srl │ │ ├── __init__.py │ │ └── srlconll.py │ ├── chunking │ │ ├── __init__.py │ │ ├── chunking_f1.py │ │ └── binary_chunking_f1.py │ ├── accuracy.py │ ├── metric.py │ ├── mtl.py │ └── f1.py ├── transform │ └── __init__.py ├── common │ ├── __init__.py │ ├── constant.py │ └── component.py ├── version.py ├── utils │ ├── __init__.py │ ├── init_util.py │ ├── rules.py │ ├── reflection.py │ └── string.py ├── main.py └── __init__.py ├── tests ├── demo │ ├── __init__.py │ └── train_amr3_parser.py ├── __init__.py ├── test_parser.py ├── test_seq2seq_amr.py ├── test_tokenizer.py ├── test_joint_model.py ├── test_service_coref_doc.py └── test_service_coref_online.py ├── LICENSE ├── docs ├── semantic_role_labeling.md ├── named_entity_recognition.md ├── tokenization.md ├── dependency_parsing.md ├── constituency_parsing.md ├── part_of_speech_tagging.md └── abstract_meaning_parsing.md ├── README.md ├── .github └── workflows │ └── python-package.yml └── setup.py /elit/server/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-18 18:31 4 | -------------------------------------------------------------------------------- /tests/demo/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-19 15:11 4 | -------------------------------------------------------------------------------- /elit/components/parsers/udify/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-21 20:21 -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:24 4 | -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:29 4 | -------------------------------------------------------------------------------- /elit/pretrained/amr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-10-31 14:35 4 | from elit.common.constant import ELIT_URL 5 | 6 | AMR3_BART_LARGE_EN = ELIT_URL + 'amr3_bart_large_elit_20211030_173300.zip' 7 | 8 | ALL = {} 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-18 19:21 4 | import os 5 | 6 | root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) 7 | 8 | 9 | def cdroot(): 10 | """ 11 | cd to project root, so models are saved in the root folder 12 | """ 13 | os.chdir(root) 14 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import elit 3 | 4 | 5 | class TestAMR(unittest.TestCase): 6 | def test_parse(self): 7 | parser = elit.load('KOREAN_TREEBANK_BIAFFINE_DEP') 8 | tree = 
parser('그는 르노가 3 월말까지 인수제의 시한을 갖고 있다고 덧붙였다 .'.split()) 9 | print(tree) 10 | 11 | 12 | if __name__ == '__main__': 13 | unittest.main() 14 | -------------------------------------------------------------------------------- /elit/pretrained/dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-03-26 01:06 4 | from elit.common.constant import ELIT_URL 5 | 6 | KOREAN_TREEBANK_BIAFFINE_DEP = ELIT_URL + 'ktb-bert-base-multilingual-cased_20220326_004709.zip' 7 | 'A single task dependency parsing model for Korea trained on the Korean Treebank. UAS = 92.42% LAS = 90.48%.' 8 | 9 | ALL = {} 10 | -------------------------------------------------------------------------------- /tests/test_seq2seq_amr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import elit 3 | import elit.pretrained.amr 4 | 5 | 6 | class TestAMR(unittest.TestCase): 7 | def test_parse(self): 8 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 9 | amr = parser('The boy wants the girl to believe him.') 10 | print(amr) 11 | 12 | 13 | if __name__ == '__main__': 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /elit/components/srl/span_rank/util.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL 2 | import torch 3 | 4 | 5 | def block_orth_normal_initializer(input_size, output_size): 6 | weight = [] 7 | for o in output_size: 8 | for i in input_size: 9 | param = torch.FloatTensor(o, i) 10 | torch.nn.init.orthogonal_(param) 11 | weight.append(param) 12 | return torch.cat(weight) 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 
2020 Emory University 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/evaluation.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import penman 4 | 5 | 6 | def write_predictions(predictions_path, tokenizer, graphs): 7 | pieces = [penman.encode(g) for g in graphs] 8 | Path(predictions_path).write_text('\n\n'.join(pieces).replace(tokenizer.INIT, '')) 9 | return predictions_path 10 | 11 | 12 | def compute_smatch(pred, gold): 13 | import smatch 14 | with Path(pred).open() as p, Path(gold).open() as g: 15 | score = next(smatch.score_amr_pairs(p, g)) 16 | return score[2] 17 | 18 | 19 | def compute_bleu(gold_sentences, pred_sentences): 20 | from sacrebleu import corpus_bleu 21 | return corpus_bleu(pred_sentences, [gold_sentences]) 22 | -------------------------------------------------------------------------------- /elit/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/crf/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/common/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | -------------------------------------------------------------------------------- /elit/components/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/coref/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/taggers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/coref/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | -------------------------------------------------------------------------------- /elit/components/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/tasks/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/tasks/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/srl/span_rank/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/ner/biaffine_ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/biaffine/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/amr/amr_parser/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | -------------------------------------------------------------------------------- /elit/components/parsers/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/second_order/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/taggers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from .pipeline import Pipeline -------------------------------------------------------------------------------- /elit/version.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | __version__ = '2.0.0-alpha.10' 21 | -------------------------------------------------------------------------------- /elit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from . import rules 20 | from . import util 21 | -------------------------------------------------------------------------------- /elit/server/parser_config.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# Name of the default pretrained multi-task model the parser service loads.
# It matches a model constant exported by ``elit.pretrained.mtl`` (an English
# RoBERTa-base model; the acronym lists the covered tasks: LEM, POS, NER,
# DEP, SDP, CON, AMR).
MTL_MODEL = 'LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN'
" 14 | print(tokenizer.tokenize(text)) 15 | 16 | def test_segment(self): 17 | tokenizer = EnglishTokenizer() 18 | print(tokenizer.segment( 19 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 20 | 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 21 | 'University', '.'])) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /docs/semantic_role_labeling.md: -------------------------------------------------------------------------------- 1 | # Semantic Role Labeling 2 | 3 | Semantic role labeling parses the predicate-argument structure of a sentence often answers “who did what to whom”. Currently, ELIT supports span-based SRL. 4 | 5 | Users can load a MTL component and specify `tasks='srl'` to have sentences parsed by ELIT. E.g., 6 | 7 | ```python 8 | import elit 9 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 10 | print(nlp([['Emory','NLP','is','in','Atlanta']], tasks='srl')) 11 | ``` 12 | 13 | Outputs: 14 | 15 | ```python 16 | { 17 | "tok": [ 18 | ["Emory", "NLP", "is", "in", "Atlanta"] 19 | ], 20 | "srl": [ 21 | [[["ARG1", 0, 2, "Emory NLP"], ["PRED", 2, 3, "is"], ["ARG2", 3, 5, "in Atlanta"]]] 22 | ], 23 | } 24 | ``` 25 | 26 | `srl` stores the `(role, start, end, form)` of the predicates and arguments corresponding to each flattened predicate-argument structure. In this component, the OntoNotes 5 SRL annotations are used with an additional role `PRED` indicating the predicate. 
27 | 28 | -------------------------------------------------------------------------------- /elit/datasets/srl/ontonotes5/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/ontonotes-release-5.0.tgz#data/' 20 | CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/' 21 | -------------------------------------------------------------------------------- /elit/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from elit.pretrained import mtl 20 | from elit.pretrained import coref 21 | from elit.pretrained import amr 22 | from elit.pretrained import dep 23 | 24 | # Will be filled up during runtime 25 | ALL = {} 26 | -------------------------------------------------------------------------------- /elit/common/constant.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def read_raw_amr_data(
        paths: List[Union[str, Path]],
        use_recategorization=False,
        dereify=True,
        remove_wiki=False,
):
    """Load AMR graphs from one or more paths or glob patterns.

    Args:
        paths: A single path/glob pattern, or a collection of them.
        use_recategorization: If ``True``, rebuild each graph's ``snt``
            metadata from its ``tokens`` field, dropping PTB-style bracket
            tokens (``-LRB-``, ``-RRB-``, …); the original sentence is kept
            under ``snt_orig``.
        dereify: Passed through to :func:`pm_load`.
        remove_wiki: Passed through to :func:`pm_load`.

    Returns:
        A non-empty list of penman graphs (asserts otherwise).
    """
    import ast  # stdlib; local import keeps this module's import block untouched

    assert paths

    # A bare str/Path is a single pattern, not a collection of patterns.
    # Without the str/Path check, a str would pass the Iterable test and be
    # iterated character by character, globbing each character separately.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for path_ in paths:
        for path in glob.glob(str(path_)):
            path = Path(path)
            assert path.exists(), f'{path} not exist'
            graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))

    assert graphs, 'No graphs loaded'

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata['snt_orig'] = metadata['snt']
            # literal_eval instead of eval: the tokens metadata is a list
            # literal read from data files, which must never be executed as
            # arbitrary code.
            tokens = ast.literal_eval(metadata['tokens'])
            metadata['snt'] = ' '.join(
                [t for t in tokens if not (t.startswith(('-L', '-R')) and t.endswith('-'))])

    return graphs
Recognition](docs/named_entity_recognition.md) 8 | * [Constituency Parsing](docs/constituency_parsing.md) 9 | * [Dependency Parsing](docs/dependency_parsing.md) 10 | * [Semantic Role Labeling](docs/semantic_role_labeling.md) 11 | * [AMR Parsing](docs/abstract_meaning_parsing.md) 12 | * Coreference Resolution 13 | * Emotion Detection 14 | 15 | This is an open-source project led by the [Emory NLP](http://nlp.cs.emory.edu/) Research Laboratory and released under the [Apache License 2.0](LICENSE). 16 | 17 | * The latest release: [2.0](https://pypi.org/project/elit/) 18 | * Contact: [Jinho D. Choi](http://www.cs.emory.edu/~choi) 19 | 20 | ## Resources 21 | 22 | * [Getting Started](docs/getting_started.md) 23 | * [Data Format](docs/data_format.md) 24 | 25 | ## Citation 26 | 27 | * [ELIT: Emory Language and Information Toolkit](https://arxiv.org/abs/2109.03903), Han He, Liyan Xu, and Jinho D. Choi, arXiv, 2109.03903, cs.CL, 2021. 28 | -------------------------------------------------------------------------------- /elit/layers/pass_through_encoder.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class PassThroughEncoder(_PassThroughEncoder, ConfigTracker):
    """Identity (pass-through) encoder layer.

    Thin wrapper around ``alnlp``'s ``PassThroughEncoder`` — presumably it
    returns its input unchanged (the forward logic lives in the base class;
    confirm against ``alnlp``). Mixing in :class:`ConfigTracker` records the
    constructor arguments so the layer can be re-instantiated from a saved
    config.
    """

    def __init__(self, input_dim: int) -> None:
        # input_dim doubles as the output dimension of a pass-through layer.
        super().__init__(input_dim)
        # Capture ctor args (via locals()) for config serialization.
        ConfigTracker.__init__(self, locals())
def embedding_uniform(tensor: torch.Tensor, seed=233):
    """Fill ``tensor`` in place with reproducible uniform random values.

    The bound is ``sqrt(3 / fan_out)`` with ``fan_out`` taken as the last
    dimension (the embedding size). A dedicated, seeded ``torch.Generator``
    makes the initialization deterministic regardless of the global RNG state.

    Args:
        tensor: The embedding weight tensor to initialize (modified in place).
        seed: Seed for the dedicated random generator.

    Returns:
        The same tensor, filled in place.
    """
    limit = math.sqrt(3.0 / tensor.size(-1))
    rng = torch.Generator().manual_seed(seed)
    with torch.no_grad():
        return tensor.uniform_(-limit, limit, generator=rng)
class MovingAverage(object):
    """Tracks a fixed-length history of values per key and exposes their mean."""

    def __init__(self, maxlen=5) -> None:
        # One bounded deque per key; the oldest values fall off automatically.
        self._queue = defaultdict(lambda: deque(maxlen=maxlen))

    def append(self, key, value: float):
        """Record ``value`` in the history of ``key``."""
        self._queue[key].append(value)

    def average(self, key) -> float:
        """Mean of the recorded values for ``key`` (fails on an empty history)."""
        window = self._queue[key]
        return sum(window) / len(window)


class MovingAverageBalancer(MovingAverage, ConfigTracker):
    """Weights each task's loss by the ratio of the summed per-task average
    losses to that task's own average, optionally scaled by how the task's
    latest loss compares to its running average."""

    def __init__(self, maxlen=5, intrinsic_weighting=True) -> None:
        super().__init__(maxlen)
        ConfigTracker.__init__(self, locals())
        self.intrinsic_weighting = intrinsic_weighting

    def weight(self, task) -> float:
        """Return the loss weight for ``task`` from the recorded histories."""
        means = {name: self.average(name) for name in self._queue}
        w = sum(means.values()) / means[task]
        if self.intrinsic_weighting:
            # Boost tasks whose most recent loss exceeds their running average.
            latest = self._queue[task][-1]
            w *= latest / means[task]
        return w
Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from elit.common.constant import ELIT_URL 20 | 21 | LEM_POS_NER_DEP_SDP_CON_AMR_ELECTRA_BASE_EN = ELIT_URL + 'en_pos_ner_srl_dep_con_amr_electra_base_20201222.zip' 22 | LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN = ELIT_URL + 'en_pos_ner_srl_dep_con_amr_roberta_base_20210628_234225.zip' 23 | LEM_POS_NER_DEP_ROBERTA_BASE_EN = ELIT_URL + 'en_lem_pos_ner_ddr_roberta_base_20210325_121606.zip' 24 | 25 | ALL = {} 26 | -------------------------------------------------------------------------------- /elit/pretrained/coref.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | from elit.common.constant import ELIT_URL 20 | 21 | DOC_COREF_SPANBERT_BASE_EN = ELIT_URL + 'doc_coref_spanbert_base_20201222.zip' 22 | DOC_COREF_SPANBERT_LARGE_EN = ELIT_URL + 'doc_coref_spanbert_large_20201222.zip' 23 | ONLINE_COREF_SPANBERT_BASE_EN = ELIT_URL + 'online_coref_spanbert_base_20201222.zip' 24 | ONLINE_COREF_SPANBERT_LARGE_EN = ELIT_URL + 'online_coref_spanbert_large_20201222.zip' 25 | 26 | ALL = {} 27 | -------------------------------------------------------------------------------- /elit/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class CategoricalAccuracy(metrics.CategoricalAccuracy, Metric):
    """Multi-class accuracy, adapting alnlp's implementation to ELIT's Metric API."""

    @property
    def score(self):
        # Delegates to alnlp's accumulated metric; presumably a fraction in
        # [0, 1] given the percent formatting below — TODO confirm against alnlp.
        return self.get_metric()

    def __repr__(self) -> str:
        return f'Accuracy:{self.score:.2%}'
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | _PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#' 21 | PTB_TOKEN_TRAIN = _PTB_HOME + 'data/ptb.train.txt' 22 | PTB_TOKEN_DEV = _PTB_HOME + 'data/ptb.valid.txt' 23 | PTB_TOKEN_TEST = _PTB_HOME + 'data/ptb.test.txt' 24 | 25 | PTB_CHAR_TRAIN = _PTB_HOME + 'data/ptb.char.train.txt' 26 | PTB_CHAR_DEV = _PTB_HOME + 'data/ptb.char.valid.txt' 27 | PTB_CHAR_TEST = _PTB_HOME + 'data/ptb.char.test.txt' 28 | -------------------------------------------------------------------------------- /elit/server/en_parser.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Shared English tokenizer wiring sentence segmentation (eos) and tokenization.
service_tokenizer = ServiceTokenizer(eos, tokenize)
# Module-level parser singleton; loading the MTL model happens once at import
# time, so importing this module is expensive by design.
service_parser = ServiceParser(
    service_tokenizer=service_tokenizer,
    model=elit.load(parser_config.MTL_MODEL)
)
# noinspection PyAbstractClass
class DependencyModel(nn.Module):
    """Composes an embedding layer, a contextual encoder and a task decoder
    into a single dependency-parsing network.
    """

    def __init__(self, embed: nn.Module, encoder: nn.Module, decoder: nn.Module):
        super().__init__()
        self.embed = embed
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, batch, mask):
        """Run embed -> encode -> decode and return the decoder's output."""
        return self.decoder(self.encoder(self.embed(batch, mask=mask), mask), mask=mask)
In the `ner` field, each entity is represented as a tuple of `(type, start, end, form)`.
class FeedForward(feedforward.FeedForward, ConfigTracker):
    """Feed-forward network layer.

    Thin wrapper over alnlp's ``FeedForward`` that also records its constructor
    arguments via ``ConfigTracker`` so the layer can be re-instantiated from a
    saved configuration.

    Args:
        input_dim: Dimensionality of the input features.
        num_layers: Number of linear layers.
        hidden_dims: Output size of each layer (one int, or one per layer).
        activations: Activation name(s), one shared or one per layer.
        dropout: Dropout probability (one shared or one per layer).
    """

    def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]],
                 activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None:
        super().__init__(input_dim, num_layers, hidden_dims, activations, dropout)
        # Capture the call arguments so the config can rebuild this layer later.
        ConfigTracker.__init__(self, locals())
class ChunkingF1(F1):
    """Span-level F1 over sequence-labeling tag sequences.

    Tag sequences are decoded into entity spans and compared as sets, so only
    exact span-and-type matches count as correct.
    """

    def __call__(self, pred_tags: List[List[str]], gold_tags: List[List[str]]):
        for predicted, expected in zip(pred_tags, gold_tags):
            predicted_spans = set(get_entities(predicted))
            expected_spans = set(get_entities(expected))
            self.nb_pred += len(predicted_spans)
            self.nb_true += len(expected_spans)
            self.nb_correct += len(predicted_spans & expected_spans)
def tokenize(sents: List[str]) -> List[List[str]]:
    """Tokenize each sentence, honoring pre-tokenized input.

    Since text from users seldom contains NUL, a '\0' character inside a
    sentence marks it as pre-tokenized: it is split on '\0' instead of being
    run through the rule-based tokenizer.
    """
    results = []
    for sent in sents:
        if '\0' in sent:
            results.append(sent.split('\0'))
        else:
            results.append(tokenizer.tokenize(sent))
    return results
'Reentrancies', 33 | 'Negations', 34 | 'Named Ent.', 35 | 'Wikification'] 36 | print('\t'.join(f'{test_score[k].score * 100:.1f}' for k in finegrained)) 37 | cprint(f'Model saved in [cyan]{save_dir}[/cyan]') 38 | print(f'To perform wikification, refer to [BLINK](https://github.com/SapienzaNLP/spring/blob/8fad6a4ce59132b22d6bdb4d4eb3a9aa5223ead6/bin/blinkify.py) which will give you the extra `0.3` F1.') 39 | -------------------------------------------------------------------------------- /elit/common/component.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class Component(Configurable, ABC):
    """Base class of every ELIT component: a configurable object that predicts on data."""

    @abstractmethod
    def predict(self, data: Any, **kwargs):
        """Predict on data.

        Args:
            data: Input of any type; the accepted type is defined by each subclass.
            **kwargs: Additional subclass-specific arguments.

        Returns:
            Prediction results; the concrete type is defined by each subclass.

        Raises:
            NotImplementedError: If the subclass does not override this method.
        """
        # inspect.stack()[0][3] yields the current method name for the message.
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def __call__(self, data, **kwargs):
        # Syntactic sugar: component(data) delegates to component.predict(data).
        return self.predict(data, **kwargs)
def guess_tagging_scheme(labels: List[str]) -> str:
    """Infer the tagging scheme used by a label set.

    The prefix before the first '-' of each label (e.g. ``B`` in ``B-PER``)
    is collected; if the resulting prefix set exactly matches one of the known
    schemes, that scheme name is returned.

    Args:
        labels: All tag labels of a tagset, e.g. ``['B-PER', 'I-PER', 'O']``.

    Returns:
        One of ``'BIO'``, ``'BIOUL'``, ``'BMES'``, ``'IOBES'``, or ``None``
        when no scheme matches.
    """
    prefixes = {label.split('-')[0] for label in labels}
    for candidate in ('BIO', 'BIOUL', 'BMES', 'IOBES'):
        if prefixes == set(candidate):
            return candidate
    return None
class Input(BaseModel):
    """A single parsing request accepted by the ELIT server."""

    # Raw text: one document string, or a list of pre-segmented sentences.
    text: Union[str, List[str]] = None
    # Pre-tokenized sentences; presumably used instead of ``text`` when
    # provided — confirm against the service parser.
    tokens: List[List[str]] = None
    # Which annotators to run; defaults to everything the server offers.
    models: List[str] = ["lem", "pos", "ner", "con", "dep", "srl", "amr", "dcr", "ocr"]

    # For coref
    speaker_ids: Union[int, List[int]] = None
    genre: str = None
    # Incremental state carried across turns for online coreference.
    coref_context: OnlineCorefContext = None
    return_coref_prob: bool = False

    language: str = 'en'
    verbose: bool = True
# Maps PTB bracket escape tokens (e.g. -LRB-) and typographic quotes/dashes to
# their plain ASCII equivalents.
PTB_TOKEN_MAPPING = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": '"',
    "''": '"',
    "`": "'",
    '«': '"',
    '»': '"',
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '„': '"',
    '‹': "'",
    '›': "'",
    "\u2013": "--",  # en dash
    "\u2014": "--",  # em dash
}
class LambdaComponent(Component):
    """Wraps an arbitrary callable as a Component so plain functions can join pipelines."""

    def __init__(self, function: Callable) -> None:
        super().__init__()
        self.function = function
        # Store the import path so the callable can be restored by from_config.
        self.config['function'] = classpath_of(function)

    def predict(self, data: Any, **kwargs):
        """Invoke the wrapped callable on ``data``.

        The private keyword ``_elit_unpack``, when truthy, causes ``data`` to
        be unpacked as positional arguments; it is popped and never forwarded.
        """
        unpack = kwargs.pop('_elit_unpack', None)
        if unpack:
            return self.function(*data, **kwargs)
        return self.function(data, **kwargs)

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Re-create the component from a serialized config.

        ``meta['classpath']`` names the component class and ``meta['function']``
        names the wrapped callable; both are resolved by import path.
        """
        cls = str_to_type(meta['classpath'])
        function = meta['function']
        function = object_from_classpath(function)
        return cls(function)
class Metric(ABC):
    """Abstract base class for evaluation metrics.

    A metric exposes a scalar ``score`` and supports rich comparisons against
    plain numbers, so metrics can be used directly in expressions such as
    ``if metric > best_score:``.

    NOTE(review): defining ``__eq__`` leaves subclasses unhashable unless they
    also define ``__hash__``; no visible caller hashes metrics, so this is
    deliberately left unchanged.
    """

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __eq__(self, other):
        return self.score == other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    def __ne__(self, other):
        return self.score != other

    @property
    @abstractmethod
    def score(self):
        """The scalar value of this metric."""
        pass

    @abstractmethod
    def __call__(self, pred, gold, mask=None):
        """Accumulate statistics from a batch of predictions and gold labels."""
        pass

    def __repr__(self) -> str:
        # Bug fix: the format spec belongs inside the braces. The original
        # f'{self.score}:.4f' rendered the literal text ':.4f' after the score.
        return f'{self.score:.4f}'

    def __float__(self):
        return self.score

    @abstractmethod
    def reset(self):
        """Clear accumulated statistics so the metric can be reused."""
        pass
task_parser.add_parser(name='serve', help='start http server', 27 | description='A http server for ELIT') 28 | add_model_option(server_parser) 29 | server_parser.add_argument('--port', type=int, default=8000) 30 | server_parser.add_argument('--workers', type=int, default=1, help='number of workers') 31 | 32 | args = arg_parser.parse_args() 33 | parser_config.MTL_MODEL = args.parser_model 34 | 35 | if args.task == 'parse': 36 | from elit.server.en_parser import service_parser 37 | for line in sys.stdin: 38 | line = line.strip() 39 | doc = service_parser.parse([Input(text=line)])[0] 40 | print(doc) 41 | elif args.task == 'serve': 42 | from elit.server import server 43 | server.run(port=args.port, workers=args.workers) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /docs/tokenization.md: -------------------------------------------------------------------------------- 1 | # Tokenization 2 | 3 | Tokenization is the task to split a sentence into tokens such that each token represents a word or a punctuation. ELIT features a rule-based English tokenizer which offers both tokenization and sentence segmentation. 4 | 5 | ### Tokenization 6 | 7 | The `EnglishTokenizer` class handles common abbreviations, apostrophes, concatenation words, hyphens, network protocols, emojis, emails, html entities, list item units with expert-crafted rules. E.g.: 8 | 9 | ```python 10 | from elit.components.tokenizer import EnglishTokenizer 11 | tokenizer = EnglishTokenizer() 12 | text = "Emory NLP is a research lab in Atlanta, GA. It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 
13 | print(tokenizer.tokenize(text)) 14 | ``` 15 | 16 | Output: 17 | 18 | ```python 19 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.'] 20 | ``` 21 | 22 | ### Sentence Segmentation 23 | 24 | The tokenized tokens can be fed into `tokenizer.segment` for sentence segmentation. E.g.: 25 | 26 | ```python 27 | from elit.components.tokenizer import EnglishTokenizer 28 | tokenizer = EnglishTokenizer() 29 | print(tokenizer.segment( 30 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 31 | 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 32 | 'University', '.'])) 33 | ``` 34 | 35 | Output: 36 | 37 | ```python 38 | [['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.'], ['It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.'], ['Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.']] 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /elit/layers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | from elit.common.constant import ELIT_URL 21 | # mute transformers 22 | import logging 23 | 24 | logging.getLogger('transformers.file_utils').setLevel(logging.ERROR) 25 | logging.getLogger('transformers.filelock').setLevel(logging.ERROR) 26 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR) 27 | logging.getLogger('transformers.configuration_utils').setLevel(logging.ERROR) 28 | logging.getLogger('transformers.modeling_tf_utils').setLevel(logging.ERROR) 29 | logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR) 30 | logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR) 31 | 32 | zh_albert_models_google = { 33 | 'albert_base_zh': ELIT_URL + 'embeddings/albert_base_zh.tar.gz', # Provide mirroring 34 | 'albert_large_zh': 'https://storage.googleapis.com/albert_models/albert_large_zh.tar.gz', 35 | 'albert_xlarge_zh': 'https://storage.googleapis.com/albert_models/albert_xlarge_zh.tar.gz', 36 | 'albert_xxlarge_zh': 'https://storage.googleapis.com/albert_models/albert_xxlarge_zh.tar.gz', 37 | } 38 | # bert_models_google['chinese_L-12_H-768_A-12'] = elit_URL + 'embeddings/chinese_L-12_H-768_A-12.zip' 39 | -------------------------------------------------------------------------------- /elit/layers/transformers/pt_imports.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
class AutoModel_(AutoModel):
    """``AutoModel`` variant that can build a model from config only.

    With ``training=False`` the architecture is instantiated without
    downloading pretrained weights — presumably because the real parameters
    are loaded afterwards from a saved component (TODO confirm with callers).
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
        if training:
            # Normal path: fetch config and pretrained weights.
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        if isinstance(pretrained_model_name_or_path, str):
            # Identifier given: resolve the config, then build with random weights.
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
            return super().from_config(config)
        # A config object was passed directly; extra kwargs would be silently
        # ignored by from_config, so reject them.
        assert not kwargs
        return super().from_config(pretrained_model_name_or_path)
We have also released a single-task dependency parsing model for Korean, trained on the Korean Treebank, which is demonstrated using the following code.
class BinaryChunkingF1(F1):
    """F1 over spans decoded from binary (0/1) boundary tag matrices.

    Each sequence's nonzero tag positions are interpreted as span boundary
    offsets; the resulting predicted and gold span sets are compared with the
    set-based micro F1 of the parent class.
    """

    def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None):
        """Accumulate counts for one batch of predicted vs. gold tags.

        Args:
            pred_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            gold_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            lens: True sequence lengths; defaults to the padded length for
                every sequence in the batch.
        """
        if lens is None:
            lens = [gold_tags.size(1)] * gold_tags.size(0)
        self.update(self.decode_spans(pred_tags, lens), self.decode_spans(gold_tags, lens))

    def update(self, pred_tags, gold_tags):
        # Compare each sequence's spans as sets; super().__call__ accumulates
        # the micro-averaged correct/pred/true counters.
        for pred, gold in zip(pred_tags, gold_tags):
            super().__call__(set(pred), set(gold))

    @staticmethod
    def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
        """Convert binary tag matrices into per-sequence ``(start, end)`` spans.

        A sequence with no nonzero tag yields the single span ``(0, len)``.
        Otherwise consecutive nonzero offsets delimit spans, the last one
        closed at the sequence length. NOTE(review): offsets act as span
        *start* positions, so tokens before the first nonzero offset are not
        covered unless offset 0 is tagged — confirm against the EOS task.

        Args:
            pred_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            lens: Sequence lengths, a list or a 1-D tensor.

        Returns:
            One list of ``(start, end)`` tuples per sequence.
        """
        if isinstance(lens, torch.Tensor):
            lens = lens.tolist()
        batch_pred = defaultdict(list)
        # nonzero() yields (batch_index, offset) pairs in row-major order,
        # so per-batch offset lists come out sorted.
        for batch, offset in pred_tags.nonzero(as_tuple=False).tolist():
            batch_pred[batch].append(offset)
        batch_pred_spans = [[(0, l)] for l in lens]
        for batch, offsets in batch_pred.items():
            l = lens[batch]
            batch_pred_spans[batch] = list(zip(offsets, offsets[1:] + [l]))
        return batch_pred_spans
def pipeline(*pipes) -> elit.components.pipeline.Pipeline:
    """Create a :class:`~elit.components.pipeline.Pipeline` of components.

    Args:
        *pipes: Optional components to pre-populate the pipeline with.

    Returns:
        A pipeline wrapping the given components (empty if none were given).
    """
    pipeline_cls = elit.components.pipeline.Pipeline
    return pipeline_cls(*pipes)
def add_lemma_rules_to_sample(sample: dict):
    """Convert gold lemmata into edit-script "lemma rules" for the tagger.

    NOTE(review): at this point in the pipeline the ``'tag'`` field appears to
    hold each token's gold *lemma* rather than a POS tag — confirm against the
    dataset feeding ``TransformerLemmatizer.build_dataset``. The computed rules
    overwrite both ``'lemma'`` and ``'tag'`` so the underlying tagger learns to
    predict rules; placeholder ``'_'`` lemmata are passed through unchanged.
    Samples that already have a ``'lemma'`` field are left untouched.

    Args:
        sample: A sample dict containing at least ``'token'``.

    Returns:
        The same dict, mutated in place.
    """
    if 'tag' in sample and 'lemma' not in sample:
        lemma_rules = [gen_lemma_rule(word, lemma)
                       if lemma != "_" else "_"
                       for word, lemma in zip(sample['token'], sample['tag'])]
        sample['lemma'] = sample['tag'] = lemma_rules
    return sample
class TreeCRFDecoder(BiaffineDecoder):
    """Biaffine decoder extended with second-order sibling scores and a TreeCRF.

    On top of the first-order arc/relation scores from
    :class:`BiaffineDecoder`, it scores (sibling, dependent, head) triples via
    triaffine attention, enabling second-order CRF training/decoding through
    ``CRF2oDependency``.
    """

    def __init__(self, hidden_size, n_mlp_arc, n_mlp_sib, n_mlp_rel, mlp_dropout, n_rels) -> None:
        """
        Args:
            hidden_size: Size of the encoder output fed to the MLPs.
            n_mlp_arc: Hidden size of the arc MLPs (first-order decoder).
            n_mlp_sib: Hidden size of the sibling MLPs.
            n_mlp_rel: Hidden size of the relation MLPs.
            mlp_dropout: Dropout applied inside every MLP.
            n_rels: Number of dependency relation labels.
        """
        super().__init__(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels)
        # Three projections of each token: as sibling, dependent, and head.
        self.mlp_sib_s = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)
        self.mlp_sib_d = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)
        self.mlp_sib_h = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)

        self.sib_attn = Triaffine(n_in=n_mlp_sib, bias_x=True, bias_y=True)
        self.crf = CRF2oDependency()

    def forward(self, x, mask=None, **kwargs: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute first- and second-order scores.

        Args:
            x: Encoder output — assumes shape ``[batch, seq_len, hidden_size]``;
                TODO confirm against :class:`BiaffineDecoder`.
            mask: Token mask forwarded to the first-order decoder.

        Returns:
            ``(s_arc, s_sib, s_rel)`` score tensors; ``s_sib`` has shape
            ``[batch, seq_len, seq_len, seq_len]`` after the permute below.
        """
        s_arc, s_rel = super(TreeCRFDecoder, self).forward(x, mask)
        sib_s = self.mlp_sib_s(x)
        sib_d = self.mlp_sib_d(x)
        sib_h = self.mlp_sib_h(x)
        # [batch_size, seq_len, seq_len, seq_len]
        s_sib = self.sib_attn(sib_s, sib_d, sib_h).permute(0, 3, 1, 2)
        return s_arc, s_sib, s_rel
def loads(string, dereify=None, remove_wiki=False):
    """Parse PENMAN graphs from a string.

    Args:
        string: Text containing one or more PENMAN-encoded graphs.
        dereify: Whether to restore reified relations; ``None`` selects the
            module default model.
    remove_wiki: When True, every ``:wiki`` value is replaced with ``'+'``.

    Returns:
        The list of parsed graphs.
    """
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        graphs = [_remove_wiki(g) for g in graphs]
    return graphs
class AMRGraph(penman.Graph):
    """A :class:`penman.Graph` whose ``str()`` form is its PENMAN notation.

    NOTE(review): this serializes with ``penman.encode`` and therefore the
    library's default model, unlike the module-level :func:`encode` which uses
    the AMR model — confirm the difference is intentional.
    """

    def __str__(self):
        return penman.encode(self)
SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)


def replace_with_separator(text, separator, regexs):
    """Rewrite every match of each regex as ``group1 + separator + group2``."""
    replacement = r"\1" + separator + r"\2"
    for regex in regexs:
        text = regex.sub(replacement, text)
    return text


def split_sentence(text, best=True):
    """Split raw text into sentences, handling CJK and Latin punctuation.

    Args:
        text: The input text.
        best: When True, each line is further split into sentences while
            abbreviation/acronym periods (e.g. ``Dr.``, ``U.S.``) are
            protected from producing false boundaries; when False, each
            non-empty line is yielded as-is.

    Yields:
        Sentences in their original order.
    """
    # Insert line breaks after CJK terminal punctuation (including quoted forms).
    text = re.sub('([。!?\?])([^”’])', r"\1\n\2", text)
    text = re.sub('(\.{6})([^”’])', r"\1\n\2", text)
    text = re.sub('(\…{2})([^”’])', r"\1\n\2", text)
    text = re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', text)
    for chunk in (piece.strip() for piece in text.split("\n")):
        if not chunk:
            continue
        if best:
            # Mask abbreviation periods with SEPARATOR so RE_SENTENCE cannot
            # split on them, then restore the masked spaces per sentence.
            masked = replace_with_separator(chunk, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
            for match in RE_SENTENCE.finditer(masked):
                yield replace_with_separator(match.group(), r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
        else:
            yield chunk
[ "**" ] 9 | pull_request: 10 | branches: [ "**" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: [3.7, 3.8, 3.9, '3.10'] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install -e . 31 | python -m pip install pytest 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | # - name: Lint with flake8 34 | # run: | 35 | # # stop the build if there are Python syntax errors or undefined names 36 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest tests 42 | deploy: 43 | needs: build 44 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - uses: actions/checkout@v2 49 | # - name: Set up Python 3.7 50 | # uses: actions/setup-python@v2 51 | # with: 52 | # python-version: 3.7 53 | - name: Install dependencies 54 | run: | 55 | python -m pip install --upgrade pip 56 | python -m pip install setuptools wheel twine 57 | - name: Deploy to PyPI 58 | run: | 59 | python setup.py sdist bdist_wheel 60 | python -m twine upload dist/* 61 | env: 62 | TWINE_USERNAME: __token__ 63 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 64 | TWINE_REPOSITORY: pypi 65 | -------------------------------------------------------------------------------- /docs/constituency_parsing.md: -------------------------------------------------------------------------------- 1 | # Constituency Parsing 2 | 3 | Constituency parsing is the task of 
parsing a sentence to its [phrase structure grammar](https://en.wikipedia.org/wiki/Phrase_structure_grammar) which represents its syntax in the form of nested constituent structure. 4 | 5 | Users can load a MTL component and specify `tasks='con'` to have sentences parsed by ELIT. E.g., 6 | 7 | ```python 8 | import elit 9 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 10 | print(nlp([['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.'], 11 | ['It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.'], 12 | ['Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.']] 13 | , tasks='con')) 14 | ``` 15 | 16 | Outputs: 17 | 18 | ```python 19 | { 20 | "con": [ 21 | ["TOP", [["S", [["NP", [["_", ["Emory"]], ["_", ["NLP"]]]], ["VP", [["_", ["is"]], ["NP", [["NP", [["_", ["a"]], ["_", ["research"]], ["_", ["lab"]]]], ["PP", [["_", ["in"]], ["NP", [["NP", [["_", ["Atlanta"]]]], ["_", [","]], ["NP", [["_", ["GA"]]]]]]]]]]]], ["_", ["."]]]]]], 22 | ["TOP", [["S", [["NP", [["_", ["It"]]]], ["VP", [["_", ["is"]], ["VP", [["_", ["founded"]], ["PP", [["_", ["by"]], ["NP", [["_", ["Jinho"]], ["_", ["D."]], ["_", ["Choi"]]]]]], ["PP", [["_", ["in"]], ["NP", [["_", ["2014"]]]]]]]]]], ["_", ["."]]]]]], 23 | ["TOP", [["S", [["NP", [["_", ["Dr."]], ["_", ["Choi"]]]], ["VP", [["_", ["is"]], ["NP", [["NP", [["_", ["a"]], ["_", ["professor"]]]], ["PP", [["_", ["at"]], ["NP", [["_", ["Emory"]], ["_", ["University"]]]]]]]]]], ["_", ["."]]]]]] 24 | ], 25 | "tok": [ 26 | ["Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",", "GA", "."], 27 | ["It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", "."], 28 | ["Dr.", "Choi", "is", "a", "professor", "at", "Emory", "University", "."] 29 | ] 30 | } 31 | ``` 32 | 33 | `con` stores the constituency trees, specifically `(label, child-constituents)` for the non-terminal constituents and `form` for the terminals. 
Note that we designed a nested-list representation, with the round brackets replaced by square brackets, to avoid ambiguity and to make it compatible with JSON. Unless it is being printed out, the `Document` class converts our nested-list structure back to the conventional bracketed tree.
class MetricDict(Metric, dict):
    """An ordered collection of named metrics that itself behaves as a metric.

    ``score`` is the arithmetic mean of the member scores, and
    ``__call__``/``reset`` fan out to every member metric.
    """
    _COLORS = ["magenta", "cyan", "green", "yellow"]

    @property
    def score(self):
        # Mean of member scores. An empty dict scores 0.0 instead of raising
        # ZeroDivisionError (e.g. before any task metric is registered).
        if not self:
            return 0.0
        return sum(float(x) for x in self.values()) / len(self)

    def __call__(self, pred, gold):
        """Feed one (pred, gold) pair to every member metric."""
        for metric in self.values():
            metric(pred, gold)

    def reset(self):
        """Reset every member metric."""
        for metric in self.values():
            metric.reset()

    def __repr__(self) -> str:
        return ' '.join(f'({k} {v})' for k, v in self.items())

    def cstr(self, idx=None, level=0) -> str:
        """Render the nested metrics as a colorized (rich-markup) string.

        Args:
            idx: Single-element list used as a mutable color counter shared
                across recursive calls.
            level: Nesting depth; selects the bracket pair (capped at 2).
        """
        if idx is None:
            idx = [0]
        prefix = ''
        for k, v in self.items():
            color = self._COLORS[idx[0] % len(self._COLORS)]
            idx[0] += 1
            child_is_dict = isinstance(v, MetricDict)
            _level = min(level, 2)
            lb = '{[('
            rb = '}])'
            k = f'[bold][underline]{k}[/underline][/bold]'
            prefix += f'[{color}]{lb[_level]}{k} [/{color}]'
            if child_is_dict:
                prefix += v.cstr(idx, level + 1)
            else:
                prefix += f'[{color}]{v}[/{color}]'
            prefix += f'[{color}]{rb[_level]}[/{color}]'
        return prefix
def classpath_of(obj) -> str:
    """Return the fully qualified class path of ``obj``.

    Plain functions are resolved through their defining module (their class
    would otherwise always be ``builtins.function``).

    Args:
        obj: Any object or function.

    Returns:
        Dotted path such as ``'builtins.int'``.
    """
    if inspect.isfunction(obj):
        return module_path_of(obj)
    cls = obj.__class__
    return f"{cls.__module__}.{cls.__name__}"
def type_to_str(type_object) -> str:
    """Convert a type object to its class path in str format.

    Inverse of :func:`str_to_type`. The implementation parses the standard
    ``repr`` of a class, e.g. ``"<class 'collections.Counter'>"``. This block
    was corrupted in the source (the ``"<class '"`` literals were stripped,
    leaving ``startswith("")`` and a dangling suffix strip); reconstructed here.

    Args:
        type_object: A type, e.g. ``dict``.

    Returns:
        The class path, e.g. ``'collections.Counter'``; builtins keep their
        bare name, e.g. ``'dict'``.

    Raises:
        AssertionError: If ``str(type_object)`` is not of the form
            ``<class '...'>``.
    """
    cls_name = str(type_object)
    assert cls_name.startswith("<class '"), 'illegal input'
    cls_name = cls_name[len("<class '"):]
    assert cls_name.endswith("'>"), 'illegal input'
    cls_name = cls_name[:-len("'>")]
    return cls_name
def bucket_distance(offsets):
    """Bucket distances into 10 semi-logarithmic bins.

    Distances 0-4 map to themselves; larger values map to
    ``floor(log2(d)) + 3``; everything is clamped into ``[0, 9]``.

    Args:
        offsets: Tensor of distances (assumed non-negative).

    Returns:
        LongTensor of bucket ids with the same shape as ``offsets``.
    """
    log_bucket = torch.log2(offsets.to(torch.float)).to(torch.long) + 3
    use_identity = (offsets <= 4).to(torch.long)
    bucketed = use_identity * offsets + (1 - use_identity) * log_bucket
    return torch.clamp(bucketed, 0, 9)
class F1(Metric, ABC):
    """Micro-averaged precision/recall/F1 accumulated from set overlaps.

    The counters ``nb_correct``/``nb_pred``/``nb_true`` are part of the public
    interface (subclasses and callers read them), so their names are fixed.
    """

    def __init__(self, nb_pred=0, nb_true=0, nb_correct=0) -> None:
        super().__init__()
        self.nb_pred = nb_pred
        self.nb_true = nb_true
        self.nb_correct = nb_correct

    def __call__(self, pred: set, gold: set):
        """Accumulate one instance's predicted and gold item sets."""
        self.nb_pred += len(pred)
        self.nb_true += len(gold)
        self.nb_correct += len(pred & gold)

    @property
    def prf(self):
        """(precision, recall, f1) tuple; each component is 0.0 when undefined."""
        p = self.nb_correct / self.nb_pred if self.nb_pred > 0 else .0
        r = self.nb_correct / self.nb_true if self.nb_true > 0 else .0
        f = 2 * p * r / (p + r) if p + r > 0 else .0
        return p, r, f

    @property
    def score(self):
        return self.prf[-1]

    def reset(self):
        self.nb_correct = self.nb_pred = self.nb_true = 0

    def __repr__(self) -> str:
        p, r, f = self.prf
        return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}"
NotImplementedError() 73 | 74 | def reset(self): 75 | self.f = self.r = self.p = 0 76 | 77 | def __repr__(self) -> str: 78 | p, r, f = self.p, self.r, self.f 79 | return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}" 80 | -------------------------------------------------------------------------------- /elit/metrics/srl/srlconll.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def official_conll_05_evaluate(pred_path, gold_path):
    """Score SRL output with the official CoNLL-2005 ``srl-eval.pl`` script.

    The script is invoked twice with the file arguments swapped — once to read
    off recall, once to read off precision — and F1 is recomputed from the
    pair. Returns ``(precision, recall, f1)`` as fractions in [0, 1].
    """
    script_root = get_resource('http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
    lib_path = f'{script_root}/lib'
    if lib_path not in os.environ.get("PERL5LIB", ""):
        os.environ['PERL5LIB'] = f'{lib_path}:{os.environ.get("PERL5LIB", "")}'
    bin_path = f'{script_root}/bin'
    if bin_path not in os.environ.get('PATH', ''):
        os.environ['PATH'] = f'{bin_path}:{os.environ.get("PATH", "")}'

    def overall_score(report):
        # Pulls field 5 of line 6 of the perl report (the overall-score row
        # of srl-eval.pl's table) and converts percent to a fraction.
        return float(report.strip().split("\n")[6].strip().split()[5]) / 100

    conll_recall = overall_score(run_cmd(f'perl {script_root}/bin/srl-eval.pl {gold_path} {pred_path}'))
    conll_precision = overall_score(run_cmd(f'perl {script_root}/bin/srl-eval.pl {pred_path} {gold_path}'))
    if conll_recall + conll_precision > 0:
        conll_f1 = 2 * conll_recall * conll_precision / (conll_recall + conll_precision)
    else:
        conll_f1 = 0
    return conll_precision, conll_recall, conll_f1


def run_perl(script, src, dst=None):
    """Run ``perl script src`` with a user-local perl lib and dump stdout to ``dst``.

    NOTE(review): PERL5LIB is cleared before the call, and ``dst`` is opened
    unconditionally — calling with the default ``dst=None`` would raise.
    Presumably callers always pass ``dst``; confirm before relying on it.
    """
    os.environ['PERL5LIB'] = f''
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # Common failure is missing perl modules; install locally with e.g.:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(err)
    with open(dst, 'w') as ofile:
        ofile.write(out)
    return dst
import string

__author__ = 'Jinho D. Choi'


def is_range(c, begin, end):
    """True if character ``c`` lies in the inclusive codepoint range."""
    return begin <= c <= end


def is_single_quote(c):
    """ASCII quote/backtick or the Unicode single-quote block U+2018..U+201B."""
    return c in {'\'', '`'} or is_range(c, u'\u2018', u'\u201B')


def is_double_quote(c):
    """ASCII double quote or the Unicode double-quote block U+201C..U+201F."""
    return c == '"' or is_range(c, u'\u201C', u'\u201F')


def is_left_bracket(c):
    return c in {'(', '{', '[', '<'}


def is_right_bracket(c):
    return c in {')', '}', ']', '>'}


def is_bracket(c):
    return is_left_bracket(c) or is_right_bracket(c)


def is_hyphen(c):
    """ASCII hyphen or the Unicode dash block U+2010..U+2014."""
    return c == '-' or is_range(c, u'\u2010', u'\u2014')


def is_arrow(c):
    """Any of the three Unicode arrow blocks."""
    return is_range(c, u'\u2190', u'\u21FF') or is_range(c, u'\u27F0', u'\u27FF') or is_range(c, u'\u2900', u'\u297F')


def is_currency(c):
    """Dollar sign, U+00A2..U+00A5, or the currency-symbols block U+20A0..U+20CF."""
    return c == '$' or is_range(c, u'\u00A2', u'\u00A5') or is_range(c, u'\u20A0', u'\u20CF')


def is_final_mark(c):
    """Sentence-final punctuation, including doubled marks U+203C, U+2047..U+2049."""
    return c in {'.', '?', '!', u'\u203C'} or is_range(c, u'\u2047', u'\u2049')


def is_punct(c):
    return c in string.punctuation


def collapse_digits(s):
    """Replace digits with a ``'0'`` placeholder, dropping digit-adjacent
    punctuation.

    Characters glued to a *following* digit are dropped when they are a
    currency sign or one of ``.-+#`` (number prefixes), or a separator
    (hyphen, ``,:/=``) sandwiched between digits. A ``%`` right after a digit
    is dropped too. A digit is itself dropped only when the previously
    *emitted* character was already ``'0'``; every other character passes
    through unchanged.
    """
    digits = [ch.isdigit() for ch in s]
    last = ['']  # last emitted character; a list so the closure can mutate it

    def emit(i, c):
        next_is_digit = i + 1 < len(s) and digits[i + 1]
        prev_is_digit = i - 1 >= 0 and digits[i - 1]
        if next_is_digit:
            # prefix glued to a number: $1  .5  -2  +3  #4
            if is_currency(c) or c in {'.', '-', '+', '#'}:
                return ''
            # separator between two digits: 1,0  1:2  1/2  1-2  1=2
            if prev_is_digit and (is_hyphen(c) or c in {',', ':', '/', '='}):
                return ''
        elif prev_is_digit:
            if c == '%':  # percent suffix
                return ''
        elif digits[i]:
            if last[0] == '0':  # collapse a run of already-emitted digits
                return ''
        last[0] = '0' if digits[i] else c
        return last[0]

    return ''.join(emit(i, c) for i, c in enumerate(s))
def dfs_linearize_tokenize(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False, text_key='snt') -> dict:
    """Linearize a sample's AMR graph (if any) and tokenize its text, in place.

    When an ``amr`` graph is present, it is DFS-linearized through the
    tokenizer and the sentence text is pulled from the graph metadata;
    otherwise ``sample['text']`` is used directly.

    :param sample: mutable sample dict; gains ``graph_tokens``,
        ``graph_token_ids``, ``text`` and ``text_token_ids`` keys.
    :param tokenizer: PENMAN-aware BART tokenizer used for both graph and text.
    :param remove_space: strip all whitespace from the text before encoding.
    :param text_key: metadata key holding the sentence when an AMR is present.
    :return: the same ``sample`` dict, mutated.
    """
    amr = sample.get('amr', None)
    if amr:
        token_ids, extras = tokenizer.linearize(amr)
        sample['graph_tokens'] = extras['linearized_graphs']
        sample['graph_token_ids'] = token_ids
        text = amr.metadata[text_key]
    else:
        text = sample['text']
    if remove_space:
        # e.g. for scripts tokenized without spaces
        text = ''.join(text.split())
    sample['text'] = text
    sample['text_token_ids'] = tokenizer.encode(text)
    return sample
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from alnlp.modules.pytorch_seq2seq_wrapper import LstmSeq2SeqEncoder 20 | from torch import nn 21 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 22 | 23 | from elit.common.structure import ConfigTracker 24 | 25 | 26 | class _LSTMSeq2Seq(nn.Module): 27 | def __init__( 28 | self, 29 | input_size: int, 30 | hidden_size: int, 31 | num_layers: int = 1, 32 | bias: bool = True, 33 | dropout: float = 0.0, 34 | bidirectional: bool = False, 35 | ): 36 | """ 37 | Under construction, not ready for production 38 | :param input_size: 39 | :param hidden_size: 40 | :param num_layers: 41 | :param bias: 42 | :param dropout: 43 | :param bidirectional: 44 | """ 45 | self.rnn = nn.LSTM( 46 | input_size=input_size, 47 | hidden_size=hidden_size, 48 | num_layers=num_layers, 49 | bias=bias, 50 | batch_first=True, 51 | dropout=dropout, 52 | bidirectional=bidirectional, 53 | ) 54 | 55 | def forward(self, embed, lens, max_len): 56 | x = pack_padded_sequence(embed, lens, True, False) 57 | x, _ = self.rnn(x) 58 | x, _ = pad_packed_sequence(x, True, total_length=max_len) 59 | return x 60 | 61 | 62 | # We might update this to support yaml based configuration 63 | class LSTMContextualEncoder(LstmSeq2SeqEncoder, ConfigTracker): 64 | 65 | def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, bias: bool = True, dropout: float = 0.0, 66 | bidirectional: bool = False, stateful: bool = False): 67 | super().__init__(input_size, hidden_size, num_layers, bias, dropout, bidirectional, stateful) 68 | ConfigTracker.__init__(self, locals()) 69 | -------------------------------------------------------------------------------- /elit/metrics/parsing/attachmentscore.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Yu Zhang 4 | # 5 | # Permission 
is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
from elit.metrics.metric import Metric


class AttachmentScore(Metric):
    """UAS/LAS accumulator for dependency parsing."""

    def __init__(self, eps=1e-12):
        super(AttachmentScore, self).__init__()

        self.eps = eps  # guards the divisions against an empty accumulator
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    def __repr__(self):
        return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"

    # noinspection PyMethodOverriding
    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch; only positions where ``mask`` is True count.

        A relation counts as correct only when its arc is also correct.
        """
        arc_hits = arc_preds.eq(arc_golds)[mask]
        rel_hits = rel_preds.eq(rel_golds)[mask] & arc_hits

        self.total += len(arc_hits)
        self.correct_arcs += arc_hits.sum().item()
        self.correct_rels += rel_hits.sum().item()

    # Rich comparisons compare the LAS score against a plain number, so a
    # metric can be ranked directly against thresholds or other scores.
    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        return self.las

    @property
    def uas(self):
        """Unlabeled attachment score."""
        return self.correct_arcs / (self.total + self.eps)

    @property
    def las(self):
        """Labeled attachment score."""
        return self.correct_rels / (self.total + self.eps)

    def reset(self):
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0
One can list the pre-trained MTL components via the following code snippets. 8 | 9 | ```python 10 | import elit 11 | print(elit.pretrained.mtl.ALL) 12 | ``` 13 | 14 | Output: 15 | 16 | ```python 17 | { 18 | 'LEM_POS_NER_DEP_SDP_CON_AMR_ELECTRA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_pos_ner_srl_dep_con_amr_electra_base_20201222.zip', 19 | 'LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_pos_ner_srl_dep_con_amr_roberta_base_20210402_152521.zip', 20 | 'LEM_POS_NER_DEP_ROBERTA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_lem_pos_ner_ddr_roberta_base_20210325_121606.zip' 21 | } 22 | ``` 23 | 24 | Each of them is a pair of an identifier and the URL to it. The identifier indicates the task it supports and the backbone transformer it was trained on. If you are only interested in POS, it's recommended to use `LEM_POS_NER_DEP_ROBERTA_BASE_EN` as it involves less tasks so its size is smaller. 25 | 26 | Once you've decided which model to use, you can pass its identifier to `elit.load`. This method will download the model and load it to the least occupied GPU if any. E.g., 27 | 28 | ```python 29 | nlp = elit.load('LEM_POS_NER_DEP_ROBERTA_BASE_EN') 30 | print(nlp('I banked 1 dollar in a bank .'.split(), tasks='pos')) 31 | ``` 32 | 33 | Output: 34 | 35 | ```python 36 | {'pos': ['PRP', 'VBD', 'CD', 'NN', 'IN', 'DT', 'NN', '.'], 37 | 'tok': ['I', 'banked', '1', 'dollar', 'in', 'a', 'bank', '.']} 38 | ``` 39 | 40 | The output is a dict mapping the task names to corresponding annotations. It always contains the tokens so that each tag can be easily aligned with its actual token. 41 | 42 | ### Batched Prediction 43 | 44 | Note that batched prediction is usually faster than sequential prediction even on CPUs. So it's always recommended to gather the input sentences into one list and make only 1 function call. 
E.g., 45 | 46 | ```python 47 | nlp([["The", "first", "sentence", "."], 48 | ["The", "second", "sentence", "."]]) 49 | ``` 50 | 51 | is faster than call `nlp` twice separately: 52 | 53 | ```python 54 | nlp(["The", "first", "sentence", "."]) 55 | nlp(["The", "second", "sentence", "."]) 56 | ``` 57 | 58 | ## Annotation Guideline 59 | 60 | Please refer to [Part-of-Speech Tagging Guidelines for the Penn Treebank](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1603&context=cis_reports). 61 | -------------------------------------------------------------------------------- /elit/components/amr/amr_parser/data.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Deng Cai 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
DUM, NIL, END = '[unused0]', '', '[unused1]'
REL = 'rel='


def list_to_tensor(xs, vocab: Vocab = None, local_vocabs=None, unk_rate=0.):
    """Convert ragged token lists into a padded, (seq, batch)-transposed array.

    :param xs: batch of token sequences (items may themselves be lists).
    :param vocab: maps tokens to ids; when ``None`` items pass through as-is
        and 0 is used for padding.
    :param local_vocabs: optional per-sample vocabs consulted before ``vocab``.
    :param unk_rate: probability of replacing a token id with UNK
        (training-time noise; only applies when ``vocab`` is given).
    """
    pad = vocab.pad_idx if vocab else 0

    def index_of(w, i):
        if vocab is None:
            return w
        if isinstance(w, list):
            return [index_of(t, i) for t in w]
        if random.random() < unk_rate:
            return vocab.unk_idx
        if local_vocabs is not None:
            local_vocab = local_vocabs[i]
            if (local_vocab is not None) and (w in local_vocab):
                return local_vocab[w]
        return vocab.get_idx(w)

    max_len = max(len(x) for x in xs)
    padded = [index_of(x, i) + [pad] * (max_len - len(x)) for i, x in enumerate(xs)]
    return np.transpose(np.array(padded))


def lists_of_string_to_tensor(xs, vocab: Vocab, max_string_len=20):
    """Encode a batch of token sequences at the character level.

    Each token is truncated to ``max_string_len`` characters, wrapped with the
    DUM/END markers, and right-padded; the result is transposed to
    (seq, batch, char).
    """
    max_len = max(len(x) for x in xs)
    rows = []
    for x in xs:
        padded_tokens = x + [PAD] * (max_len - len(x))
        encoded = []
        for token in padded_tokens:
            chars = list(token[:max_string_len])
            encoded.append(vocab([DUM] + chars + [END]) + [vocab.pad_idx] * (max_string_len - len(chars)))
        rows.append(encoded)

    return np.transpose(np.array(rows), (1, 0, 2))
class ServiceParser(object):
    """Routes tokenized server inputs to a joint model, batching requests
    that share the same task set."""

    def __init__(self,
                 service_tokenizer: ServiceTokenizer,
                 model: Callable[[List[List[str]], List[str]], Document]) -> None:
        super().__init__()
        # Fall back to the default English eos/tokenize pipeline when absent.
        self.service_tokenizer = ServiceTokenizer(eos, tokenize) if service_tokenizer is None else service_tokenizer
        self.model = model

    def parse_sents(self, sents: List[List[str]], tasks: List[str] = None) -> Document:
        """Run the underlying model over pre-tokenized sentences."""
        return self.model(sents, tasks=tasks)

    def parse(self, inputs: List[Input]) -> List[Document]:
        """Tokenize missing inputs, batch per task set, and redistribute the
        model's annotations back onto one ``Document`` per input."""
        self.service_tokenizer.tokenize_inputs(inputs)  # no effects (read-only) in server pipeline

        # Group inputs by their (sorted) requested models so each distinct
        # task set costs a single model call.
        inputs_by_tasks = defaultdict(list)
        for i, input in enumerate(inputs):
            inputs_by_tasks[tuple(sorted(input.models))].append(i)

        results = [Document() for _ in inputs]
        for tasks, input_ids in inputs_by_tasks.items():
            group_inputs = [inputs[i] for i in input_ids]
            group_tokens = sum([input.tokens for input in group_inputs], [])
            annotations = self.parse_sents(group_tokens, tasks)
            for k, v in annotations.items():
                # Rotate each tuple one position to fit the ELIT standard order.
                if k == 'ner':
                    for j, s in enumerate(v):
                        v[j] = [x[1:] + x[:1] for x in s]
                elif k == 'srl':
                    for _v in v:
                        for j, s in enumerate(_v):
                            _v[j] = [x[1:] + x[:1] for x in s]
            for i, input in zip(input_ids, group_inputs):
                for k, v in annotations.items():
                    # Take this input's share off the front of the batched output.
                    results[i][k] = v[:len(input.tokens)]
                    if k == 'ner':
                        if not input.verbose:
                            for j, s in enumerate(results[i][k]):
                                results[i][k][j] = [x[:-1] for x in s]
                    elif k == 'srl':
                        if not input.verbose:
                            # NOTE(review): this trims `v` (the remaining batched
                            # annotations) rather than results[i][k], unlike the
                            # ner branch — confirm this asymmetry is intended.
                            for _v in v:
                                for j, s in enumerate(_v):
                                    _v[j] = [x[:-1] for x in s]
                    del v[:len(input.tokens)]

        return results
# ========================================================================
# Copyright 2020 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================

# -*- coding:utf-8 -*-
# Author: hankcs

from os.path import abspath, join, dirname

from setuptools import find_packages, setup

this_dir = abspath(dirname(__file__))

# The long description shown on PyPI comes straight from the README.
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()

# Single-source the version from elit/version.py without importing the package.
version = {}
with open(join(this_dir, "elit", "version.py")) as fp:
    exec(fp.read(), version)

setup(
    name='elit',
    version=version['__version__'],
    description='The Emory Language and Information Toolkit (ELIT)',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/emorynlp/elit',
    author='Han He, Liyan Xu',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        "Development Status :: 3 - Alpha",
        'Operating System :: OS Independent',
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        "Topic :: Text Processing :: Linguistic"
    ],
    keywords='corpus,machine-learning,NLU,NLP',
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    install_requires=[
        'torch>=1.6.0',
        'sentencepiece>=0.1.95',
        'termcolor==1.1.0',
        'phrasetree==0.0.8',
        'pynvml==8.0.4',
        'alnlp==1.0.0rc27',
        'penman==1.2.1',
        'toposort==1.5',
        'unofficial_stog==0.0.21',
        'uvicorn==0.13.1',
        'fastapi==0.65.2',
        'transformers>=4.6.1',
        'editdistance==0.5.3',
        'scipy>=1.5.4'
    ],
    python_requires='>=3.7',
    entry_points={
        'console_scripts': [
            'elit=elit.main:main',
        ],
    },
)
class TSVTaggingDataset(TransformDataset):
    """Tagging dataset over TSV files of (token, tag) rows, optionally
    splitting sentences longer than ``max_seq_len`` into aligned pieces."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 generate_idx=None,
                 delimiter=None, max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
                 ) -> None:
        # Assign our fields first: the parent constructor may start loading.
        self.char_level = char_level
        self.hard_constraint = hard_constraint
        self.sent_delimiter = sent_delimiter
        self.max_seq_len = max_seq_len
        self.delimiter = delimiter
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield ``{'token': ..., 'tag': ...}`` samples from a TSV file.

        When ``max_seq_len`` is set, overlong sentences are chopped by
        ``split_long_sentence_into`` and the tag sequence is sliced to stay
        aligned with each piece.
        """
        filepath = get_resource(filepath)
        for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
            if not self.max_seq_len:
                yield {'token': words, 'tag': tags}
                continue
            start = 0
            for piece in split_long_sentence_into(words, self.max_seq_len, self.sent_delimiter,
                                                  char_level=self.char_level,
                                                  hard_constraint=self.hard_constraint):
                end = start + len(piece)
                yield {'token': piece, 'tag': tags[start:end]}
                start = end


def main():
    # NOTE(review): CONLL03_EN_DEV is not defined or imported in this module's
    # visible imports — running this demo raises NameError. Confirm where the
    # constant lives and import it before use.
    dataset = TSVTaggingDataset(CONLL03_EN_DEV)
    print(dataset[3])


if __name__ == '__main__':
    main()
class ServiceTokenizer:
    """Wraps optional sentence-split (``eos``) and tokenize callables and
    applies them lazily, in place, to server ``Input`` objects."""

    def __init__(self,
                 eos: Callable[[List[str]], List[List[str]]] = None,
                 tokenizer: Callable[[List[str]], List[List[str]]] = None) -> None:
        super().__init__()
        self.eos = eos
        self.tokenizer = tokenizer

    def split_sents(self, text: List[str]) -> List[List[str]]:
        """Split each raw document into a list of sentences."""
        assert self.eos, 'eos is required to perform sentence split'
        return self.eos(text)

    def tokenize(self, docs: List[List[str]]) -> List[List[List[str]]]:
        """Tokenize every sentence of every doc in one flat batched call,
        then regroup the results per document."""
        assert self.tokenizer, 'tokenizer is required to perform tokenization'
        tokens = self.tokenizer(sum(docs, []))
        results = []
        for doc in docs:
            # take this doc's share off the front of the flat token list
            results.append(tokens[:len(doc)])
            del tokens[:len(doc)]
        return results

    def tokenize_inputs(self, inputs: Union[Input, List[Input]]) -> Union[Input, List[Input]]:
        """Fill in missing sentence splits and tokens on the given inputs.

        Accepts a single ``Input`` or a list; returns the same shape. Inputs
        whose ``text`` is still a raw string get sentence-split first; inputs
        without ``tokens`` then get tokenized.
        """
        single_input = isinstance(inputs, Input)
        if single_input:
            inputs = [inputs]

        # Pass 1: sentence-split raw-string texts.
        pending, pending_ids = [], []
        for i, input in enumerate(inputs):
            if isinstance(input.text, str):
                pending.append(input.text)
                pending_ids.append(i)
        for i, sents in zip(pending_ids, self.split_sents(pending)):
            inputs[i].text = sents

        # Pass 2: tokenize inputs that still lack tokens.
        pending, pending_ids = [], []
        for i, input in enumerate(inputs):
            if not input.tokens:
                pending.append(input.text)
                pending_ids.append(i)
        for i, tokens in zip(pending_ids, self.tokenize(pending)):
            inputs[i].tokens = tokens

        return inputs[0] if single_input else inputs
def unpack_deps_to_head_deprel(sample: dict, pad_rel=None, arc_key='arc', rel_key='rel'):
    """Expand a CoNLL-U ``DEPS`` column into boolean arc and label matrices.

    Row/column 0 corresponds to the inserted pseudo root. Cells without a
    relation are filled with ``pad_rel``, defaulting to the first relation
    seen in the sentence (or ``PAD`` if the sentence has none).
    """
    if 'DEPS' in sample:
        deps = ['_'] + sample['DEPS']
        n = len(deps)
        sample[arc_key] = arcs = []
        sample[rel_key] = rels = []
        for cell in deps:
            arc_row = [False] * n
            rel_row = [None] * n
            if cell != '_':
                for pair in cell.split('|'):
                    head, rel = pair.split(':')
                    head = int(head)
                    arc_row[head] = True
                    rel_row[head] = rel
                    if not pad_rel:
                        pad_rel = rel
            arcs.append(arc_row)
            rels.append(rel_row)
        if not pad_rel:
            pad_rel = PAD
        for i in range(len(rels)):
            rels[i] = [r if r else pad_rel for r in rels[i]]
    return sample


def append_bos_to_form_pos(sample, pos_key='CPOS'):
    """Prepend the pseudo ROOT token (and POS) so indices align with heads."""
    sample['token'] = [ROOT] + sample['FORM']
    if pos_key in sample:
        sample['pos'] = [ROOT] + sample[pos_key]
    return sample


def merge_head_deprel_with_2nd(sample: dict):
    """Fold the primary head/deprel arcs into the 2nd-order arc/rel matrices,
    warning when both annotate the same arc with different labels."""
    if 'arc' in sample:
        arc_2nd = sample['arc_2nd']
        rel_2nd = sample['rel_2nd']
        for i, (arc, rel) in enumerate(zip(sample['arc'], sample['rel'])):
            if i:  # skip the pseudo root row
                if arc_2nd[i][arc] and rel_2nd[i][arc] != rel:
                    sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
                    warnings.warn(f'The main dependency conflicts with 2nd dependency at ID={i}, ' \
                                  'which means joint mode might not be suitable. ' \
                                  f'The sample is\n{sample_str}')
                arc_2nd[i][arc] = True
                rel_2nd[i][arc] = rel
    return sample
class DistillableComponent(TorchComponent, ABC):
    """Base class for components trainable via knowledge distillation.

    Subclasses inherit :meth:`distill`, which loads a pretrained teacher
    component, copies its vocabularies and config into the student, then
    delegates to ``TorchComponent.fit`` with the distillation arguments
    merged in.
    """

    # noinspection PyMethodMayBeStatic,PyTypeChecker
    def build_teacher(self, teacher: str, devices) -> TorchComponent:
        """Load the pretrained teacher identified by ``teacher`` onto ``devices``."""
        return elit.load(teacher, load_kwargs={'devices': devices})

    def distill(self,
                teacher: str,
                trn_data,
                dev_data,
                save_dir,
                batch_size=None,
                epochs=None,
                kd_criterion='kd_ce_loss',
                temperature_scheduler='flsw',
                devices=None,
                logger=None,
                seed=None,
                **kwargs):
        """Train this (student) component under the guidance of ``teacher``.

        Args:
            teacher: Identifier or path of the pretrained teacher to load.
            trn_data: Training set.
            dev_data: Development set.
            save_dir: Directory the student is saved to.
            batch_size: Falls back to the teacher's ``batch_size`` when ``None``.
            epochs: Falls back to the teacher's ``epochs`` when ``None``.
            kd_criterion: Name (or instance) of the knowledge-distillation loss.
            temperature_scheduler: Name (or instance) of the temperature scheduler.
            devices: Devices to run on; defaults to all CUDA devices.
            logger: Optional logger.
            seed: Optional random seed.
            **kwargs: Extra hyper-parameters overriding the teacher's config.
        """
        devices = devices or cuda_devices()
        # Resolve string shorthands into concrete loss/scheduler objects.
        if isinstance(kd_criterion, str):
            kd_criterion = KnowledgeDistillationLoss(kd_criterion)
        if isinstance(temperature_scheduler, str):
            temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
        teacher = self.build_teacher(teacher, devices=devices)
        # Student shares the teacher's vocabularies so their outputs align.
        self.vocabs = teacher.vocabs
        config = copy(teacher.config)
        # Reuse the teacher's training hyper-parameters when none are given.
        batch_size = batch_size or config.get('batch_size', None)
        epochs = epochs or config.get('epochs', None)
        config.update(kwargs)
        return super().fit(**merge_locals_kwargs(locals(),
                                                 config,
                                                 excludes=('self', 'kwargs', '__class__', 'config')))

    @property
    def _savable_config(self):
        # Persist only the teacher's load path, not the live component object.
        config = super(DistillableComponent, self)._savable_config
        if 'teacher' in config:
            config.teacher = config.teacher.load_path
        return config
class MLP(nn.Module):
    r"""Linear projection followed by a non-linear activation and shared dropout:
    :math:`y = \mathrm{Activation}(x A^T + b)`

    Args:
        n_in (int): Size of each input feature.
        n_out (int): Size of each output feature.
        dropout (float): If non-zero, apply :class:`SharedDropout` with this
            ratio on the output. Default: 0.
        activation (bool): Whether to apply LeakyReLU. Default: True.
    """

    def __init__(self, n_in, n_out, dropout=0, activation=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.linear = nn.Linear(n_in, n_out)
        if activation:
            self.activation = nn.LeakyReLU(negative_slope=0.1)
        else:
            self.activation = nn.Identity()
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def __repr__(self):
        desc = f"n_in={self.n_in}, n_out={self.n_out}"
        if self.dropout.p > 0:
            desc += f", dropout={self.dropout.p}"
        return f"{self.__class__.__name__}({desc})"

    def reset_parameters(self):
        """Orthogonally initialize the weight and zero the bias."""
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        r"""Project ``x`` (last dim ``n_in``) to ``n_out``, activate, and apply dropout.

        Args:
            x (~torch.Tensor): Input with feature size ``n_in``.

        Returns:
            A tensor whose feature size is ``n_out``.
        """
        return self.dropout(self.activation(self.linear(x)))
class AMRIO:
    """Reader for AMR files annotated with ``# ::`` metadata comment lines."""

    def __init__(self):
        pass

    @staticmethod
    def read(file_path):
        """Iterate over AMR instances stored in ``file_path``.

        Yields:
            ``(tokens, lemmas, pos_tags, ner_tags, AMRGraph)`` tuples, one per
            AMR instance.

        NOTE(review): the metadata variables (``tokens``, ``lemmas``, ...) are
        bound only when their ``# ::`` line appears before the graph; an input
        missing any of them would raise ``NameError`` at the first yield —
        presumably the preprocessing pipeline guarantees their presence.
        """
        with open(file_path, encoding='utf-8') as f:
            idx = 0  # instance counter; incremented but not yielded
            for line in f:
                line = line.rstrip()
                if line.startswith('# ::id '):
                    amr_id = line[len('# ::id '):]
                elif line.startswith('# ::snt '):
                    sentence = line[len('# ::snt '):]
                elif line.startswith('# ::tokens '):
                    tokens = json.loads(line[len('# ::tokens '):])
                elif line.startswith('# ::lemmas '):
                    lemmas = json.loads(line[len('# ::lemmas '):])
                    # Keep abstract forms verbatim; lowercase everything else.
                    lemmas = [le if _is_abs_form(le) else le.lower() for le in lemmas]
                elif line.startswith('# ::pos_tags '):
                    pos_tags = json.loads(line[len('# ::pos_tags '):])
                elif line.startswith('# ::ner_tags '):
                    ner_tags = json.loads(line[len('# ::ner_tags '):])
                elif line.startswith('# ::abstract_map '):
                    # The abstract map is the last metadata line; the AMR graph
                    # text follows immediately, so parse and yield here.
                    abstract_map = json.loads(line[len('# ::abstract_map '):])
                    graph_line = AMR.get_amr_line(f)
                    amr = AMR.parse_AMR_line(graph_line)
                    myamr = AMRGraph(amr)
                    assert len(tokens) == len(ner_tags)
                    yield tokens, lemmas, pos_tags, ner_tags, myamr
                    idx += 1


def make_vocab(batch_seq, char_level=False):
    """Count token frequencies over ``batch_seq``.

    Args:
        batch_seq: Iterable of token sequences.
        char_level: When True, additionally count character frequencies
            (each character weighted by its token's count).

    Returns:
        A ``Counter`` of tokens, or ``(token_counter, char_counter)`` when
        ``char_level`` is True.
    """
    cnt = Counter()
    for seq in batch_seq:
        cnt.update(seq)
    if not char_level:
        return cnt
    char_cnt = Counter()
    for x, y in cnt.most_common():
        for ch in list(x):
            char_cnt[ch] += y
    return cnt, char_cnt
# noinspection PyAbstractClass
class CustomizedDecoderLayer(DecoderLayer):
    """BART decoder layer whose cross-attention always returns its weights.

    Mirrors ``transformers.modeling_bart.DecoderLayer.forward`` except that the
    encoder (cross) attention is invoked with ``output_attentions=True`` and its
    weights are returned alongside the self-attention weights.
    """

    def forward(
        self,
        x,
        encoder_hidden_states,
        encoder_attn_mask=None,
        layer_state=None,
        causal_mask=None,
        decoder_padding_mask=None,
        output_attentions=False,
    ):
        residual = x

        if layer_state is None:
            layer_state = {}
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        # Self Attention

        x, self_attn_weights = self.self_attn(
            query=x,
            key=x,
            layer_state=layer_state,  # adds keys to layer state
            key_padding_mask=decoder_padding_mask,
            attn_mask=causal_mask,
            output_attentions=output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # Cross attention
        residual = x
        assert self.encoder_attn.cache_key != self.self_attn.cache_key
        if self.normalize_before:
            x = self.encoder_attn_layer_norm(x)
        x, cross_attn_weights = self.encoder_attn(
            query=x,
            key=encoder_hidden_states,
            key_padding_mask=encoder_attn_mask,
            layer_state=layer_state,  # mutates layer state
            # Always request cross-attention weights — this is the customization.
            output_attentions=True
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.encoder_attn_layer_norm(x)

        # Fully Connected
        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return (
            x,
            (self_attn_weights, cross_attn_weights),
            layer_state,
        )  # just self_attn weights for now, following t5, layer_state = cache for decoding
# Official CoNLL-X evaluation script, fetched from the BIST parser repository.
CONLLX_EVAL = get_resource(
    'https://github.com/elikip/bist-parser/archive/master.zip' + '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Args:
        gold_file(str): The gold conllx file
        pred_file(str): The pred conllx file

    Returns:
        A ``(UAS, LAS)`` tuple of attachment scores.
    """
    gold_file = get_resource(gold_file)
    # Copy gold columns 1-6 into the prediction so that non-parse columns
    # (forms, POS, ...) always agree and eval.pl only scores heads/labels.
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        # Strip CoNLL-U-only lines (comments, multi-word ranges) from the gold
        # file as well so eval.pl can parse it.
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file

    exitcode, out, err = get_exitcode_stdout_stderr(f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.')
    # The last four lines of eval.pl's output carry the correct/total counts
    # for labeled (first) and unlabeled (second) attachment.
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las


def copy_cols(gold_file, pred_file, copied_pred_file, keep_comments=True):
    """Copy the first 6 columns from gold file to pred file

    Args:
        gold_file: Path of the gold CoNLL file.
        pred_file: Path of the predicted CoNLL file.
        copied_pred_file: Output path for the merged file.
        keep_comments: Whether gold comment lines are copied over. (Default value = True)

    Returns:
        None; writes ``copied_pred_file``.
    """
    with open(copied_pred_file, 'w') as to_out, open(pred_file) as pred_file, open(gold_file) as gold_file:
        for idx, (p, g) in enumerate(zip(pred_file, gold_file)):
            # Skip comment lines in the prediction.
            while p.startswith('#'):
                p = next(pred_file)
            if not g.strip():
                # Blank gold line = sentence boundary; prediction must agree.
                if p.strip():
                    raise ValueError(
                        f'Prediction file {pred_file.name} does not end a sentence at line {idx + 1}\n{p.strip()}')
                to_out.write('\n')
                continue
            # Skip gold comments and multi-word-token ranges (IDs like "1-2").
            while g.startswith('#') or '-' in g.split('\t')[0]:
                # NOTE(review): ``g.startswith('-')`` looks suspicious — comments
                # start with '#' and range IDs contain but do not start with '-',
                # so this disjunct is likely never true; confirm whether
                # ``keep_comments`` alone was intended.
                if keep_comments or g.startswith('-'):
                    to_out.write(g)
                g = next(gold_file)
            to_out.write('\t'.join(str(x) for x in g.split('\t')[:6] + p.split('\t')[6:]))
def move_to_device(maybe_tensor, device):
    """Recursively move tensors (and numpy arrays) inside ``maybe_tensor`` to ``device``.

    Dicts, lists and tuples are rebuilt with their contents moved; numpy arrays
    are converted to contiguous tensors; anything else is returned unchanged.
    """
    if torch.is_tensor(maybe_tensor):
        return maybe_tensor.to(device)
    if isinstance(maybe_tensor, np.ndarray):
        return torch.from_numpy(maybe_tensor).to(device).contiguous()
    if isinstance(maybe_tensor, dict):
        return {key: move_to_device(value, device) for key, value in maybe_tensor.items()}
    if isinstance(maybe_tensor, list):
        return [move_to_device(item, device) for item in maybe_tensor]
    if isinstance(maybe_tensor, tuple):
        return tuple(move_to_device(item, device) for item in maybe_tensor)
    return maybe_tensor


def compute_f_by_tensor(input, target, mask):
    """Compute binary precision/recall/F1 over positions where ``mask != 1``.

    Returns:
        ``(P, R, F)`` floats; all zeros when there is no true positive.
    """
    flat_pred = input.view(-1).tolist()
    flat_gold = target.view(-1).tolist()
    flat_mask = mask.view(-1).tolist()
    tp = fp = tn = fn = 0.
    for pred, gold, masked in zip(flat_pred, flat_gold, flat_mask):
        if masked == 1:
            continue  # masked positions are excluded from the statistics
        if pred == 1:
            if gold == 1:
                tp += 1
            else:
                fp += 1
        elif gold == 1:
            fn += 1
        else:
            tn += 1
    if tp == 0:
        return 0., 0., 0.

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return precision, recall, 2 * precision * recall / (precision + recall)


def gelu_fast(x):
    """Tanh approximation of GELU; caches sqrt(2/pi) on the function object."""
    if not hasattr(gelu_fast, "_a"):
        gelu_fast._a = math.sqrt(2 / math.pi)
    return 0.5 * x * (1 + torch.tanh(gelu_fast._a * (x + 0.044715 * torch.pow(x, 3))))


def gelu(x: torch.Tensor) -> torch.Tensor:
    """Exact GELU via the Gaussian error function."""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def label_smoothed_nll_loss(log_probs, target, eps):
    """Negative log-likelihood with label smoothing.

    Args:
        log_probs: ``N x C`` log-probabilities.
        target: ``N`` gold class indices.
        eps: Smoothing mass; 0 yields plain NLL.

    Returns:
        Per-example loss tensor of shape ``N``.
    """
    nll_loss = -log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    if eps == 0.:
        return nll_loss
    # Distribute eps uniformly over all classes.
    smooth_loss = -log_probs.sum(dim=-1)
    eps_i = eps / log_probs.size(-1)
    return (1. - eps) * nll_loss + eps_i * smooth_loss
service_tokenizer=service_tokenizer, 43 | models=elit.load(DOC_COREF_SPANBERT_LARGE_EN) 44 | ) 45 | # service_doc_coref = ServiceCoreference( 46 | # service_tokenizer=service_tokenizer, 47 | # models=[elit.load(DOC_COREF_SPANBERT_LARGE_EN, devices=0), 48 | # elit.load(DOC_COREF_SPANBERT_LARGE_EN, devices=1)] 49 | # ) 50 | 51 | service_online_coref = ServiceCoreference( 52 | service_tokenizer=service_tokenizer, 53 | models=elit.load(ONLINE_COREF_SPANBERT_LARGE_EN) 54 | ) 55 | 56 | en_services = BundledServices( 57 | tokenizer=service_tokenizer, 58 | parser=service_parser, 59 | doc_coref=service_doc_coref, 60 | online_coref=service_online_coref 61 | ) 62 | 63 | 64 | def main(): 65 | text = [ 66 | "Emory NLP is a research lab in Atlanta, GA. " 67 | "It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 68 | ] 69 | input = Input(text=text) 70 | input.models = ['lem'] 71 | docs = en_services.parser.parse([input]) 72 | for doc in docs: 73 | print(doc) 74 | 75 | # See elit.client for coreference examples 76 | text = 'Pfizer said last week it may need the U.S. government to help it secure some components needed to ' \ 77 | 'make the vaccine. While the company halved its 2020 production target due to manufacturing issues, ' \ 78 | 'it said last week its manufacturing is running smoothly now. The government also has the option to ' \ 79 | 'acquire up to an additional 400 million doses of the vaccine.' 
80 | input_doc = Input(text=text, models=['dcr']) 81 | doc = service_doc_coref.predict(input_doc) 82 | print(doc) 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /docs/abstract_meaning_parsing.md: -------------------------------------------------------------------------------- 1 | # Abstract Meaning Parsing 2 | 3 | Abstract Meaning Representation parsing is the task of parsing a sentence to a representation of the abstract meaning in a sentence as a rooted, directed, acyclic graph. 4 | 5 | ELIT provides two APIs for parsing a raw sentence into an AMR graph. 6 | 7 | ## State-of-The-Art AMR Parser 8 | 9 | We recommend to use the SOTA AMR parser with `83.3` Smatch score trained on the AMR 3.0 dataset: 10 | 11 | ```python 12 | import elit 13 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 14 | amr = parser('The boy wants the girl to believe him.') 15 | print(amr) 16 | ``` 17 | 18 | Outputs: 19 | 20 | ```text 21 | (z0 / want-01 22 | :ARG0 (z1 / boy) 23 | :ARG1 (z2 / believe-01 24 | :ARG0 (z3 / girl) 25 | :ARG1 z1)) 26 | ``` 27 | 28 | Feel free to parse multiple sentences in one batch to benefit from parallelization. 29 | 30 | ### Evaluation 31 | 32 | Its detailed performance is listed as follows: 33 | 34 | ```bash 35 | $ cat ~/.elit/amr3_bart_large_elit_20211030_173300/test.log 36 | {Smatch P: 84.00% R: 82.60% F1: 83.30%}{Unlabeled P: 86.40% R: 84.90% F1: 85.70%}{No WSD P: 84.50% R: 83.10% F1: 83.80%}{Non_sense_frames P: 91.90% R: 91.30% F1: 91.60%}{Wikification P: 81.70% R: 80.80% F1: 81.20%}{Named Ent. 
P: 89.20% R: 87.00% F1: 88.10%}{Negations P: 71.70% R: 70.90% F1: 71.30%}{IgnoreVars P: 73.80% R: 73.10% F1: 73.50%}{Concepts P: 90.70% R: 89.60% F1: 90.10%}{Frames P: 88.50% R: 87.90% F1: 88.20%}{Reentrancies P: 70.40% R: 71.80% F1: 71.10%}{SRL P: 79.00% R: 79.60% F1: 79.30%} 37 | ``` 38 | 39 | To re-produce this evaluation, run the followling code: 40 | 41 | ```python 42 | import elit 43 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 44 | parser.evaluate('data/amr/amr_3.0/test.txt', '/tmp') 45 | ``` 46 | 47 | which will generate the Smatch score `/tmp/test.log` **before** wikification: 48 | 49 | ``` 50 | 81/81 {Smatch P: 83.70% R: 82.40% F1: 83.00%}{Unlabeled P: 86.10% R: 84.60% F1: 85.40%}{No WSD P: 84.20% R: 82.90% F1: 83.50%}{Non_sense_frames P: 91.90% R: 91.30% F1: 91.60%}{Wikification P: 75.00% R: 74.20% F1: 74.60%}{Named Ent. P: 89.20% R: 87.00% F1: 88.10%}{Negations P: 71.70% R: 70.90% F1: 71.30%}{IgnoreVars P: 73.50% R: 72.80% F1: 73.10%}{Concepts P: 90.70% R: 89.60% F1: 90.10%}{Frames P: 88.50% R: 87.90% F1: 88.20%}{Reentrancies P: 70.40% R: 71.80% F1: 71.10%}{SRL P: 79.00% R: 79.60% F1: 79.30%} ET: 8 m 1 s 51 | ``` 52 | 53 | The predictions are saved in `/tmp/test.pred.txt`. To perform wikification, refer to [BLINK](https://github.com/SapienzaNLP/spring/blob/8fad6a4ce59132b22d6bdb4d4eb3a9aa5223ead6/bin/blinkify.py) which will give you the extra `0.3` F1. 54 | 55 | ## AMR Decoder in MTL 56 | 57 | Users can also load a MTL component and specify `tasks=['lem', 'amr'])` to parse a sentence into an AMR graph. 
E.g., 58 | 59 | ```python 60 | import elit 61 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 62 | print(nlp(['Emory','NLP','is','in','Atlanta'], tasks=['lem', 'amr'])) 63 | ``` 64 | 65 | Outputs: 66 | 67 | ```python 68 | { 69 | "tok": [ 70 | ["Emory", "NLP", "is", "in", "Atlanta"] 71 | ], 72 | "lem": [ 73 | ["emory", "nlp", "be", "in", "atlanta"] 74 | ], 75 | "amr": [ 76 | [["c0", "ARG1", "c1"], ["c0", "ARG2", "c2"], ["c0", "instance", "be-located-at-91"], ["c1", "instance", "emory nlp"], ["c2", "instance", "atlanta"]] 77 | ], 78 | } 79 | ``` 80 | 81 | `amr` stores the logical triples of Abstract Meaning Representation in the format of `(source, relation, target)`. Note that the `Document` class will convert it to Penman format when being accessed through code. Thus, you will get Penman `Graph`s by accessing `doc['amr']`. 82 | 83 | -------------------------------------------------------------------------------- /elit/components/ner/rnn_ner.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class RNNNamedEntityRecognizer(RNNTagger):
    """RNN sequence tagger specialized for named entity recognition.

    Extends ``RNNTagger`` with span-level F1 evaluation and decoding of tag
    sequences into entity spans.
    """

    def build_metric(self, **kwargs):
        # Span-level F1 in the tagging scheme of the training data.
        return SpanF1(self.tagging_scheme)

    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, **kwargs):
        loss, metric = super().evaluate_dataloader(data, criterion, logger, ratio_width, **kwargs)
        if logger:
            # Log the detailed span-F1 report produced by the metric.
            logger.info(metric.result(True, False)[-1])
        return loss, metric

    def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
            drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='token', tagging_scheme=None,
            anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
            token_delimiter=None,
            logger=None,
            verbose=True, **kwargs):
        """Train the recognizer; hyper-parameters are forwarded to ``RNNTagger.fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        # Decode logits into tag ids, map them to surface tags and update the metric.
        logits = self.decode_output(logits, mask, batch)
        if isinstance(logits, torch.Tensor):
            logits = logits.tolist()
        metric(self._id_to_tags(logits), batch['tag'])

    def predict(self, tokens: Any, batch_size: int = None, **kwargs):
        return super().predict(tokens, batch_size, **kwargs)

    def predict_data(self, data, batch_size, **kwargs):
        """Predict tag sequences and convert them to ``(text, type, start, end)`` entities.

        ``end`` is exclusive; ``text`` joins the covered tokens with the
        configured ``token_delimiter``.
        """
        outputs = super().predict_data(data, batch_size)
        tagging_scheme = self.tagging_scheme
        # Dispatch on the tagging scheme to recover spans from tag sequences.
        if tagging_scheme == 'IOBES':
            entities = [span_utils.iobes_tags_to_spans(y) for y in outputs]
        elif tagging_scheme == 'BIO':
            entities = [span_utils.bio_tags_to_spans(y) for y in outputs]
        elif tagging_scheme == 'BIOUL':
            entities = [span_utils.bioul_tags_to_spans(y) for y in outputs]
        else:
            raise ValueError(f'Unrecognized tag scheme {tagging_scheme}')
        for i, (tokens, es) in enumerate(zip(data, entities)):
            outputs[i] = [(self.config.token_delimiter.join(tokens[b:e + 1]), t, b, e + 1) for t, (b, e) in es]
        return outputs

    def save_config(self, save_dir, filename='config.json'):
        # Infer a token delimiter before saving: empty for character-level
        # vocabularies (recent vocab entries are all single characters),
        # otherwise a space.
        if self.config.token_delimiter is None:
            self.config.token_delimiter = '' if all(
                [len(x) == 1 for x in self.vocabs[self.config.token_key].idx_to_token[-100:]]) else ' '
        super().save_config(save_dir, filename)
class SpanMetric(Metric):
    """Micro precision/recall/F1 over labeled and unlabeled spans.

    ``preds``/``golds`` are parallel lists of span collections where each span
    is an ``(i, j, label)`` triple. "Unlabeled" statistics ignore the label;
    the headline score is labeled F1. Also tracks complete-match rates per
    sentence (UCM/LCM).
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.reset(eps)

    # noinspection PyAttributeOutsideInit
    def reset(self, eps=1e-12):
        """Clear all counters; ``eps`` guards the denominators against zero."""
        self.n = 0.0      # sentences seen
        self.n_ucm = 0.0  # unlabeled complete matches
        self.n_lcm = 0.0  # labeled complete matches
        self.utp = 0.0    # unlabeled true positives
        self.ltp = 0.0    # labeled true positives
        self.pred = 0.0   # predicted spans
        self.gold = 0.0   # gold spans
        self.eps = eps

    def __call__(self, preds, golds):
        """Accumulate statistics over a batch of (pred, gold) span lists."""
        for hyp, ref in zip(preds, golds):
            # Multiset intersection yields the matched spans, with multiplicity.
            hyp_unlabeled = Counter((i, j) for i, j, label in hyp)
            ref_unlabeled = Counter((i, j) for i, j, label in ref)
            matched_unlabeled = list((hyp_unlabeled & ref_unlabeled).elements())
            matched_labeled = list((Counter(hyp) & Counter(ref)).elements())
            self.n += 1
            # Complete match: every span correct and none missing (bool adds as 0/1).
            self.n_ucm += len(matched_unlabeled) == len(hyp) == len(ref)
            self.n_lcm += len(matched_labeled) == len(hyp) == len(ref)
            self.utp += len(matched_unlabeled)
            self.ltp += len(matched_labeled)
            self.pred += len(hyp)
            self.gold += len(ref)
        return self

    def __repr__(self):
        return (f"UCM: {self.ucm:.2%} LCM: {self.lcm:.2%} "
                f"UP: {self.up:.2%} UR: {self.ur:.2%} UF: {self.uf:.2%} "
                f"LP: {self.lp:.2%} LR: {self.lr:.2%} LF: {self.lf:.2%}")

    @property
    def score(self):
        """Labeled F1, the headline score."""
        return self.lf

    @property
    def ucm(self):
        return self.n_ucm / (self.n + self.eps)

    @property
    def lcm(self):
        return self.n_lcm / (self.n + self.eps)

    @property
    def up(self):
        return self.utp / (self.pred + self.eps)

    @property
    def ur(self):
        return self.utp / (self.gold + self.eps)

    @property
    def uf(self):
        return 2 * self.utp / (self.pred + self.gold + self.eps)

    @property
    def lp(self):
        return self.ltp / (self.pred + self.eps)

    @property
    def lr(self):
        return self.ltp / (self.gold + self.eps)

    @property
    def lf(self):
        return 2 * self.ltp / (self.pred + self.gold + self.eps)
The government also has the option to ' \ 35 | 'acquire up to an additional 400 million doses of the vaccine.' 36 | return text 37 | 38 | def test_doc_coref_sequential(self): 39 | 40 | batch_size = 32 41 | inputs = [Input(text=self.get_sample_text(), models=['dcr'])] * batch_size 42 | 43 | start_time = time.time() 44 | docs = en_services.doc_coref.predict_sequentially(inputs) 45 | end_time = time.time() 46 | print(f'Sequential doc coref time elapse for {batch_size} small documents: {end_time - start_time :.2f}s') 47 | 48 | assert len(docs) == len(inputs) 49 | print(docs[0]) 50 | print(docs[-1]) 51 | 52 | def test_doc_coref_concurrent(self): 53 | 54 | batch_size = 32 55 | inputs = [Input(text=self.get_sample_text(), models=['dcr'])] * batch_size 56 | 57 | start_time = time.time() 58 | docs = en_services.doc_coref.predict(inputs) 59 | end_time = time.time() 60 | print(f'Concurrent doc coref time elapse for {batch_size} small documents: {end_time - start_time :.2f}s') 61 | 62 | assert len(docs) == len(inputs) 63 | print(docs[0]) 64 | print(docs[-1]) 65 | 66 | def test_doc_coref_text(self): 67 | text = "Emory NLP is a research lab in Atlanta, GA. " 68 | "It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 69 | input_doc = Input(text=text, models=['dcr']) 70 | print(en_services.doc_coref.predict(input_doc)) 71 | 72 | def test_doc_coref_sents(self): 73 | text = ["Emory NLP is a research lab in Atlanta, GA.", 74 | "It is founded by Jinho D. Choi in 2014.", 75 | 'Dr. 
Choi is a professor at Emory University.'] 76 | input_doc = Input(text=text, models=['dcr']) 77 | print(en_services.doc_coref.predict(input_doc)) 78 | 79 | def test_doc_coref_tokens(self): 80 | tokens = [ 81 | ["Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",", "GA", "."], 82 | ["It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", ".", "Dr.", "Choi", "is", "a", 83 | "professor", "at", "Emory", "University", "."] 84 | ] 85 | input_doc = Input(tokens=tokens, models=['dcr']) 86 | print(en_services.doc_coref.predict(input_doc)) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() 91 | -------------------------------------------------------------------------------- /elit/layers/embeddings/util.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """Build an embedding matrix for ``vocab`` from a pretrained word2vec file.

    Args:
        filepath: Path to the pretrained word2vec file.
        vocab: Target vocabulary; its indices determine the row order of the
            returned matrix.
        extend_vocab: If True, add every pretrained word into ``vocab``.
        unk: Name of the unknown token inside the pretrained vocabulary; it is
            renamed to ``vocab.safe_unk_token`` so the lookups below find it.
        lowercase: Lowercase pretrained words when extending ``vocab``.
        init: Initialization for words missing from the pretrained file:
            ``'uniform'``, ``'zeros'``, or a falsy value (leave them zeros).
        normalize: ``'norm'`` for L2 row normalization, ``'std'`` to divide by
            the global standard deviation, None for no normalization.

    Returns:
        A ``[len(vocab), dim]`` tensor whose i-th row embeds ``vocab``'s i-th token.

    Raises:
        ValueError: If ``init`` is not one of the supported strategies.
    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        # Rename the pretrained UNK entry to this vocab's UNK token so the
        # vocab's UNK gets a pretrained vector instead of a random one.
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
        vocab.lock()
    ids = []

    # Words absent from the pretrained file get fresh row indices just past
    # the end of the pretrained matrix; the rows themselves are appended
    # below. NOTE(review): this assumes pret_matrix has exactly
    # len(pret_vocab) rows aligned with pret_vocab's order — confirm in
    # load_word2vec_as_vocab_tensor.
    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        # Rows for out-of-pretrained-vocab words: zeros, optionally re-initialized.
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    # Reorder pretrained rows into vocab index order.
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
def build_word2vec_with_vocab(embed: Union[str, int],
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              trainable=False,
                              init='zeros',
                              normalize=None) -> nn.Embedding:
    """Create an ``nn.Embedding`` either from a pretrained file or from scratch.

    Args:
        embed: A word2vec file path (load pretrained vectors) or an int
            (embedding size for a randomly initialized table).
        vocab: The vocabulary the embedding is indexed by.
        extend_vocab: Add pretrained words into ``vocab`` (file case only).
        unk: Name of the unknown token in the pretrained vocabulary.
        lowercase: Lowercase pretrained words when extending the vocab.
        trainable: Whether pretrained vectors are fine-tuned.
        init: Initialization for words missing from the pretrained file.
        normalize: Optional normalization scheme for pretrained vectors.

    Returns:
        A ready-to-use embedding layer with ``vocab.pad_idx`` as padding.

    Raises:
        ValueError: If ``embed`` is neither a str nor an int.
    """
    if isinstance(embed, str):
        matrix = index_word2vec_with_vocab(embed, vocab, extend_vocab, unk, lowercase, init, normalize)
        return nn.Embedding.from_pretrained(matrix, freeze=not trainable, padding_idx=vocab.pad_idx)
    if isinstance(embed, int):
        return nn.Embedding(len(vocab), embed, padding_idx=vocab.pad_idx)
    raise ValueError(f'Unsupported parameter type: {embed}')
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | import unittest 20 | import json 21 | 22 | from elit.server.en import en_services 23 | from elit.server.format import Input, OnlineCorefContext 24 | from elit.components.coref.util import flatten 25 | 26 | 27 | class TestOnlineCoref(unittest.TestCase): 28 | 29 | def setUp(self) -> None: 30 | super().setUp() 31 | 32 | @classmethod 33 | def convert_output_to_context(cls, coref_output): 34 | if coref_output is None: 35 | return None 36 | return OnlineCorefContext( 37 | input_ids=coref_output['input_ids'], 38 | sentence_map=coref_output['sentence_map'], 39 | subtoken_map=coref_output['subtoken_map'], 40 | mentions=[tuple(m) for m in coref_output['mentions']], 41 | uttr_start_idx=coref_output['uttr_start_idx'], 42 | speaker_ids=coref_output['speaker_ids'] 43 | ) 44 | 45 | def test_online_coref(self): 46 | # Test text, sents, tokens input 47 | # utterances = [ 48 | # {'speaker_id': 1, 'text': 'I read an article today. It is about US politics.'}, 49 | # {'speaker_id': 2, 'tokens': [['What', 'does', 'it', 'say', 'about', 'US', 'politics', '?']]}, # Tokens 50 | # {'speaker_id': 1, 'text': 'It talks about the US presidential election.'}, 51 | # {'speaker_id': 2, 'text': ['I am interested to hear.', 'Can you elaborate more?']}, # Sents 52 | # {'speaker_id': 1, 'text': 'Sure! 
The presidential election is indeed interesting.'} 53 | # ] 54 | 55 | utterances = [ 56 | {'speaker_id': 1, 'tokens': [['I', 'read', 'an', 'article', 'today', '.']]}, 57 | {'speaker_id': 2, 'tokens': [['Can', 'you', 'tell', 'me', 'what', 'it', 'is', 'about', '?']]} 58 | ] 59 | 60 | context = None 61 | tokens_to_date = [] 62 | for turn, uttr in enumerate(utterances): 63 | if 'tokens' in uttr: 64 | input_doc = Input(tokens=uttr['tokens'], speaker_ids=uttr['speaker_id'], 65 | coref_context=self.convert_output_to_context(context), models=['ocr']) 66 | else: 67 | input_doc = Input(text=uttr['text'], speaker_ids=uttr['speaker_id'], 68 | coref_context=self.convert_output_to_context(context), models=['ocr']) 69 | output_doc = en_services.online_coref.predict_sequentially(input_doc, check_sanitization=True) 70 | print(json.dumps(output_doc)) 71 | context = output_doc['ocr'] 72 | 73 | # Print cluster text 74 | tokens_to_date += flatten(input_doc.tokens) 75 | for cluster in output_doc['ocr']['clusters']: 76 | for i in range(len(cluster)): 77 | m1, m2 = tuple(cluster[i]) 78 | cluster[i] = (m1, m2, ' '.join(tokens_to_date[m1:m2])) 79 | print(f'cluster text: {output_doc["ocr"]["clusters"]}') 80 | print() 81 | 82 | 83 | if __name__ == '__main__': 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /elit/components/parsers/biaffine/biaffine.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Yu Zhang 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the 
class Biaffine(nn.Module):
    r"""
    Biaffine layer for first-order scoring.

    Holds a weight tensor :math:`W` (one matrix per output channel) and scores
    every vector pair :math:`(x, y)` as :math:`x^T W y`. When ``bias_x`` /
    ``bias_y`` are set, a constant 1 is appended to the corresponding input so
    :math:`W` also captures linear and bias terms.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        n_in (int):
            The size of the input feature.
        n_out (int):
            The number of output channels.
        bias_x (bool):
            If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
        bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # bool biases are added to the dims: (n_in + 1) when the bias is on.
        self.weight = nn.Parameter(torch.Tensor(n_out, n_in + bias_x, n_in + bias_y))

        self.reset_parameters()

    def __repr__(self):
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")
        return f"{self.__class__.__name__}({', '.join(parts)})"

    def reset_parameters(self):
        # Zero initialization: every pair starts with score 0.
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        r"""
        Args:
            x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
            y (torch.Tensor): ``[batch_size, seq_len, n_in]``.

        Returns:
            ~torch.Tensor:
                Scores of shape ``[batch_size, n_out, seq_len, seq_len]``;
                the ``n_out`` dimension is squeezed away when ``n_out=1``.
        """
        if self.bias_x:
            x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # squeeze(1) only removes the channel dim when n_out == 1.
        return scores.squeeze(1)
class FastTextTransform(EmbeddingNamedTransform):
    """Sample-level transform that attaches fastText word vectors.

    The fastText model is loaded once in the constructor; calling the
    transform on a sample dict embeds the value of field ``src`` and stores
    the resulting vector(s) under field ``dst``.
    """

    def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
        """
        Args:
            filepath: Path or resource identifier of a fastText binary model.
            src: Key of the field to embed (a token or a list of tokens).
            dst: Key to store vectors under; defaults to ``src + '_fasttext'``.
        """
        if not dst:
            dst = src + '_fasttext'
        self.filepath = filepath
        flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
        filepath = get_resource(filepath)
        # fastText prints loading noise to stdout; redirect it so it does not
        # pollute this process's standard output.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)
        flash('')
        # Probe an arbitrary word to discover the embedding dimensionality.
        output_dim = self._model['king'].size
        super().__init__(output_dim, src, dst)

    def __call__(self, sample: dict) -> dict:
        """Embed ``sample[self.src]`` in place and return the sample."""
        word = sample[self.src]
        if isinstance(word, str):
            vector = self.embed(word)
        else:
            # A sequence of tokens: stack per-token vectors into one tensor.
            vector = torch.stack([self.embed(each) for each in word])
        sample[self.dst] = vector
        return sample

    def embed(self, word: str) -> torch.Tensor:
        """Return the fastText vector of ``word`` as a torch tensor."""
        return torch.tensor(self._model[word])
class PassThroughModule(torch.nn.Module):
    """Module that simply returns ``batch[key]``, ignoring mask and kwargs."""

    def __init__(self, key) -> None:
        super().__init__()
        self.key = key

    def __call__(self, batch: dict, mask=None, **kwargs):
        return batch[self.key]


class FastTextEmbeddingModule(PassThroughModule):
    """Pads pre-computed fastText vectors into one dense batch tensor."""

    def __init__(self, key, embedding_dim: int) -> None:
        """
        Args:
            key: Batch field holding a list of per-sentence vector sequences.
            embedding_dim: Dimensionality of the fastText vectors.
        """
        super().__init__(key)
        self.embedding_dim = embedding_dim

    def __call__(self, batch: dict, mask=None, **kwargs):
        vectors = super().__call__(batch, **kwargs)
        # Pad variable-length sequences with zeros and move the result onto
        # the same device as the mask.
        return pad_sequence(vectors, True, 0).to(mask.device)

    def __repr__(self):
        return f"{self.__class__.__name__}(key={self.key}, embedding_dim={self.embedding_dim})"

    def get_output_dim(self):
        return self.embedding_dim


class FastTextEmbedding(Embedding, AutoConfigurable):
    """Embedding factory pairing a FastTextTransform with its batching module."""

    def __init__(self, src: str, filepath: str) -> None:
        """
        Args:
            src: Name of the token field to embed.
            filepath: Path to the fastText binary model.
        """
        super().__init__()
        self.src = src
        self.filepath = filepath
        self._fasttext = FastTextTransform(self.filepath, self.src)

    def transform(self, **kwargs) -> Optional[Callable]:
        return self._fasttext

    def module(self, **kwargs) -> Optional[nn.Module]:
        return FastTextEmbeddingModule(self._fasttext.dst, self._fasttext.output_dim)
class LabeledF1(Metric):
    """Labeled/unlabeled F1 over dependency arcs.

    The caller supplies a mask (e.g. excluding punctuation); arcs and
    relations are compared only where both gold and predicted arcs exist.
    """

    def __init__(self):
        super().__init__()
        # Running counts accumulated across batches.
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __repr__(self):
        return f"UF: {self.uf:4.2%} LF: {self.lf:4.2%}"

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate counts from one batch of boolean arc/relation tensors."""
        gold_mask = mask & arc_golds
        pred_mask = mask & arc_preds
        overlap = gold_mask & pred_mask

        arc_hits = (arc_preds == arc_golds)[overlap]
        rel_hits = (rel_preds == rel_golds)[overlap] & arc_hits

        self.sum_gold_arcs_wo_punc += float(gold_mask.sum())
        self.sum_pred_arcs_wo_punc += float(pred_mask.sum())
        self.correct_arcs_wo_punc += float(arc_hits.sum())
        self.correct_rels_wo_punc += float(rel_hits.sum())

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        """Primary score for model selection: labeled F1."""
        return self.las

    @property
    def uas(self):
        return self.uf

    @property
    def las(self):
        return self.lf

    @property
    def ur(self):
        """Unlabeled recall."""
        return self.correct_arcs_wo_punc / self.sum_gold_arcs_wo_punc if self.sum_gold_arcs_wo_punc else .0

    @property
    def up(self):
        """Unlabeled precision."""
        return self.correct_arcs_wo_punc / self.sum_pred_arcs_wo_punc if self.sum_pred_arcs_wo_punc else .0

    @property
    def lr(self):
        """Labeled recall."""
        return self.correct_rels_wo_punc / self.sum_gold_arcs_wo_punc if self.sum_gold_arcs_wo_punc else .0

    @property
    def lp(self):
        """Labeled precision."""
        return self.correct_rels_wo_punc / self.sum_pred_arcs_wo_punc if self.sum_pred_arcs_wo_punc else .0

    @property
    def uf(self):
        """Unlabeled F1 (harmonic mean of up and ur)."""
        denom = self.ur + self.up
        return 2 * self.ur * self.up / denom if denom else .0

    @property
    def lf(self):
        """Labeled F1 (harmonic mean of lp and lr)."""
        denom = self.lr + self.lp
        return 2 * self.lr * self.lp / denom if denom else .0

    def reset(self):
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def to_dict(self) -> dict:
        return {'UF': self.uf, 'LF': self.lf}
def flsw_temperature_scheduler_builder(beta, gamma, eps=1e-4, *args):
    """Build the FLSW dynamic temperature scheduler (adapted from arXiv:1911.07471).

    The returned closure computes a per-example temperature from the cosine
    disagreement between (detached) student and teacher logits.
    """

    def flsw_temperature_scheduler(logits_S, logits_T, base_temperature):
        student = logits_S.detach()
        teacher = logits_T.detach()
        with torch.no_grad():
            # L2-normalize along the class dimension; eps guards zero rows.
            student = student / (torch.norm(student, dim=-1, keepdim=True) + eps)
            teacher = teacher / (torch.norm(teacher, dim=-1, keepdim=True) + eps)
            # Disagreement weight: (1 - cosine similarity) ** gamma.
            w = torch.pow((1 - (student * teacher).sum(dim=-1)), gamma)
            tau = base_temperature + (w.mean() - w) * beta
        return tau

    return flsw_temperature_scheduler


def cwsm_temperature_scheduler_builder(beta, *args):
    """Build the CWSM dynamic temperature scheduler (adapted from arXiv:1911.07471).

    The returned closure weights each example by the inverse of the student's
    maximum class probability (its confidence).
    """

    def cwsm_temperature_scheduler(logits_S, logits_T, base_temperature):
        probs = logits_S.detach()
        with torch.no_grad():
            probs = torch.softmax(probs, dim=-1)
            top = probs.max(dim=-1)[0]
            w = 1 / (top + 1e-3)
            tau = base_temperature + (w.mean() - w) * beta
        return tau

    return cwsm_temperature_scheduler


class LinearTeacherAnnealingScheduler(object):
    """Tracks training progress as a float growing linearly from 0 to 1."""

    def __init__(self, num_training_steps: int) -> None:
        """
        Args:
            num_training_steps: Total number of training steps expected.
        """
        super().__init__()
        self._num_training_steps = num_training_steps
        self._current_training_steps = 0

    def step(self):
        """Advance the schedule by one training step."""
        self._current_training_steps += 1

    def __float__(self):
        """Fraction of training completed so far."""
        return self._current_training_steps / self._num_training_steps
class TemperatureScheduler(ABC, AutoConfigurable):
    """Base class for distillation temperature schedulers.

    A scheduler maps a pair of (student, teacher) logits to the temperature
    used when softening them for the distillation loss.
    """

    def __init__(self, base_temperature) -> None:
        """
        Args:
            base_temperature: The reference temperature dynamic schedulers
                deviate from (and constant schedulers return as-is).
        """
        super().__init__()
        self.base_temperature = base_temperature

    def __call__(self, logits_S, logits_T):
        return self.forward(logits_S, logits_T)

    @abstractmethod
    def forward(self, logits_S, logits_T):
        """Return the temperature for the given student/teacher logits."""
        raise NotImplementedError()

    @staticmethod
    def from_name(name):
        """Instantiate a scheduler by its short name ('constant', 'flsw', 'cwsm')."""
        classes = {
            'constant': ConstantScheduler,
            'flsw': FlswScheduler,
            'cwsm': CwsmScheduler,
        }
        assert name in classes, f'Unsupported temperature scheduler {name}. Expect one from {list(classes.keys())}.'
        return classes[name]()


class FunctionalScheduler(TemperatureScheduler):
    """Scheduler that delegates to a plain scheduling function (closure)."""

    def __init__(self, scheduler_func, base_temperature) -> None:
        super().__init__(base_temperature)
        self._scheduler_func = scheduler_func

    def forward(self, logits_S, logits_T):
        return self._scheduler_func(logits_S, logits_T, self.base_temperature)


class ConstantScheduler(TemperatureScheduler):
    """Always returns the base temperature, ignoring the logits."""

    def forward(self, logits_S, logits_T):
        return self.base_temperature


class FlswScheduler(FunctionalScheduler):
    """FLSW dynamic temperature scheduler (arXiv:1911.07471)."""

    def __init__(self, beta=1, gamma=1, eps=1e-4, base_temperature=8):
        super().__init__(flsw_temperature_scheduler_builder(beta, gamma, eps), base_temperature)
        self.beta = beta
        self.gamma = gamma
        self.eps = eps


class CwsmScheduler(FunctionalScheduler):
    """CWSM dynamic temperature scheduler (arXiv:1911.07471)."""

    def __init__(self, beta=1, base_temperature=8):
        super().__init__(cwsm_temperature_scheduler_builder(beta), base_temperature)
        self.beta = beta
class CoNLLParsingDataset(TransformDataset):
    """Dataset for dependency parsing corpora in CoNLL-X / CoNLL-U format."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 prune: Callable[[Dict[str, List[str]]], bool] = None) -> None:
        """
        Args:
            data: A file path or a list of samples.
            transform: Transform(s) applied to each sample.
            cache: Whether/where to cache the loaded dataset.
            generate_idx: Whether to attach an index to each sample.
            prune: Predicate over a sample dict; samples for which it returns
                True are dropped during loading.
        """
        # _prune must be assigned before super().__init__, which presumably
        # triggers load_file when `data` is a path — confirm in TransformDataset.
        self._prune = prune
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield one sample dict per sentence, keyed by CoNLL column name."""
        if filepath.endswith('.conllu'):
            # See https://universaldependencies.org/format.html
            field_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                           'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            # CoNLL-X style column layout.
            field_names = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                           'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        fp = TimingFileIterator(filepath)
        for idx, sent in enumerate(read_conll(fp)):
            sample = {}
            for i, field in enumerate(field_names):
                # Transpose the sentence: one list of column values per field.
                sample[field] = [cell[i] for cell in sent]
            if not self._prune or not self._prune(sample):
                yield sample
            fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')

    def __len__(self) -> int:
        return len(self.data)
def append_bos(sample: dict, pos_key='CPOS', bos=ROOT) -> dict:
    """Prepend a pseudo root token so index 0 can act as the dependency root.

    Args:
        sample: A CoNLL sample dict of column lists.
        pos_key: Which POS column to expose as ``pos``.
        bos: A special token inserted to the head of tokens.

    Returns:
        The same sample, with ``token``/``pos``/``arc``/``rel`` fields added.
    """
    sample['token'] = [bos] + sample['FORM']
    if pos_key in sample:
        sample['pos'] = [ROOT] + sample[pos_key]
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD']
        # Position 0 borrows the first token's relation as a placeholder.
        sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL']
    return sample


def append_bos_eos(sample: dict) -> dict:
    """Wrap the sentence with BOS and EOS pseudo tokens (both headed by 0)."""
    sample['token'] = [BOS] + sample['FORM'] + [EOS]
    if 'CPOS' in sample:
        sample['pos'] = [BOS] + sample['CPOS'] + [EOS]
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD'] + [0]
        # Both pseudo positions borrow the first relation as a placeholder.
        sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL'] + sample['DEPREL'][:1]
    return sample


def get_sibs(sample: dict) -> dict:
    """Attach each token's nearest sibling index as ``sib_id``.

    Two tokens are siblings when they share a head and lie on the same side
    of it; each token keeps only its closest sibling (0 when none exists).
    """
    heads = sample.get('arc', None)
    if heads:
        siblings = [-1] * len(heads)
        for left in range(1, len(heads)):
            head_l = heads[left]
            for right in range(left + 1, len(heads)):
                head_r = heads[right]
                dist_l, dist_r = head_l - left, head_r - right
                # Same head and same side of it -> sibling pair; keep the
                # nearer token of the two and stop scanning for `left`.
                if head_l >= 0 and head_r >= 0 and head_l == head_r and dist_l * dist_r > 0:
                    if abs(dist_l) > abs(dist_r):
                        siblings[left] = right
                    else:
                        siblings[right] = left
                    break
        sample['sib_id'] = [0] + siblings[1:]
    return sample
class CharRNN(nn.Module, EmbeddingDim):
    """Character-level BiLSTM producing one vector per token.

    Runs a bidirectional LSTM over the characters of every token and uses the
    concatenated final hidden states of both directions as the token's
    character representation.
    """

    def __init__(self,
                 field,
                 vocab_size,
                 embed: Union[int, nn.Embedding],
                 hidden_size):
        """
        Args:
            field: Name of the token field; char ids are read from
                ``f'{field}_char_id'`` in the batch.
            vocab_size: Character vocabulary size (used when ``embed`` is an int).
            embed: Either an embedding size or a pre-built embedding module.
            hidden_size: Hidden size per direction; the output size is twice this.

        Raises:
            ValueError: If ``embed`` is neither an int nor an ``nn.Module``.
        """
        super(CharRNN, self).__init__()
        self.field = field
        # the embedding layer
        if isinstance(embed, int):
            self.embed = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed)
        elif isinstance(embed, nn.Module):
            self.embed = embed
            embed = embed.embedding_dim
        else:
            raise ValueError(f'Unrecognized type for {embed}')
        # the lstm layer
        self.lstm = nn.LSTM(input_size=embed,
                            hidden_size=hidden_size,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, batch, mask, **kwargs):
        x = batch[f'{self.field}_char_id']
        # [batch_size, seq_len, fix_len]; char id 0 is padding
        mask = x.ne(0)
        # [batch_size, seq_len] number of characters per token
        lens = mask.sum(-1)
        char_mask = lens.gt(0)

        # [n, fix_len, n_embed] flatten all non-empty tokens into one batch
        x = self.embed(x[char_mask])
        # FIX: pack_padded_sequence requires the lengths tensor on the CPU;
        # lens may live on the GPU, so move it explicitly.
        x = pack_padded_sequence(x, lens[char_mask].cpu(), True, False)
        x, (h, _) = self.lstm(x)
        # [n, fix_len, n_out] concat the final hidden states of both directions
        h = torch.cat(torch.unbind(h), -1)
        # [batch_size, seq_len, n_out] scatter token vectors back into place
        embed = h.new_zeros(*lens.shape, h.size(-1))
        embed = embed.masked_scatter_(char_mask.unsqueeze(-1), h)

        return embed

    @property
    def embedding_dim(self) -> int:
        """Output size: one hidden state per LSTM direction, concatenated."""
        return self.lstm.hidden_size * 2


class CharRNNEmbedding(Embedding, AutoConfigurable):
    """Embedding factory wiring a ToChar transform to a CharRNN module."""

    def __init__(self,
                 field,
                 embed,
                 hidden_size,
                 max_word_length=None) -> None:
        """
        Args:
            field: The token field whose characters are embedded.
            embed: Character embedding size or a nested Embedding config.
            hidden_size: Hidden size per LSTM direction.
            max_word_length: Truncate words longer than this (None = unlimited).
        """
        super().__init__()
        self.field = field
        self.hidden_size = hidden_size
        self.embed = embed
        self.max_word_length = max_word_length

    def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
        """Ensure the char vocabulary exists and return the ToChar transform."""
        if isinstance(self.embed, Embedding):
            self.embed.transform(vocabs=vocabs)
        vocab_name = self.vocab_name
        if vocab_name not in vocabs:
            vocabs[vocab_name] = Vocab()
        return ToChar(self.field, vocab_name, max_word_length=self.max_word_length)

    @property
    def vocab_name(self):
        """Name of the character vocabulary for this field."""
        vocab_name = f'{self.field}_char'
        return vocab_name

    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        """Build the CharRNN module backed by the char vocabulary."""
        embed = self.embed
        if isinstance(self.embed, Embedding):
            embed = self.embed.module(vocabs=vocabs)
        return CharRNN(self.field, len(vocabs[self.vocab_name]), embed, self.hidden_size)