├── elit ├── server │ ├── __init__.py │ ├── parser_config.py │ ├── en_parser.py │ ├── en_util.py │ ├── format.py │ ├── service_parser.py │ ├── service_tokenizer.py │ └── en.py ├── components │ ├── parsers │ │ ├── udify │ │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── biaffine │ │ │ ├── __init__.py │ │ │ ├── mlp.py │ │ │ └── biaffine.py │ │ ├── constituency │ │ │ └── __init__.py │ │ └── second_order │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ └── treecrf_decoder.py │ ├── amr │ │ ├── seq2seq │ │ │ ├── __init__.py │ │ │ ├── dataset │ │ │ │ ├── __init__.py │ │ │ │ ├── IO.py │ │ │ │ ├── penman.py │ │ │ │ └── dataset.py │ │ │ └── evaluation.py │ │ ├── __init__.py │ │ └── amr_parser │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── amrio.py │ │ │ ├── customized_bart.py │ │ │ └── utils.py │ ├── srl │ │ ├── span_rank │ │ │ ├── util.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── eos │ │ └── __init__.py │ ├── mtl │ │ ├── __init__.py │ │ ├── tasks │ │ │ ├── ner │ │ │ │ └── __init__.py │ │ │ └── srl │ │ │ │ └── __init__.py │ │ └── loss_balancer.py │ ├── ner │ │ ├── __init__.py │ │ ├── biaffine_ner │ │ │ └── __init__.py │ │ └── rnn_ner.py │ ├── taggers │ │ ├── __init__.py │ │ ├── transformers │ │ │ └── __init__.py │ │ └── util.py │ ├── classifiers │ │ └── __init__.py │ ├── coref │ │ ├── __init__.py │ │ └── util.py │ ├── distillation │ │ ├── __init__.py │ │ ├── distillable_component.py │ │ └── schedulers.py │ ├── __init__.py │ ├── lambda_wrapper.py │ └── lemmatizer.py ├── pretrained │ ├── amr.py │ ├── dep.py │ ├── __init__.py │ ├── mtl.py │ └── coref.py ├── datasets │ ├── __init__.py │ ├── coref │ │ └── __init__.py │ ├── eos │ │ └── __init__.py │ ├── ner │ │ ├── __init__.py │ │ └── conll03.py │ ├── srl │ │ ├── __init__.py │ │ └── ontonotes5 │ │ │ └── __init__.py │ ├── parsing │ │ ├── __init__.py │ │ ├── ptb.py │ │ ├── semeval15.py │ │ └── conll_dataset.py │ └── lm │ │ └── __init__.py ├── layers │ ├── __init__.py │ ├── crf │ │ └── __init__.py │ ├── embeddings │ │ ├── 
__init__.py │ │ ├── util.py │ │ ├── fast_text.py │ │ └── char_rnn.py │ ├── pass_through_encoder.py │ ├── feed_forward.py │ ├── transformers │ │ ├── __init__.py │ │ └── pt_imports.py │ └── context_layer.py ├── metrics │ ├── __init__.py │ ├── amr │ │ └── __init__.py │ ├── parsing │ │ ├── __init__.py │ │ ├── attachmentscore.py │ │ ├── conllx_eval.py │ │ ├── span.py │ │ └── labeled_f1.py │ ├── srl │ │ ├── __init__.py │ │ └── srlconll.py │ ├── chunking │ │ ├── __init__.py │ │ ├── chunking_f1.py │ │ └── binary_chunking_f1.py │ ├── accuracy.py │ ├── metric.py │ ├── mtl.py │ └── f1.py ├── transform │ └── __init__.py ├── common │ ├── __init__.py │ ├── constant.py │ └── component.py ├── version.py ├── utils │ ├── __init__.py │ ├── init_util.py │ ├── rules.py │ ├── reflection.py │ └── string.py ├── main.py └── __init__.py ├── tests ├── demo │ ├── __init__.py │ └── train_amr3_parser.py ├── __init__.py ├── test_parser.py ├── test_seq2seq_amr.py ├── test_tokenizer.py ├── test_joint_model.py ├── test_service_coref_doc.py └── test_service_coref_online.py ├── LICENSE ├── docs ├── semantic_role_labeling.md ├── named_entity_recognition.md ├── tokenization.md ├── dependency_parsing.md ├── constituency_parsing.md ├── part_of_speech_tagging.md └── abstract_meaning_parsing.md ├── README.md ├── .github └── workflows │ └── python-package.yml └── setup.py /elit/server/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-18 18:31 4 | -------------------------------------------------------------------------------- /tests/demo/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-19 15:11 4 | -------------------------------------------------------------------------------- /elit/components/parsers/udify/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-21 20:21 -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:24 4 | -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:29 4 | -------------------------------------------------------------------------------- /elit/pretrained/amr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-10-31 14:35 4 | from elit.common.constant import ELIT_URL 5 | 6 | AMR3_BART_LARGE_EN = ELIT_URL + 'amr3_bart_large_elit_20211030_173300.zip' 7 | 8 | ALL = {} 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-18 19:21 4 | import os 5 | 6 | root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) 7 | 8 | 9 | def cdroot(): 10 | """ 11 | cd to project root, so models are saved in the root folder 12 | """ 13 | os.chdir(root) 14 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import elit 3 | 4 | 5 | class TestAMR(unittest.TestCase): 6 | def test_parse(self): 7 | parser = elit.load('KOREAN_TREEBANK_BIAFFINE_DEP') 8 | tree = 
parser('그는 르노가 3 월말까지 인수제의 시한을 갖고 있다고 덧붙였다 .'.split()) 9 | print(tree) 10 | 11 | 12 | if __name__ == '__main__': 13 | unittest.main() 14 | -------------------------------------------------------------------------------- /elit/pretrained/dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-03-26 01:06 4 | from elit.common.constant import ELIT_URL 5 | 6 | KOREAN_TREEBANK_BIAFFINE_DEP = ELIT_URL + 'ktb-bert-base-multilingual-cased_20220326_004709.zip' 7 | 'A single task dependency parsing model for Korea trained on the Korean Treebank. UAS = 92.42% LAS = 90.48%.' 8 | 9 | ALL = {} 10 | -------------------------------------------------------------------------------- /tests/test_seq2seq_amr.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import elit 3 | import elit.pretrained.amr 4 | 5 | 6 | class TestAMR(unittest.TestCase): 7 | def test_parse(self): 8 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 9 | amr = parser('The boy wants the girl to believe him.') 10 | print(amr) 11 | 12 | 13 | if __name__ == '__main__': 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /elit/components/srl/span_rank/util.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL 2 | import torch 3 | 4 | 5 | def block_orth_normal_initializer(input_size, output_size): 6 | weight = [] 7 | for o in output_size: 8 | for i in input_size: 9 | param = torch.FloatTensor(o, i) 10 | torch.nn.init.orthogonal_(param) 11 | weight.append(param) 12 | return torch.cat(weight) 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 
2020 Emory University 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /elit/components/amr/seq2seq/evaluation.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import penman 4 | 5 | 6 | def write_predictions(predictions_path, tokenizer, graphs): 7 | pieces = [penman.encode(g) for g in graphs] 8 | Path(predictions_path).write_text('\n\n'.join(pieces).replace(tokenizer.INIT, '')) 9 | return predictions_path 10 | 11 | 12 | def compute_smatch(pred, gold): 13 | import smatch 14 | with Path(pred).open() as p, Path(gold).open() as g: 15 | score = next(smatch.score_amr_pairs(p, g)) 16 | return score[2] 17 | 18 | 19 | def compute_bleu(gold_sentences, pred_sentences): 20 | from sacrebleu import corpus_bleu 21 | return corpus_bleu(pred_sentences, [gold_sentences]) 22 | -------------------------------------------------------------------------------- /elit/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/crf/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/common/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | -------------------------------------------------------------------------------- /elit/components/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/coref/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/taggers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/datasets/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/layers/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/metrics/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/coref/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | -------------------------------------------------------------------------------- /elit/components/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/tasks/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/mtl/tasks/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/srl/span_rank/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/ner/biaffine_ner/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/biaffine/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/amr/amr_parser/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | -------------------------------------------------------------------------------- /elit/components/parsers/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/parsers/second_order/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/taggers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs -------------------------------------------------------------------------------- /elit/components/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from .pipeline import Pipeline -------------------------------------------------------------------------------- /elit/version.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | __version__ = '2.0.0-alpha.10' 21 | -------------------------------------------------------------------------------- /elit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from . import rules 20 | from . import util 21 | -------------------------------------------------------------------------------- /elit/server/parser_config.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# Name of the default pretrained multi-task model the parser service loads.
# It matches a model constant exported by ``elit.pretrained.mtl`` (an English
# RoBERTa-base model; the acronym lists the covered tasks: LEM, POS, NER,
# DEP, SDP, CON, AMR).
MTL_MODEL = 'LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN'
" 14 | print(tokenizer.tokenize(text)) 15 | 16 | def test_segment(self): 17 | tokenizer = EnglishTokenizer() 18 | print(tokenizer.segment( 19 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 20 | 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 21 | 'University', '.'])) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /docs/semantic_role_labeling.md: -------------------------------------------------------------------------------- 1 | # Semantic Role Labeling 2 | 3 | Semantic role labeling parses the predicate-argument structure of a sentence often answers “who did what to whom”. Currently, ELIT supports span-based SRL. 4 | 5 | Users can load a MTL component and specify `tasks='srl'` to have sentences parsed by ELIT. E.g., 6 | 7 | ```python 8 | import elit 9 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 10 | print(nlp([['Emory','NLP','is','in','Atlanta']], tasks='srl')) 11 | ``` 12 | 13 | Outputs: 14 | 15 | ```python 16 | { 17 | "tok": [ 18 | ["Emory", "NLP", "is", "in", "Atlanta"] 19 | ], 20 | "srl": [ 21 | [[["ARG1", 0, 2, "Emory NLP"], ["PRED", 2, 3, "is"], ["ARG2", 3, 5, "in Atlanta"]]] 22 | ], 23 | } 24 | ``` 25 | 26 | `srl` stores the `(role, start, end, form)` of the predicates and arguments corresponding to each flattened predicate-argument structure. In this component, the OntoNotes 5 SRL annotations are used with an additional role `PRED` indicating the predicate. 
27 | 28 | -------------------------------------------------------------------------------- /elit/datasets/srl/ontonotes5/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/ontonotes-release-5.0.tgz#data/' 20 | CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/' 21 | -------------------------------------------------------------------------------- /elit/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from elit.pretrained import mtl 20 | from elit.pretrained import coref 21 | from elit.pretrained import amr 22 | from elit.pretrained import dep 23 | 24 | # Will be filled up during runtime 25 | ALL = {} 26 | -------------------------------------------------------------------------------- /elit/common/constant.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def read_raw_amr_data(
        paths: List[Union[str, Path]],
        use_recategorization=False,
        dereify=True,
        remove_wiki=False,
):
    """Load AMR graphs from one or more paths or glob patterns.

    Args:
        paths: A single path/glob pattern, or a collection of them.
        use_recategorization: If ``True``, rebuild each graph's ``snt``
            metadata from its ``tokens`` field, dropping PTB-style bracket
            tokens (``-LRB-``, ``-RRB-``, …); the original sentence is kept
            under ``snt_orig``.
        dereify: Passed through to :func:`pm_load`.
        remove_wiki: Passed through to :func:`pm_load`.

    Returns:
        A non-empty list of penman graphs (asserts otherwise).
    """
    import ast  # stdlib; local import keeps this module's import block untouched

    assert paths

    # A bare str/Path is a single pattern, not a collection of patterns.
    # Without the str/Path check, a str would pass the Iterable test and be
    # iterated character by character, globbing each character separately.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for path_ in paths:
        for path in glob.glob(str(path_)):
            path = Path(path)
            assert path.exists(), f'{path} not exist'
            graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))

    assert graphs, 'No graphs loaded'

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata['snt_orig'] = metadata['snt']
            # literal_eval instead of eval: the tokens metadata is a list
            # literal read from data files, which must never be executed as
            # arbitrary code.
            tokens = ast.literal_eval(metadata['tokens'])
            metadata['snt'] = ' '.join(
                [t for t in tokens if not (t.startswith(('-L', '-R')) and t.endswith('-'))])

    return graphs
Recognition](docs/named_entity_recognition.md) 8 | * [Constituency Parsing](docs/constituency_parsing.md) 9 | * [Dependency Parsing](docs/dependency_parsing.md) 10 | * [Semantic Role Labeling](docs/semantic_role_labeling.md) 11 | * [AMR Parsing](docs/abstract_meaning_parsing.md) 12 | * Coreference Resolution 13 | * Emotion Detection 14 | 15 | This is an open-source project led by the [Emory NLP](http://nlp.cs.emory.edu/) Research Laboratory and released under the [Apache License 2.0](LICENSE). 16 | 17 | * The latest release: [2.0](https://pypi.org/project/elit/) 18 | * Contact: [Jinho D. Choi](http://www.cs.emory.edu/~choi) 19 | 20 | ## Resources 21 | 22 | * [Getting Started](docs/getting_started.md) 23 | * [Data Format](docs/data_format.md) 24 | 25 | ## Citation 26 | 27 | * [ELIT: Emory Language and Information Toolkit](https://arxiv.org/abs/2109.03903), Han He, Liyan Xu, and Jinho D. Choi, arXiv, 2109.03903, cs.CL, 2021. 28 | -------------------------------------------------------------------------------- /elit/layers/pass_through_encoder.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class PassThroughEncoder(_PassThroughEncoder, ConfigTracker):
    """Identity (pass-through) encoder layer.

    Thin wrapper around ``alnlp``'s ``PassThroughEncoder`` — presumably it
    returns its input unchanged (the forward logic lives in the base class;
    confirm against ``alnlp``). Mixing in :class:`ConfigTracker` records the
    constructor arguments so the layer can be re-instantiated from a saved
    config.
    """

    def __init__(self, input_dim: int) -> None:
        # input_dim doubles as the output dimension of a pass-through layer.
        super().__init__(input_dim)
        # Capture ctor args (via locals()) for config serialization.
        ConfigTracker.__init__(self, locals())
def embedding_uniform(tensor: torch.Tensor, seed=233):
    """Fill ``tensor`` in place with reproducible uniform random values.

    The bound is ``sqrt(3 / fan_out)`` with ``fan_out`` taken as the last
    dimension (the embedding size). A dedicated, seeded ``torch.Generator``
    makes the initialization deterministic regardless of the global RNG state.

    Args:
        tensor: The embedding weight tensor to initialize (modified in place).
        seed: Seed for the dedicated random generator.

    Returns:
        The same tensor, filled in place.
    """
    limit = math.sqrt(3.0 / tensor.size(-1))
    rng = torch.Generator().manual_seed(seed)
    with torch.no_grad():
        return tensor.uniform_(-limit, limit, generator=rng)
class MovingAverage(object):
    """Tracks a fixed-length history of values per key and exposes their mean."""

    def __init__(self, maxlen=5) -> None:
        # One bounded deque per key; the oldest values fall off automatically.
        self._queue = defaultdict(lambda: deque(maxlen=maxlen))

    def append(self, key, value: float):
        """Record ``value`` in the history of ``key``."""
        self._queue[key].append(value)

    def average(self, key) -> float:
        """Mean of the recorded values for ``key`` (fails on an empty history)."""
        window = self._queue[key]
        return sum(window) / len(window)


class MovingAverageBalancer(MovingAverage, ConfigTracker):
    """Weights each task's loss by the ratio of the summed per-task average
    losses to that task's own average, optionally scaled by how the task's
    latest loss compares to its running average."""

    def __init__(self, maxlen=5, intrinsic_weighting=True) -> None:
        super().__init__(maxlen)
        ConfigTracker.__init__(self, locals())
        self.intrinsic_weighting = intrinsic_weighting

    def weight(self, task) -> float:
        """Return the loss weight for ``task`` from the recorded histories."""
        means = {name: self.average(name) for name in self._queue}
        w = sum(means.values()) / means[task]
        if self.intrinsic_weighting:
            # Boost tasks whose most recent loss exceeds their running average.
            latest = self._queue[task][-1]
            w *= latest / means[task]
        return w
Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from elit.common.constant import ELIT_URL 20 | 21 | LEM_POS_NER_DEP_SDP_CON_AMR_ELECTRA_BASE_EN = ELIT_URL + 'en_pos_ner_srl_dep_con_amr_electra_base_20201222.zip' 22 | LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN = ELIT_URL + 'en_pos_ner_srl_dep_con_amr_roberta_base_20210628_234225.zip' 23 | LEM_POS_NER_DEP_ROBERTA_BASE_EN = ELIT_URL + 'en_lem_pos_ner_ddr_roberta_base_20210325_121606.zip' 24 | 25 | ALL = {} 26 | -------------------------------------------------------------------------------- /elit/pretrained/coref.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | from elit.common.constant import ELIT_URL 20 | 21 | DOC_COREF_SPANBERT_BASE_EN = ELIT_URL + 'doc_coref_spanbert_base_20201222.zip' 22 | DOC_COREF_SPANBERT_LARGE_EN = ELIT_URL + 'doc_coref_spanbert_large_20201222.zip' 23 | ONLINE_COREF_SPANBERT_BASE_EN = ELIT_URL + 'online_coref_spanbert_base_20201222.zip' 24 | ONLINE_COREF_SPANBERT_LARGE_EN = ELIT_URL + 'online_coref_spanbert_large_20201222.zip' 25 | 26 | ALL = {} 27 | -------------------------------------------------------------------------------- /elit/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class CategoricalAccuracy(metrics.CategoricalAccuracy, Metric):
    """Multi-class accuracy, adapting alnlp's implementation to ELIT's Metric API."""

    @property
    def score(self):
        # Delegates to alnlp's accumulated metric; presumably a fraction in
        # [0, 1] given the percent formatting below — TODO confirm against alnlp.
        return self.get_metric()

    def __repr__(self) -> str:
        return f'Accuracy:{self.score:.2%}'
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | _PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#' 21 | PTB_TOKEN_TRAIN = _PTB_HOME + 'data/ptb.train.txt' 22 | PTB_TOKEN_DEV = _PTB_HOME + 'data/ptb.valid.txt' 23 | PTB_TOKEN_TEST = _PTB_HOME + 'data/ptb.test.txt' 24 | 25 | PTB_CHAR_TRAIN = _PTB_HOME + 'data/ptb.char.train.txt' 26 | PTB_CHAR_DEV = _PTB_HOME + 'data/ptb.char.valid.txt' 27 | PTB_CHAR_TEST = _PTB_HOME + 'data/ptb.char.test.txt' 28 | -------------------------------------------------------------------------------- /elit/server/en_parser.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Shared English tokenizer wiring sentence segmentation (eos) and tokenization.
service_tokenizer = ServiceTokenizer(eos, tokenize)
# Module-level parser singleton; loading the MTL model happens once at import
# time, so importing this module is expensive by design.
service_parser = ServiceParser(
    service_tokenizer=service_tokenizer,
    model=elit.load(parser_config.MTL_MODEL)
)
# noinspection PyAbstractClass
class DependencyModel(nn.Module):
    """Composes an embedding layer, a contextual encoder and a task decoder
    into a single dependency-parsing network.
    """

    def __init__(self, embed: nn.Module, encoder: nn.Module, decoder: nn.Module):
        super().__init__()
        self.embed = embed
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, batch, mask):
        """Run embed -> encode -> decode and return the decoder's output."""
        return self.decoder(self.encoder(self.embed(batch, mask=mask), mask), mask=mask)
In the `ner` field, each entity is represented as a tuple of `(type, start, end, form)`.
class FeedForward(feedforward.FeedForward, ConfigTracker):
    """Feed-forward network layer.

    Thin wrapper over alnlp's ``FeedForward`` that also records its constructor
    arguments via ``ConfigTracker`` so the layer can be re-instantiated from a
    saved configuration.

    Args:
        input_dim: Dimensionality of the input features.
        num_layers: Number of linear layers.
        hidden_dims: Output size of each layer (one int, or one per layer).
        activations: Activation name(s), one shared or one per layer.
        dropout: Dropout probability (one shared or one per layer).
    """

    def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]],
                 activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None:
        super().__init__(input_dim, num_layers, hidden_dims, activations, dropout)
        # Capture the call arguments so the config can rebuild this layer later.
        ConfigTracker.__init__(self, locals())
class ChunkingF1(F1):
    """Span-level F1 over sequence-labeling tag sequences.

    Tag sequences are decoded into entity spans and compared as sets, so only
    exact span-and-type matches count as correct.
    """

    def __call__(self, pred_tags: List[List[str]], gold_tags: List[List[str]]):
        for predicted, expected in zip(pred_tags, gold_tags):
            predicted_spans = set(get_entities(predicted))
            expected_spans = set(get_entities(expected))
            self.nb_pred += len(predicted_spans)
            self.nb_true += len(expected_spans)
            self.nb_correct += len(predicted_spans & expected_spans)
def tokenize(sents: List[str]) -> List[List[str]]:
    """Tokenize each sentence, honoring pre-tokenized input.

    Since text from users seldom contains NUL, a '\0' character inside a
    sentence marks it as pre-tokenized: it is split on '\0' instead of being
    run through the rule-based tokenizer.
    """
    results = []
    for sent in sents:
        if '\0' in sent:
            results.append(sent.split('\0'))
        else:
            results.append(tokenizer.tokenize(sent))
    return results
'Reentrancies', 33 | 'Negations', 34 | 'Named Ent.', 35 | 'Wikification'] 36 | print('\t'.join(f'{test_score[k].score * 100:.1f}' for k in finegrained)) 37 | cprint(f'Model saved in [cyan]{save_dir}[/cyan]') 38 | print(f'To perform wikification, refer to [BLINK](https://github.com/SapienzaNLP/spring/blob/8fad6a4ce59132b22d6bdb4d4eb3a9aa5223ead6/bin/blinkify.py) which will give you the extra `0.3` F1.') 39 | -------------------------------------------------------------------------------- /elit/common/component.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class Component(Configurable, ABC):
    """Base class of every ELIT component: a configurable object that predicts on data."""

    @abstractmethod
    def predict(self, data: Any, **kwargs):
        """Predict on data.

        Args:
            data: Input of any type; the accepted type is defined by each subclass.
            **kwargs: Additional subclass-specific arguments.

        Returns:
            Prediction results; the concrete type is defined by each subclass.

        Raises:
            NotImplementedError: If the subclass does not override this method.
        """
        # inspect.stack()[0][3] yields the current method name for the message.
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def __call__(self, data, **kwargs):
        # Syntactic sugar: component(data) delegates to component.predict(data).
        return self.predict(data, **kwargs)
def guess_tagging_scheme(labels: List[str]) -> str:
    """Infer the tagging scheme used by a label set.

    The prefix before the first '-' of each label (e.g. ``B`` in ``B-PER``)
    is collected; if the resulting prefix set exactly matches one of the known
    schemes, that scheme name is returned.

    Args:
        labels: All tag labels of a tagset, e.g. ``['B-PER', 'I-PER', 'O']``.

    Returns:
        One of ``'BIO'``, ``'BIOUL'``, ``'BMES'``, ``'IOBES'``, or ``None``
        when no scheme matches.
    """
    prefixes = {label.split('-')[0] for label in labels}
    for candidate in ('BIO', 'BIOUL', 'BMES', 'IOBES'):
        if prefixes == set(candidate):
            return candidate
    return None
class Input(BaseModel):
    """A single parsing request accepted by the ELIT server."""

    # Raw text: one document string, or a list of pre-segmented sentences.
    text: Union[str, List[str]] = None
    # Pre-tokenized sentences; presumably used instead of ``text`` when
    # provided — confirm against the service parser.
    tokens: List[List[str]] = None
    # Which annotators to run; defaults to everything the server offers.
    models: List[str] = ["lem", "pos", "ner", "con", "dep", "srl", "amr", "dcr", "ocr"]

    # For coref
    speaker_ids: Union[int, List[int]] = None
    genre: str = None
    # Incremental state carried across turns for online coreference.
    coref_context: OnlineCorefContext = None
    return_coref_prob: bool = False

    language: str = 'en'
    verbose: bool = True
# Maps PTB bracket escape tokens (e.g. -LRB-) and typographic quotes/dashes to
# their plain ASCII equivalents.
PTB_TOKEN_MAPPING = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": '"',
    "''": '"',
    "`": "'",
    '«': '"',
    '»': '"',
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '„': '"',
    '‹': "'",
    '›': "'",
    "\u2013": "--",  # en dash
    "\u2014": "--",  # em dash
}
class LambdaComponent(Component):
    """Wraps an arbitrary callable as a Component so plain functions can join pipelines."""

    def __init__(self, function: Callable) -> None:
        super().__init__()
        self.function = function
        # Store the import path so the callable can be restored by from_config.
        self.config['function'] = classpath_of(function)

    def predict(self, data: Any, **kwargs):
        """Invoke the wrapped callable on ``data``.

        The private keyword ``_elit_unpack``, when truthy, causes ``data`` to
        be unpacked as positional arguments; it is popped and never forwarded.
        """
        unpack = kwargs.pop('_elit_unpack', None)
        if unpack:
            return self.function(*data, **kwargs)
        return self.function(data, **kwargs)

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Re-create the component from a serialized config.

        ``meta['classpath']`` names the component class and ``meta['function']``
        names the wrapped callable; both are resolved by import path.
        """
        cls = str_to_type(meta['classpath'])
        function = meta['function']
        function = object_from_classpath(function)
        return cls(function)
class Metric(ABC):
    """Abstract base class for evaluation metrics.

    A metric exposes a scalar ``score`` and supports rich comparisons against
    plain numbers, so metrics can be used directly in expressions such as
    ``if metric > best_score:``.

    NOTE(review): defining ``__eq__`` leaves subclasses unhashable unless they
    also define ``__hash__``; no visible caller hashes metrics, so this is
    deliberately left unchanged.
    """

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __eq__(self, other):
        return self.score == other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    def __ne__(self, other):
        return self.score != other

    @property
    @abstractmethod
    def score(self):
        """The scalar value of this metric."""
        pass

    @abstractmethod
    def __call__(self, pred, gold, mask=None):
        """Accumulate statistics from a batch of predictions and gold labels."""
        pass

    def __repr__(self) -> str:
        # Bug fix: the format spec belongs inside the braces. The original
        # f'{self.score}:.4f' rendered the literal text ':.4f' after the score.
        return f'{self.score:.4f}'

    def __float__(self):
        return self.score

    @abstractmethod
    def reset(self):
        """Clear accumulated statistics so the metric can be reused."""
        pass
task_parser.add_parser(name='serve', help='start http server', 27 | description='A http server for ELIT') 28 | add_model_option(server_parser) 29 | server_parser.add_argument('--port', type=int, default=8000) 30 | server_parser.add_argument('--workers', type=int, default=1, help='number of workers') 31 | 32 | args = arg_parser.parse_args() 33 | parser_config.MTL_MODEL = args.parser_model 34 | 35 | if args.task == 'parse': 36 | from elit.server.en_parser import service_parser 37 | for line in sys.stdin: 38 | line = line.strip() 39 | doc = service_parser.parse([Input(text=line)])[0] 40 | print(doc) 41 | elif args.task == 'serve': 42 | from elit.server import server 43 | server.run(port=args.port, workers=args.workers) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /docs/tokenization.md: -------------------------------------------------------------------------------- 1 | # Tokenization 2 | 3 | Tokenization is the task to split a sentence into tokens such that each token represents a word or a punctuation. ELIT features a rule-based English tokenizer which offers both tokenization and sentence segmentation. 4 | 5 | ### Tokenization 6 | 7 | The `EnglishTokenizer` class handles common abbreviations, apostrophes, concatenation words, hyphens, network protocols, emojis, emails, html entities, list item units with expert-crafted rules. E.g.: 8 | 9 | ```python 10 | from elit.components.tokenizer import EnglishTokenizer 11 | tokenizer = EnglishTokenizer() 12 | text = "Emory NLP is a research lab in Atlanta, GA. It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 
13 | print(tokenizer.tokenize(text)) 14 | ``` 15 | 16 | Output: 17 | 18 | ```python 19 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.'] 20 | ``` 21 | 22 | ### Sentence Segmentation 23 | 24 | The tokenized tokens can be fed into `tokenizer.segment` for sentence segmentation. E.g.: 25 | 26 | ```python 27 | from elit.components.tokenizer import EnglishTokenizer 28 | tokenizer = EnglishTokenizer() 29 | print(tokenizer.segment( 30 | ['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.', 'It', 'is', 'founded', 'by', 31 | 'Jinho', 'D.', 'Choi', 'in', '2014', '.', 'Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 32 | 'University', '.'])) 33 | ``` 34 | 35 | Output: 36 | 37 | ```python 38 | [['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.'], ['It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.'], ['Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.']] 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /elit/layers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | 20 | from elit.common.constant import ELIT_URL 21 | # mute transformers 22 | import logging 23 | 24 | logging.getLogger('transformers.file_utils').setLevel(logging.ERROR) 25 | logging.getLogger('transformers.filelock').setLevel(logging.ERROR) 26 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR) 27 | logging.getLogger('transformers.configuration_utils').setLevel(logging.ERROR) 28 | logging.getLogger('transformers.modeling_tf_utils').setLevel(logging.ERROR) 29 | logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR) 30 | logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR) 31 | 32 | zh_albert_models_google = { 33 | 'albert_base_zh': ELIT_URL + 'embeddings/albert_base_zh.tar.gz', # Provide mirroring 34 | 'albert_large_zh': 'https://storage.googleapis.com/albert_models/albert_large_zh.tar.gz', 35 | 'albert_xlarge_zh': 'https://storage.googleapis.com/albert_models/albert_xlarge_zh.tar.gz', 36 | 'albert_xxlarge_zh': 'https://storage.googleapis.com/albert_models/albert_xxlarge_zh.tar.gz', 37 | } 38 | # bert_models_google['chinese_L-12_H-768_A-12'] = elit_URL + 'embeddings/chinese_L-12_H-768_A-12.zip' 39 | -------------------------------------------------------------------------------- /elit/layers/transformers/pt_imports.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
class AutoModel_(AutoModel):
    """``AutoModel`` variant that can build a model from config only.

    With ``training=False`` the architecture is instantiated without
    downloading pretrained weights — presumably because the real parameters
    are loaded afterwards from a saved component (TODO confirm with callers).
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
        if training:
            # Normal path: fetch config and pretrained weights.
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        if isinstance(pretrained_model_name_or_path, str):
            # Identifier given: resolve the config, then build with random weights.
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
            return super().from_config(config)
        # A config object was passed directly; extra kwargs would be silently
        # ignored by from_config, so reject them.
        assert not kwargs
        return super().from_config(pretrained_model_name_or_path)
We have also released a single-task dependency parsing model for Korean, trained on the Korean Treebank, which is demonstrated using the following code.
class BinaryChunkingF1(F1):
    """F1 over spans decoded from binary (0/1) boundary tag matrices.

    Each sequence's nonzero tag positions are interpreted as span boundary
    offsets; the resulting predicted and gold span sets are compared with the
    set-based micro F1 of the parent class.
    """

    def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None):
        """Accumulate counts for one batch of predicted vs. gold tags.

        Args:
            pred_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            gold_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            lens: True sequence lengths; defaults to the padded length for
                every sequence in the batch.
        """
        if lens is None:
            lens = [gold_tags.size(1)] * gold_tags.size(0)
        self.update(self.decode_spans(pred_tags, lens), self.decode_spans(gold_tags, lens))

    def update(self, pred_tags, gold_tags):
        # Compare each sequence's spans as sets; super().__call__ accumulates
        # the micro-averaged correct/pred/true counters.
        for pred, gold in zip(pred_tags, gold_tags):
            super().__call__(set(pred), set(gold))

    @staticmethod
    def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
        """Convert binary tag matrices into per-sequence ``(start, end)`` spans.

        A sequence with no nonzero tag yields the single span ``(0, len)``.
        Otherwise consecutive nonzero offsets delimit spans, the last one
        closed at the sequence length. NOTE(review): offsets act as span
        *start* positions, so tokens before the first nonzero offset are not
        covered unless offset 0 is tagged — confirm against the EOS task.

        Args:
            pred_tags: Binary tag matrix, shape ``[batch, seq_len]``.
            lens: Sequence lengths, a list or a 1-D tensor.

        Returns:
            One list of ``(start, end)`` tuples per sequence.
        """
        if isinstance(lens, torch.Tensor):
            lens = lens.tolist()
        batch_pred = defaultdict(list)
        # nonzero() yields (batch_index, offset) pairs in row-major order,
        # so per-batch offset lists come out sorted.
        for batch, offset in pred_tags.nonzero(as_tuple=False).tolist():
            batch_pred[batch].append(offset)
        batch_pred_spans = [[(0, l)] for l in lens]
        for batch, offsets in batch_pred.items():
            l = lens[batch]
            batch_pred_spans[batch] = list(zip(offsets, offsets[1:] + [l]))
        return batch_pred_spans
def pipeline(*pipes) -> elit.components.pipeline.Pipeline:
    """Create a :class:`~elit.components.pipeline.Pipeline` of components.

    Args:
        *pipes: Optional components to pre-populate the pipeline with.

    Returns:
        A pipeline wrapping the given components (empty if none were given).
    """
    pipeline_cls = elit.components.pipeline.Pipeline
    return pipeline_cls(*pipes)
def add_lemma_rules_to_sample(sample: dict):
    """Convert gold lemmata into edit-script "lemma rules" for the tagger.

    NOTE(review): at this point in the pipeline the ``'tag'`` field appears to
    hold each token's gold *lemma* rather than a POS tag — confirm against the
    dataset feeding ``TransformerLemmatizer.build_dataset``. The computed rules
    overwrite both ``'lemma'`` and ``'tag'`` so the underlying tagger learns to
    predict rules; placeholder ``'_'`` lemmata are passed through unchanged.
    Samples that already have a ``'lemma'`` field are left untouched.

    Args:
        sample: A sample dict containing at least ``'token'``.

    Returns:
        The same dict, mutated in place.
    """
    if 'tag' in sample and 'lemma' not in sample:
        lemma_rules = [gen_lemma_rule(word, lemma)
                       if lemma != "_" else "_"
                       for word, lemma in zip(sample['token'], sample['tag'])]
        sample['lemma'] = sample['tag'] = lemma_rules
    return sample
class TreeCRFDecoder(BiaffineDecoder):
    """Biaffine decoder extended with second-order sibling scores and a TreeCRF.

    On top of the first-order arc/relation scores from
    :class:`BiaffineDecoder`, it scores (sibling, dependent, head) triples via
    triaffine attention, enabling second-order CRF training/decoding through
    ``CRF2oDependency``.
    """

    def __init__(self, hidden_size, n_mlp_arc, n_mlp_sib, n_mlp_rel, mlp_dropout, n_rels) -> None:
        """
        Args:
            hidden_size: Size of the encoder output fed to the MLPs.
            n_mlp_arc: Hidden size of the arc MLPs (first-order decoder).
            n_mlp_sib: Hidden size of the sibling MLPs.
            n_mlp_rel: Hidden size of the relation MLPs.
            mlp_dropout: Dropout applied inside every MLP.
            n_rels: Number of dependency relation labels.
        """
        super().__init__(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels)
        # Three projections of each token: as sibling, dependent, and head.
        self.mlp_sib_s = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)
        self.mlp_sib_d = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)
        self.mlp_sib_h = MLP(hidden_size, n_mlp_sib, dropout=mlp_dropout)

        self.sib_attn = Triaffine(n_in=n_mlp_sib, bias_x=True, bias_y=True)
        self.crf = CRF2oDependency()

    def forward(self, x, mask=None, **kwargs: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute first- and second-order scores.

        Args:
            x: Encoder output — assumes shape ``[batch, seq_len, hidden_size]``;
                TODO confirm against :class:`BiaffineDecoder`.
            mask: Token mask forwarded to the first-order decoder.

        Returns:
            ``(s_arc, s_sib, s_rel)`` score tensors; ``s_sib`` has shape
            ``[batch, seq_len, seq_len, seq_len]`` after the permute below.
        """
        s_arc, s_rel = super(TreeCRFDecoder, self).forward(x, mask)
        sib_s = self.mlp_sib_s(x)
        sib_d = self.mlp_sib_d(x)
        sib_h = self.mlp_sib_h(x)
        # [batch_size, seq_len, seq_len, seq_len]
        s_sib = self.sib_attn(sib_s, sib_d, sib_h).permute(0, 3, 1, 2)
        return s_arc, s_sib, s_rel
def loads(string, dereify=None, remove_wiki=False):
    """Parse PENMAN graphs from a string.

    Args:
        string: Text containing one or more PENMAN-encoded graphs.
        dereify: Whether to restore reified relations; ``None`` selects the
            module default model.
    remove_wiki: When True, every ``:wiki`` value is replaced with ``'+'``.

    Returns:
        The list of parsed graphs.
    """
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        graphs = [_remove_wiki(g) for g in graphs]
    return graphs
class AMRGraph(penman.Graph):
    """A :class:`penman.Graph` whose ``str()`` form is its PENMAN notation.

    NOTE(review): this serializes with ``penman.encode`` and therefore the
    library's default model, unlike the module-level :func:`encode` which uses
    the AMR model — confirm the difference is intentional.
    """

    def __str__(self):
        return penman.encode(self)
SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)


def replace_with_separator(text, separator, regexs):
    """Rewrite every match of each regex as ``group1 + separator + group2``."""
    replacement = r"\1" + separator + r"\2"
    for regex in regexs:
        text = regex.sub(replacement, text)
    return text


def split_sentence(text, best=True):
    """Split raw text into sentences, handling CJK and Latin punctuation.

    Args:
        text: The input text.
        best: When True, each line is further split into sentences while
            abbreviation/acronym periods (e.g. ``Dr.``, ``U.S.``) are
            protected from producing false boundaries; when False, each
            non-empty line is yielded as-is.

    Yields:
        Sentences in their original order.
    """
    # Insert line breaks after CJK terminal punctuation (including quoted forms).
    text = re.sub('([。!?\?])([^”’])', r"\1\n\2", text)
    text = re.sub('(\.{6})([^”’])', r"\1\n\2", text)
    text = re.sub('(\…{2})([^”’])', r"\1\n\2", text)
    text = re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', text)
    for chunk in (piece.strip() for piece in text.split("\n")):
        if not chunk:
            continue
        if best:
            # Mask abbreviation periods with SEPARATOR so RE_SENTENCE cannot
            # split on them, then restore the masked spaces per sentence.
            masked = replace_with_separator(chunk, SEPARATOR, [AB_SENIOR, AB_ACRONYM])
            for match in RE_SENTENCE.finditer(masked):
                yield replace_with_separator(match.group(), r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])
        else:
            yield chunk
[ "**" ] 9 | pull_request: 10 | branches: [ "**" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: [3.7, 3.8, 3.9, '3.10'] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install -e . 31 | python -m pip install pytest 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | # - name: Lint with flake8 34 | # run: | 35 | # # stop the build if there are Python syntax errors or undefined names 36 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest tests 42 | deploy: 43 | needs: build 44 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - uses: actions/checkout@v2 49 | # - name: Set up Python 3.7 50 | # uses: actions/setup-python@v2 51 | # with: 52 | # python-version: 3.7 53 | - name: Install dependencies 54 | run: | 55 | python -m pip install --upgrade pip 56 | python -m pip install setuptools wheel twine 57 | - name: Deploy to PyPI 58 | run: | 59 | python setup.py sdist bdist_wheel 60 | python -m twine upload dist/* 61 | env: 62 | TWINE_USERNAME: __token__ 63 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 64 | TWINE_REPOSITORY: pypi 65 | -------------------------------------------------------------------------------- /docs/constituency_parsing.md: -------------------------------------------------------------------------------- 1 | # Constituency Parsing 2 | 3 | Constituency parsing is the task of 
parsing a sentence to its [phrase structure grammar](https://en.wikipedia.org/wiki/Phrase_structure_grammar) which represents its syntax in the form of nested constituent structure. 4 | 5 | Users can load a MTL component and specify `tasks='con'` to have sentences parsed by ELIT. E.g., 6 | 7 | ```python 8 | import elit 9 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 10 | print(nlp([['Emory', 'NLP', 'is', 'a', 'research', 'lab', 'in', 'Atlanta', ',', 'GA', '.'], 11 | ['It', 'is', 'founded', 'by', 'Jinho', 'D.', 'Choi', 'in', '2014', '.'], 12 | ['Dr.', 'Choi', 'is', 'a', 'professor', 'at', 'Emory', 'University', '.']] 13 | , tasks='con')) 14 | ``` 15 | 16 | Outputs: 17 | 18 | ```python 19 | { 20 | "con": [ 21 | ["TOP", [["S", [["NP", [["_", ["Emory"]], ["_", ["NLP"]]]], ["VP", [["_", ["is"]], ["NP", [["NP", [["_", ["a"]], ["_", ["research"]], ["_", ["lab"]]]], ["PP", [["_", ["in"]], ["NP", [["NP", [["_", ["Atlanta"]]]], ["_", [","]], ["NP", [["_", ["GA"]]]]]]]]]]]], ["_", ["."]]]]]], 22 | ["TOP", [["S", [["NP", [["_", ["It"]]]], ["VP", [["_", ["is"]], ["VP", [["_", ["founded"]], ["PP", [["_", ["by"]], ["NP", [["_", ["Jinho"]], ["_", ["D."]], ["_", ["Choi"]]]]]], ["PP", [["_", ["in"]], ["NP", [["_", ["2014"]]]]]]]]]], ["_", ["."]]]]]], 23 | ["TOP", [["S", [["NP", [["_", ["Dr."]], ["_", ["Choi"]]]], ["VP", [["_", ["is"]], ["NP", [["NP", [["_", ["a"]], ["_", ["professor"]]]], ["PP", [["_", ["at"]], ["NP", [["_", ["Emory"]], ["_", ["University"]]]]]]]]]], ["_", ["."]]]]]] 24 | ], 25 | "tok": [ 26 | ["Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",", "GA", "."], 27 | ["It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", "."], 28 | ["Dr.", "Choi", "is", "a", "professor", "at", "Emory", "University", "."] 29 | ] 30 | } 31 | ``` 32 | 33 | `con` stores the constituency trees, specifically `(label, child-constituents)` for the non-terminal constituents and `form` for the terminals. 
Note that we designed a nested-list representation, with the round brackets replaced by square brackets, to avoid ambiguity and to make it compatible with JSON. Unless it is being printed out, the `Document` class converts our nested-list structure back to the conventional bracketed tree.
class MetricDict(Metric, dict):
    """An ordered collection of named metrics that itself behaves as a metric.

    ``score`` is the arithmetic mean of the member scores, and
    ``__call__``/``reset`` fan out to every member metric.
    """
    _COLORS = ["magenta", "cyan", "green", "yellow"]

    @property
    def score(self):
        # Mean of member scores. An empty dict scores 0.0 instead of raising
        # ZeroDivisionError (e.g. before any task metric is registered).
        if not self:
            return 0.0
        return sum(float(x) for x in self.values()) / len(self)

    def __call__(self, pred, gold):
        """Feed one (pred, gold) pair to every member metric."""
        for metric in self.values():
            metric(pred, gold)

    def reset(self):
        """Reset every member metric."""
        for metric in self.values():
            metric.reset()

    def __repr__(self) -> str:
        return ' '.join(f'({k} {v})' for k, v in self.items())

    def cstr(self, idx=None, level=0) -> str:
        """Render the nested metrics as a colorized (rich-markup) string.

        Args:
            idx: Single-element list used as a mutable color counter shared
                across recursive calls.
            level: Nesting depth; selects the bracket pair (capped at 2).
        """
        if idx is None:
            idx = [0]
        prefix = ''
        for k, v in self.items():
            color = self._COLORS[idx[0] % len(self._COLORS)]
            idx[0] += 1
            child_is_dict = isinstance(v, MetricDict)
            _level = min(level, 2)
            lb = '{[('
            rb = '}])'
            k = f'[bold][underline]{k}[/underline][/bold]'
            prefix += f'[{color}]{lb[_level]}{k} [/{color}]'
            if child_is_dict:
                prefix += v.cstr(idx, level + 1)
            else:
                prefix += f'[{color}]{v}[/{color}]'
            prefix += f'[{color}]{rb[_level]}[/{color}]'
        return prefix
def classpath_of(obj) -> str:
    """Return the fully qualified class path of ``obj``.

    Plain functions are resolved through their defining module (their class
    would otherwise always be ``builtins.function``).

    Args:
        obj: Any object or function.

    Returns:
        Dotted path such as ``'builtins.int'``.
    """
    if inspect.isfunction(obj):
        return module_path_of(obj)
    cls = obj.__class__
    return f"{cls.__module__}.{cls.__name__}"
def type_to_str(type_object) -> str:
    """Convert a type object to its class path in str format.

    Inverse of :func:`str_to_type`. The implementation parses the standard
    ``repr`` of a class, e.g. ``"<class 'collections.Counter'>"``. This block
    was corrupted in the source (the ``"<class '"`` literals were stripped,
    leaving ``startswith("")`` and a dangling suffix strip); reconstructed here.

    Args:
        type_object: A type, e.g. ``dict``.

    Returns:
        The class path, e.g. ``'collections.Counter'``; builtins keep their
        bare name, e.g. ``'dict'``.

    Raises:
        AssertionError: If ``str(type_object)`` is not of the form
            ``<class '...'>``.
    """
    cls_name = str(type_object)
    assert cls_name.startswith("<class '"), 'illegal input'
    cls_name = cls_name[len("<class '"):]
    assert cls_name.endswith("'>"), 'illegal input'
    cls_name = cls_name[:-len("'>")]
    return cls_name
def bucket_distance(offsets):
    """Bucket distances into 10 semi-logarithmic bins.

    Distances 0-4 map to themselves; larger values map to
    ``floor(log2(d)) + 3``; everything is clamped into ``[0, 9]``.

    Args:
        offsets: Tensor of distances (assumed non-negative).

    Returns:
        LongTensor of bucket ids with the same shape as ``offsets``.
    """
    log_bucket = torch.log2(offsets.to(torch.float)).to(torch.long) + 3
    use_identity = (offsets <= 4).to(torch.long)
    bucketed = use_identity * offsets + (1 - use_identity) * log_bucket
    return torch.clamp(bucketed, 0, 9)
class F1(Metric, ABC):
    """Micro-averaged precision/recall/F1 accumulated from set overlaps.

    The counters ``nb_correct``/``nb_pred``/``nb_true`` are part of the public
    interface (subclasses and callers read them), so their names are fixed.
    """

    def __init__(self, nb_pred=0, nb_true=0, nb_correct=0) -> None:
        super().__init__()
        self.nb_pred = nb_pred
        self.nb_true = nb_true
        self.nb_correct = nb_correct

    def __call__(self, pred: set, gold: set):
        """Accumulate one instance's predicted and gold item sets."""
        self.nb_pred += len(pred)
        self.nb_true += len(gold)
        self.nb_correct += len(pred & gold)

    @property
    def prf(self):
        """(precision, recall, f1) tuple; each component is 0.0 when undefined."""
        p = self.nb_correct / self.nb_pred if self.nb_pred > 0 else .0
        r = self.nb_correct / self.nb_true if self.nb_true > 0 else .0
        f = 2 * p * r / (p + r) if p + r > 0 else .0
        return p, r, f

    @property
    def score(self):
        return self.prf[-1]

    def reset(self):
        self.nb_correct = self.nb_pred = self.nb_true = 0

    def __repr__(self) -> str:
        p, r, f = self.prf
        return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}"
NotImplementedError() 73 | 74 | def reset(self): 75 | self.f = self.r = self.p = 0 76 | 77 | def __repr__(self) -> str: 78 | p, r, f = self.p, self.r, self.f 79 | return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}" 80 | -------------------------------------------------------------------------------- /elit/metrics/srl/srlconll.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def official_conll_05_evaluate(pred_path, gold_path):
    """Score SRL output with the official CoNLL-2005 ``srl-eval.pl`` script.

    The script is invoked twice with the file arguments swapped — once to read
    off recall, once to read off precision — and F1 is recomputed from the
    pair. Returns ``(precision, recall, f1)`` as fractions in [0, 1].
    """
    script_root = get_resource('http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
    lib_path = f'{script_root}/lib'
    if lib_path not in os.environ.get("PERL5LIB", ""):
        os.environ['PERL5LIB'] = f'{lib_path}:{os.environ.get("PERL5LIB", "")}'
    bin_path = f'{script_root}/bin'
    if bin_path not in os.environ.get('PATH', ''):
        os.environ['PATH'] = f'{bin_path}:{os.environ.get("PATH", "")}'

    def overall_score(report):
        # Pulls field 5 of line 6 of the perl report (the overall-score row
        # of srl-eval.pl's table) and converts percent to a fraction.
        return float(report.strip().split("\n")[6].strip().split()[5]) / 100

    conll_recall = overall_score(run_cmd(f'perl {script_root}/bin/srl-eval.pl {gold_path} {pred_path}'))
    conll_precision = overall_score(run_cmd(f'perl {script_root}/bin/srl-eval.pl {pred_path} {gold_path}'))
    if conll_recall + conll_precision > 0:
        conll_f1 = 2 * conll_recall * conll_precision / (conll_recall + conll_precision)
    else:
        conll_f1 = 0
    return conll_precision, conll_recall, conll_f1


def run_perl(script, src, dst=None):
    """Run ``perl script src`` with a user-local perl lib and dump stdout to ``dst``.

    NOTE(review): PERL5LIB is cleared before the call, and ``dst`` is opened
    unconditionally — calling with the default ``dst=None`` would raise.
    Presumably callers always pass ``dst``; confirm before relying on it.
    """
    os.environ['PERL5LIB'] = f''
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # Common failure is missing perl modules; install locally with e.g.:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(err)
    with open(dst, 'w') as ofile:
        ofile.write(out)
    return dst
import string

__author__ = 'Jinho D. Choi'


def is_range(c, begin, end):
    """True if character ``c`` lies in the inclusive codepoint range."""
    return begin <= c <= end


def is_single_quote(c):
    """ASCII quote/backtick or the Unicode single-quote block U+2018..U+201B."""
    return c in {'\'', '`'} or is_range(c, u'\u2018', u'\u201B')


def is_double_quote(c):
    """ASCII double quote or the Unicode double-quote block U+201C..U+201F."""
    return c == '"' or is_range(c, u'\u201C', u'\u201F')


def is_left_bracket(c):
    return c in {'(', '{', '[', '<'}


def is_right_bracket(c):
    return c in {')', '}', ']', '>'}


def is_bracket(c):
    return is_left_bracket(c) or is_right_bracket(c)


def is_hyphen(c):
    """ASCII hyphen or the Unicode dash block U+2010..U+2014."""
    return c == '-' or is_range(c, u'\u2010', u'\u2014')


def is_arrow(c):
    """Any of the three Unicode arrow blocks."""
    return is_range(c, u'\u2190', u'\u21FF') or is_range(c, u'\u27F0', u'\u27FF') or is_range(c, u'\u2900', u'\u297F')


def is_currency(c):
    """Dollar sign, U+00A2..U+00A5, or the currency-symbols block U+20A0..U+20CF."""
    return c == '$' or is_range(c, u'\u00A2', u'\u00A5') or is_range(c, u'\u20A0', u'\u20CF')


def is_final_mark(c):
    """Sentence-final punctuation, including doubled marks U+203C, U+2047..U+2049."""
    return c in {'.', '?', '!', u'\u203C'} or is_range(c, u'\u2047', u'\u2049')


def is_punct(c):
    return c in string.punctuation


def collapse_digits(s):
    """Replace digits with a ``'0'`` placeholder, dropping digit-adjacent
    punctuation.

    Characters glued to a *following* digit are dropped when they are a
    currency sign or one of ``.-+#`` (number prefixes), or a separator
    (hyphen, ``,:/=``) sandwiched between digits. A ``%`` right after a digit
    is dropped too. A digit is itself dropped only when the previously
    *emitted* character was already ``'0'``; every other character passes
    through unchanged.
    """
    digits = [ch.isdigit() for ch in s]
    last = ['']  # last emitted character; a list so the closure can mutate it

    def emit(i, c):
        next_is_digit = i + 1 < len(s) and digits[i + 1]
        prev_is_digit = i - 1 >= 0 and digits[i - 1]
        if next_is_digit:
            # prefix glued to a number: $1  .5  -2  +3  #4
            if is_currency(c) or c in {'.', '-', '+', '#'}:
                return ''
            # separator between two digits: 1,0  1:2  1/2  1-2  1=2
            if prev_is_digit and (is_hyphen(c) or c in {',', ':', '/', '='}):
                return ''
        elif prev_is_digit:
            if c == '%':  # percent suffix
                return ''
        elif digits[i]:
            if last[0] == '0':  # collapse a run of already-emitted digits
                return ''
        last[0] = '0' if digits[i] else c
        return last[0]

    return ''.join(emit(i, c) for i, c in enumerate(s))
def dfs_linearize_tokenize(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False, text_key='snt') -> dict:
    """Linearize a sample's AMR graph (if any) and tokenize its text, in place.

    When an ``amr`` graph is present, it is DFS-linearized through the
    tokenizer and the sentence text is pulled from the graph metadata;
    otherwise ``sample['text']`` is used directly.

    :param sample: mutable sample dict; gains ``graph_tokens``,
        ``graph_token_ids``, ``text`` and ``text_token_ids`` keys.
    :param tokenizer: PENMAN-aware BART tokenizer used for both graph and text.
    :param remove_space: strip all whitespace from the text before encoding.
    :param text_key: metadata key holding the sentence when an AMR is present.
    :return: the same ``sample`` dict, mutated.
    """
    amr = sample.get('amr', None)
    if amr:
        token_ids, extras = tokenizer.linearize(amr)
        sample['graph_tokens'] = extras['linearized_graphs']
        sample['graph_token_ids'] = token_ids
        text = amr.metadata[text_key]
    else:
        text = sample['text']
    if remove_space:
        # e.g. for scripts tokenized without spaces
        text = ''.join(text.split())
    sample['text'] = text
    sample['text_token_ids'] = tokenizer.encode(text)
    return sample
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: hankcs 19 | from alnlp.modules.pytorch_seq2seq_wrapper import LstmSeq2SeqEncoder 20 | from torch import nn 21 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 22 | 23 | from elit.common.structure import ConfigTracker 24 | 25 | 26 | class _LSTMSeq2Seq(nn.Module): 27 | def __init__( 28 | self, 29 | input_size: int, 30 | hidden_size: int, 31 | num_layers: int = 1, 32 | bias: bool = True, 33 | dropout: float = 0.0, 34 | bidirectional: bool = False, 35 | ): 36 | """ 37 | Under construction, not ready for production 38 | :param input_size: 39 | :param hidden_size: 40 | :param num_layers: 41 | :param bias: 42 | :param dropout: 43 | :param bidirectional: 44 | """ 45 | self.rnn = nn.LSTM( 46 | input_size=input_size, 47 | hidden_size=hidden_size, 48 | num_layers=num_layers, 49 | bias=bias, 50 | batch_first=True, 51 | dropout=dropout, 52 | bidirectional=bidirectional, 53 | ) 54 | 55 | def forward(self, embed, lens, max_len): 56 | x = pack_padded_sequence(embed, lens, True, False) 57 | x, _ = self.rnn(x) 58 | x, _ = pad_packed_sequence(x, True, total_length=max_len) 59 | return x 60 | 61 | 62 | # We might update this to support yaml based configuration 63 | class LSTMContextualEncoder(LstmSeq2SeqEncoder, ConfigTracker): 64 | 65 | def __init__(self, input_size: int, hidden_size: int, num_layers: int = 1, bias: bool = True, dropout: float = 0.0, 66 | bidirectional: bool = False, stateful: bool = False): 67 | super().__init__(input_size, hidden_size, num_layers, bias, dropout, bidirectional, stateful) 68 | ConfigTracker.__init__(self, locals()) 69 | -------------------------------------------------------------------------------- /elit/metrics/parsing/attachmentscore.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Yu Zhang 4 | # 5 | # Permission 
is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
from elit.metrics.metric import Metric


class AttachmentScore(Metric):
    """UAS/LAS accumulator for dependency parsing."""

    def __init__(self, eps=1e-12):
        super(AttachmentScore, self).__init__()

        self.eps = eps  # guards the divisions against an empty accumulator
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    def __repr__(self):
        return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"

    # noinspection PyMethodOverriding
    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch; only positions where ``mask`` is True count.

        A relation counts as correct only when its arc is also correct.
        """
        arc_hits = arc_preds.eq(arc_golds)[mask]
        rel_hits = rel_preds.eq(rel_golds)[mask] & arc_hits

        self.total += len(arc_hits)
        self.correct_arcs += arc_hits.sum().item()
        self.correct_rels += rel_hits.sum().item()

    # Rich comparisons compare the LAS score against a plain number, so a
    # metric can be ranked directly against thresholds or other scores.
    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        return self.las

    @property
    def uas(self):
        """Unlabeled attachment score."""
        return self.correct_arcs / (self.total + self.eps)

    @property
    def las(self):
        """Labeled attachment score."""
        return self.correct_rels / (self.total + self.eps)

    def reset(self):
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0
One can list the pre-trained MTL components via the following code snippets. 8 | 9 | ```python 10 | import elit 11 | print(elit.pretrained.mtl.ALL) 12 | ``` 13 | 14 | Output: 15 | 16 | ```python 17 | { 18 | 'LEM_POS_NER_DEP_SDP_CON_AMR_ELECTRA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_pos_ner_srl_dep_con_amr_electra_base_20201222.zip', 19 | 'LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_pos_ner_srl_dep_con_amr_roberta_base_20210402_152521.zip', 20 | 'LEM_POS_NER_DEP_ROBERTA_BASE_EN': 'https://elit-models.s3-us-west-2.amazonaws.com/v2/en_lem_pos_ner_ddr_roberta_base_20210325_121606.zip' 21 | } 22 | ``` 23 | 24 | Each of them is a pair of an identifier and the URL to it. The identifier indicates the task it supports and the backbone transformer it was trained on. If you are only interested in POS, it's recommended to use `LEM_POS_NER_DEP_ROBERTA_BASE_EN` as it involves less tasks so its size is smaller. 25 | 26 | Once you've decided which model to use, you can pass its identifier to `elit.load`. This method will download the model and load it to the least occupied GPU if any. E.g., 27 | 28 | ```python 29 | nlp = elit.load('LEM_POS_NER_DEP_ROBERTA_BASE_EN') 30 | print(nlp('I banked 1 dollar in a bank .'.split(), tasks='pos')) 31 | ``` 32 | 33 | Output: 34 | 35 | ```python 36 | {'pos': ['PRP', 'VBD', 'CD', 'NN', 'IN', 'DT', 'NN', '.'], 37 | 'tok': ['I', 'banked', '1', 'dollar', 'in', 'a', 'bank', '.']} 38 | ``` 39 | 40 | The output is a dict mapping the task names to corresponding annotations. It always contains the tokens so that each tag can be easily aligned with its actual token. 41 | 42 | ### Batched Prediction 43 | 44 | Note that batched prediction is usually faster than sequential prediction even on CPUs. So it's always recommended to gather the input sentences into one list and make only 1 function call. 
E.g., 45 | 46 | ```python 47 | nlp([["The", "first", "sentence", "."], 48 | ["The", "second", "sentence", "."]]) 49 | ``` 50 | 51 | is faster than call `nlp` twice separately: 52 | 53 | ```python 54 | nlp(["The", "first", "sentence", "."]) 55 | nlp(["The", "second", "sentence", "."]) 56 | ``` 57 | 58 | ## Annotation Guideline 59 | 60 | Please refer to [Part-of-Speech Tagging Guidelines for the Penn Treebank](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1603&context=cis_reports). 61 | -------------------------------------------------------------------------------- /elit/components/amr/amr_parser/data.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Deng Cai 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
DUM, NIL, END = '[unused0]', '', '[unused1]'
REL = 'rel='


def list_to_tensor(xs, vocab: Vocab = None, local_vocabs=None, unk_rate=0.):
    """Convert ragged token lists into a padded, (seq, batch)-transposed array.

    :param xs: batch of token sequences (items may themselves be lists).
    :param vocab: maps tokens to ids; when ``None`` items pass through as-is
        and 0 is used for padding.
    :param local_vocabs: optional per-sample vocabs consulted before ``vocab``.
    :param unk_rate: probability of replacing a token id with UNK
        (training-time noise; only applies when ``vocab`` is given).
    """
    pad = vocab.pad_idx if vocab else 0

    def index_of(w, i):
        if vocab is None:
            return w
        if isinstance(w, list):
            return [index_of(t, i) for t in w]
        if random.random() < unk_rate:
            return vocab.unk_idx
        if local_vocabs is not None:
            local_vocab = local_vocabs[i]
            if (local_vocab is not None) and (w in local_vocab):
                return local_vocab[w]
        return vocab.get_idx(w)

    max_len = max(len(x) for x in xs)
    padded = [index_of(x, i) + [pad] * (max_len - len(x)) for i, x in enumerate(xs)]
    return np.transpose(np.array(padded))


def lists_of_string_to_tensor(xs, vocab: Vocab, max_string_len=20):
    """Encode a batch of token sequences at the character level.

    Each token is truncated to ``max_string_len`` characters, wrapped with the
    DUM/END markers, and right-padded; the result is transposed to
    (seq, batch, char).
    """
    max_len = max(len(x) for x in xs)
    rows = []
    for x in xs:
        padded_tokens = x + [PAD] * (max_len - len(x))
        encoded = []
        for token in padded_tokens:
            chars = list(token[:max_string_len])
            encoded.append(vocab([DUM] + chars + [END]) + [vocab.pad_idx] * (max_string_len - len(chars)))
        rows.append(encoded)

    return np.transpose(np.array(rows), (1, 0, 2))
class ServiceParser(object):
    """Routes tokenized server inputs to a joint model, batching requests
    that share the same task set."""

    def __init__(self,
                 service_tokenizer: ServiceTokenizer,
                 model: Callable[[List[List[str]], List[str]], Document]) -> None:
        super().__init__()
        # Fall back to the default English eos/tokenize pipeline when absent.
        self.service_tokenizer = ServiceTokenizer(eos, tokenize) if service_tokenizer is None else service_tokenizer
        self.model = model

    def parse_sents(self, sents: List[List[str]], tasks: List[str] = None) -> Document:
        """Run the underlying model over pre-tokenized sentences."""
        return self.model(sents, tasks=tasks)

    def parse(self, inputs: List[Input]) -> List[Document]:
        """Tokenize missing inputs, batch per task set, and redistribute the
        model's annotations back onto one ``Document`` per input."""
        self.service_tokenizer.tokenize_inputs(inputs)  # no effects (read-only) in server pipeline

        # Group inputs by their (sorted) requested models so each distinct
        # task set costs a single model call.
        inputs_by_tasks = defaultdict(list)
        for i, input in enumerate(inputs):
            inputs_by_tasks[tuple(sorted(input.models))].append(i)

        results = [Document() for _ in inputs]
        for tasks, input_ids in inputs_by_tasks.items():
            group_inputs = [inputs[i] for i in input_ids]
            group_tokens = sum([input.tokens for input in group_inputs], [])
            annotations = self.parse_sents(group_tokens, tasks)
            for k, v in annotations.items():
                # Rotate each tuple one position to fit the ELIT standard order.
                if k == 'ner':
                    for j, s in enumerate(v):
                        v[j] = [x[1:] + x[:1] for x in s]
                elif k == 'srl':
                    for _v in v:
                        for j, s in enumerate(_v):
                            _v[j] = [x[1:] + x[:1] for x in s]
            for i, input in zip(input_ids, group_inputs):
                for k, v in annotations.items():
                    # Take this input's share off the front of the batched output.
                    results[i][k] = v[:len(input.tokens)]
                    if k == 'ner':
                        if not input.verbose:
                            for j, s in enumerate(results[i][k]):
                                results[i][k][j] = [x[:-1] for x in s]
                    elif k == 'srl':
                        if not input.verbose:
                            # NOTE(review): this trims `v` (the remaining batched
                            # annotations) rather than results[i][k], unlike the
                            # ner branch — confirm this asymmetry is intended.
                            for _v in v:
                                for j, s in enumerate(_v):
                                    _v[j] = [x[:-1] for x in s]
                    del v[:len(input.tokens)]

        return results
# ========================================================================
# Copyright 2020 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================

# -*- coding:utf-8 -*-
# Author: hankcs

from os.path import abspath, join, dirname

from setuptools import find_packages, setup

this_dir = abspath(dirname(__file__))

# The long description shown on PyPI comes straight from the README.
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()

# Single-source the version from elit/version.py without importing the package.
version = {}
with open(join(this_dir, "elit", "version.py")) as fp:
    exec(fp.read(), version)

setup(
    name='elit',
    version=version['__version__'],
    description='The Emory Language and Information Toolkit (ELIT)',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/emorynlp/elit',
    author='Han He, Liyan Xu',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        "Development Status :: 3 - Alpha",
        'Operating System :: OS Independent',
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        "Topic :: Text Processing :: Linguistic"
    ],
    keywords='corpus,machine-learning,NLU,NLP',
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    install_requires=[
        'torch>=1.6.0',
        'sentencepiece>=0.1.95',
        'termcolor==1.1.0',
        'phrasetree==0.0.8',
        'pynvml==8.0.4',
        'alnlp==1.0.0rc27',
        'penman==1.2.1',
        'toposort==1.5',
        'unofficial_stog==0.0.21',
        'uvicorn==0.13.1',
        'fastapi==0.65.2',
        'transformers>=4.6.1',
        'editdistance==0.5.3',
        'scipy>=1.5.4'
    ],
    python_requires='>=3.7',
    entry_points={
        'console_scripts': [
            'elit=elit.main:main',
        ],
    },
)
class TSVTaggingDataset(TransformDataset):
    """Tagging dataset over TSV files of (token, tag) rows, optionally
    splitting sentences longer than ``max_seq_len`` into aligned pieces."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 generate_idx=None,
                 delimiter=None, max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
                 ) -> None:
        # Assign our fields first: the parent constructor may start loading.
        self.char_level = char_level
        self.hard_constraint = hard_constraint
        self.sent_delimiter = sent_delimiter
        self.max_seq_len = max_seq_len
        self.delimiter = delimiter
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield ``{'token': ..., 'tag': ...}`` samples from a TSV file.

        When ``max_seq_len`` is set, overlong sentences are chopped by
        ``split_long_sentence_into`` and the tag sequence is sliced to stay
        aligned with each piece.
        """
        filepath = get_resource(filepath)
        for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
            if not self.max_seq_len:
                yield {'token': words, 'tag': tags}
                continue
            start = 0
            for piece in split_long_sentence_into(words, self.max_seq_len, self.sent_delimiter,
                                                  char_level=self.char_level,
                                                  hard_constraint=self.hard_constraint):
                end = start + len(piece)
                yield {'token': piece, 'tag': tags[start:end]}
                start = end


def main():
    # NOTE(review): CONLL03_EN_DEV is not defined or imported in this module's
    # visible imports — running this demo raises NameError. Confirm where the
    # constant lives and import it before use.
    dataset = TSVTaggingDataset(CONLL03_EN_DEV)
    print(dataset[3])


if __name__ == '__main__':
    main()
class ServiceTokenizer:
    """Wraps optional sentence-split (``eos``) and tokenize callables and
    applies them lazily, in place, to server ``Input`` objects."""

    def __init__(self,
                 eos: Callable[[List[str]], List[List[str]]] = None,
                 tokenizer: Callable[[List[str]], List[List[str]]] = None) -> None:
        super().__init__()
        self.eos = eos
        self.tokenizer = tokenizer

    def split_sents(self, text: List[str]) -> List[List[str]]:
        """Split each raw document into a list of sentences."""
        assert self.eos, 'eos is required to perform sentence split'
        return self.eos(text)

    def tokenize(self, docs: List[List[str]]) -> List[List[List[str]]]:
        """Tokenize every sentence of every doc in one flat batched call,
        then regroup the results per document."""
        assert self.tokenizer, 'tokenizer is required to perform tokenization'
        tokens = self.tokenizer(sum(docs, []))
        results = []
        for doc in docs:
            # take this doc's share off the front of the flat token list
            results.append(tokens[:len(doc)])
            del tokens[:len(doc)]
        return results

    def tokenize_inputs(self, inputs: Union[Input, List[Input]]) -> Union[Input, List[Input]]:
        """Fill in missing sentence splits and tokens on the given inputs.

        Accepts a single ``Input`` or a list; returns the same shape. Inputs
        whose ``text`` is still a raw string get sentence-split first; inputs
        without ``tokens`` then get tokenized.
        """
        single_input = isinstance(inputs, Input)
        if single_input:
            inputs = [inputs]

        # Pass 1: sentence-split raw-string texts.
        pending, pending_ids = [], []
        for i, input in enumerate(inputs):
            if isinstance(input.text, str):
                pending.append(input.text)
                pending_ids.append(i)
        for i, sents in zip(pending_ids, self.split_sents(pending)):
            inputs[i].text = sents

        # Pass 2: tokenize inputs that still lack tokens.
        pending, pending_ids = [], []
        for i, input in enumerate(inputs):
            if not input.tokens:
                pending.append(input.text)
                pending_ids.append(i)
        for i, tokens in zip(pending_ids, self.tokenize(pending)):
            inputs[i].tokens = tokens

        return inputs[0] if single_input else inputs
def unpack_deps_to_head_deprel(sample: dict, pad_rel=None, arc_key='arc', rel_key='rel'):
    """Expand a CoNLL-U ``DEPS`` column into boolean arc and label matrices.

    Row/column 0 corresponds to the inserted pseudo root. Cells without a
    relation are filled with ``pad_rel``, defaulting to the first relation
    seen in the sentence (or ``PAD`` if the sentence has none).
    """
    if 'DEPS' in sample:
        deps = ['_'] + sample['DEPS']
        n = len(deps)
        sample[arc_key] = arcs = []
        sample[rel_key] = rels = []
        for cell in deps:
            arc_row = [False] * n
            rel_row = [None] * n
            if cell != '_':
                for pair in cell.split('|'):
                    head, rel = pair.split(':')
                    head = int(head)
                    arc_row[head] = True
                    rel_row[head] = rel
                    if not pad_rel:
                        pad_rel = rel
            arcs.append(arc_row)
            rels.append(rel_row)
        if not pad_rel:
            pad_rel = PAD
        for i in range(len(rels)):
            rels[i] = [r if r else pad_rel for r in rels[i]]
    return sample


def append_bos_to_form_pos(sample, pos_key='CPOS'):
    """Prepend the pseudo ROOT token (and POS) so indices align with heads."""
    sample['token'] = [ROOT] + sample['FORM']
    if pos_key in sample:
        sample['pos'] = [ROOT] + sample[pos_key]
    return sample


def merge_head_deprel_with_2nd(sample: dict):
    """Fold the primary head/deprel arcs into the 2nd-order arc/rel matrices,
    warning when both annotate the same arc with different labels."""
    if 'arc' in sample:
        arc_2nd = sample['arc_2nd']
        rel_2nd = sample['rel_2nd']
        for i, (arc, rel) in enumerate(zip(sample['arc'], sample['rel'])):
            if i:  # skip the pseudo root row
                if arc_2nd[i][arc] and rel_2nd[i][arc] != rel:
                    sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
                    warnings.warn(f'The main dependency conflicts with 2nd dependency at ID={i}, ' \
                                  'which means joint mode might not be suitable. ' \
                                  f'The sample is\n{sample_str}')
                arc_2nd[i][arc] = True
                rel_2nd[i][arc] = rel
    return sample
class DistillableComponent(TorchComponent, ABC):
    """Base class for components trainable via knowledge distillation.

    Subclasses inherit :meth:`distill`, which loads a pretrained teacher
    component, copies its vocabularies and config into the student, then
    delegates to ``TorchComponent.fit`` with the distillation arguments
    merged in.
    """

    # noinspection PyMethodMayBeStatic,PyTypeChecker
    def build_teacher(self, teacher: str, devices) -> TorchComponent:
        """Load the pretrained teacher identified by ``teacher`` onto ``devices``."""
        return elit.load(teacher, load_kwargs={'devices': devices})

    def distill(self,
                teacher: str,
                trn_data,
                dev_data,
                save_dir,
                batch_size=None,
                epochs=None,
                kd_criterion='kd_ce_loss',
                temperature_scheduler='flsw',
                devices=None,
                logger=None,
                seed=None,
                **kwargs):
        """Train this (student) component under the guidance of ``teacher``.

        Args:
            teacher: Identifier or path of the pretrained teacher to load.
            trn_data: Training set.
            dev_data: Development set.
            save_dir: Directory the student is saved to.
            batch_size: Falls back to the teacher's ``batch_size`` when ``None``.
            epochs: Falls back to the teacher's ``epochs`` when ``None``.
            kd_criterion: Name (or instance) of the knowledge-distillation loss.
            temperature_scheduler: Name (or instance) of the temperature scheduler.
            devices: Devices to run on; defaults to all CUDA devices.
            logger: Optional logger.
            seed: Optional random seed.
            **kwargs: Extra hyper-parameters overriding the teacher's config.
        """
        devices = devices or cuda_devices()
        # Resolve string shorthands into concrete loss/scheduler objects.
        if isinstance(kd_criterion, str):
            kd_criterion = KnowledgeDistillationLoss(kd_criterion)
        if isinstance(temperature_scheduler, str):
            temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
        teacher = self.build_teacher(teacher, devices=devices)
        # Student shares the teacher's vocabularies so their outputs align.
        self.vocabs = teacher.vocabs
        config = copy(teacher.config)
        # Reuse the teacher's training hyper-parameters when none are given.
        batch_size = batch_size or config.get('batch_size', None)
        epochs = epochs or config.get('epochs', None)
        config.update(kwargs)
        return super().fit(**merge_locals_kwargs(locals(),
                                                 config,
                                                 excludes=('self', 'kwargs', '__class__', 'config')))

    @property
    def _savable_config(self):
        # Persist only the teacher's load path, not the live component object.
        config = super(DistillableComponent, self)._savable_config
        if 'teacher' in config:
            config.teacher = config.teacher.load_path
        return config
class MLP(nn.Module):
    r"""Linear projection followed by a non-linear activation and shared dropout:
    :math:`y = \mathrm{Activation}(x A^T + b)`

    Args:
        n_in (int): Size of each input feature.
        n_out (int): Size of each output feature.
        dropout (float): If non-zero, apply :class:`SharedDropout` with this
            ratio on the output. Default: 0.
        activation (bool): Whether to apply LeakyReLU. Default: True.
    """

    def __init__(self, n_in, n_out, dropout=0, activation=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.linear = nn.Linear(n_in, n_out)
        if activation:
            self.activation = nn.LeakyReLU(negative_slope=0.1)
        else:
            self.activation = nn.Identity()
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def __repr__(self):
        desc = f"n_in={self.n_in}, n_out={self.n_out}"
        if self.dropout.p > 0:
            desc += f", dropout={self.dropout.p}"
        return f"{self.__class__.__name__}({desc})"

    def reset_parameters(self):
        """Orthogonally initialize the weight and zero the bias."""
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        r"""Project ``x`` (last dim ``n_in``) to ``n_out``, activate, and apply dropout.

        Args:
            x (~torch.Tensor): Input with feature size ``n_in``.

        Returns:
            A tensor whose feature size is ``n_out``.
        """
        return self.dropout(self.activation(self.linear(x)))
class AMRIO:
    """Reader for AMR files annotated with ``# ::`` metadata comment lines."""

    def __init__(self):
        pass

    @staticmethod
    def read(file_path):
        """Iterate over AMR instances stored in ``file_path``.

        Yields:
            ``(tokens, lemmas, pos_tags, ner_tags, AMRGraph)`` tuples, one per
            AMR instance.

        NOTE(review): the metadata variables (``tokens``, ``lemmas``, ...) are
        bound only when their ``# ::`` line appears before the graph; an input
        missing any of them would raise ``NameError`` at the first yield —
        presumably the preprocessing pipeline guarantees their presence.
        """
        with open(file_path, encoding='utf-8') as f:
            idx = 0  # instance counter; incremented but not yielded
            for line in f:
                line = line.rstrip()
                if line.startswith('# ::id '):
                    amr_id = line[len('# ::id '):]
                elif line.startswith('# ::snt '):
                    sentence = line[len('# ::snt '):]
                elif line.startswith('# ::tokens '):
                    tokens = json.loads(line[len('# ::tokens '):])
                elif line.startswith('# ::lemmas '):
                    lemmas = json.loads(line[len('# ::lemmas '):])
                    # Keep abstract forms verbatim; lowercase everything else.
                    lemmas = [le if _is_abs_form(le) else le.lower() for le in lemmas]
                elif line.startswith('# ::pos_tags '):
                    pos_tags = json.loads(line[len('# ::pos_tags '):])
                elif line.startswith('# ::ner_tags '):
                    ner_tags = json.loads(line[len('# ::ner_tags '):])
                elif line.startswith('# ::abstract_map '):
                    # The abstract map is the last metadata line; the AMR graph
                    # text follows immediately, so parse and yield here.
                    abstract_map = json.loads(line[len('# ::abstract_map '):])
                    graph_line = AMR.get_amr_line(f)
                    amr = AMR.parse_AMR_line(graph_line)
                    myamr = AMRGraph(amr)
                    assert len(tokens) == len(ner_tags)
                    yield tokens, lemmas, pos_tags, ner_tags, myamr
                    idx += 1


def make_vocab(batch_seq, char_level=False):
    """Count token frequencies over ``batch_seq``.

    Args:
        batch_seq: Iterable of token sequences.
        char_level: When True, additionally count character frequencies
            (each character weighted by its token's count).

    Returns:
        A ``Counter`` of tokens, or ``(token_counter, char_counter)`` when
        ``char_level`` is True.
    """
    cnt = Counter()
    for seq in batch_seq:
        cnt.update(seq)
    if not char_level:
        return cnt
    char_cnt = Counter()
    for x, y in cnt.most_common():
        for ch in list(x):
            char_cnt[ch] += y
    return cnt, char_cnt
# noinspection PyAbstractClass
class CustomizedDecoderLayer(DecoderLayer):
    """BART decoder layer whose cross-attention always returns its weights.

    Mirrors ``transformers.modeling_bart.DecoderLayer.forward`` except that the
    encoder (cross) attention is invoked with ``output_attentions=True`` and its
    weights are returned alongside the self-attention weights.
    """

    def forward(
        self,
        x,
        encoder_hidden_states,
        encoder_attn_mask=None,
        layer_state=None,
        causal_mask=None,
        decoder_padding_mask=None,
        output_attentions=False,
    ):
        residual = x

        if layer_state is None:
            layer_state = {}
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        # Self Attention

        x, self_attn_weights = self.self_attn(
            query=x,
            key=x,
            layer_state=layer_state,  # adds keys to layer state
            key_padding_mask=decoder_padding_mask,
            attn_mask=causal_mask,
            output_attentions=output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # Cross attention
        residual = x
        assert self.encoder_attn.cache_key != self.self_attn.cache_key
        if self.normalize_before:
            x = self.encoder_attn_layer_norm(x)
        x, cross_attn_weights = self.encoder_attn(
            query=x,
            key=encoder_hidden_states,
            key_padding_mask=encoder_attn_mask,
            layer_state=layer_state,  # mutates layer state
            # Always request cross-attention weights — this is the customization.
            output_attentions=True
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.encoder_attn_layer_norm(x)

        # Fully Connected
        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return (
            x,
            (self_attn_weights, cross_attn_weights),
            layer_state,
        )  # just self_attn weights for now, following t5, layer_state = cache for decoding
# Official CoNLL-X evaluation script, fetched from the BIST parser repository.
CONLLX_EVAL = get_resource(
    'https://github.com/elikip/bist-parser/archive/master.zip' + '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    Args:
        gold_file(str): The gold conllx file
        pred_file(str): The pred conllx file

    Returns:
        A ``(UAS, LAS)`` tuple of attachment scores.
    """
    gold_file = get_resource(gold_file)
    # Copy gold columns 1-6 into the prediction so that non-parse columns
    # (forms, POS, ...) always agree and eval.pl only scores heads/labels.
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        # Strip CoNLL-U-only lines (comments, multi-word ranges) from the gold
        # file as well so eval.pl can parse it.
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file

    exitcode, out, err = get_exitcode_stdout_stderr(f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.')
    # The last four lines of eval.pl's output carry the correct/total counts
    # for labeled (first) and unlabeled (second) attachment.
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las


def copy_cols(gold_file, pred_file, copied_pred_file, keep_comments=True):
    """Copy the first 6 columns from gold file to pred file

    Args:
        gold_file: Path of the gold CoNLL file.
        pred_file: Path of the predicted CoNLL file.
        copied_pred_file: Output path for the merged file.
        keep_comments: Whether gold comment lines are copied over. (Default value = True)

    Returns:
        None; writes ``copied_pred_file``.
    """
    with open(copied_pred_file, 'w') as to_out, open(pred_file) as pred_file, open(gold_file) as gold_file:
        for idx, (p, g) in enumerate(zip(pred_file, gold_file)):
            # Skip comment lines in the prediction.
            while p.startswith('#'):
                p = next(pred_file)
            if not g.strip():
                # Blank gold line = sentence boundary; prediction must agree.
                if p.strip():
                    raise ValueError(
                        f'Prediction file {pred_file.name} does not end a sentence at line {idx + 1}\n{p.strip()}')
                to_out.write('\n')
                continue
            # Skip gold comments and multi-word-token ranges (IDs like "1-2").
            while g.startswith('#') or '-' in g.split('\t')[0]:
                # NOTE(review): ``g.startswith('-')`` looks suspicious — comments
                # start with '#' and range IDs contain but do not start with '-',
                # so this disjunct is likely never true; confirm whether
                # ``keep_comments`` alone was intended.
                if keep_comments or g.startswith('-'):
                    to_out.write(g)
                g = next(gold_file)
            to_out.write('\t'.join(str(x) for x in g.split('\t')[:6] + p.split('\t')[6:]))
def move_to_device(maybe_tensor, device):
    """Recursively move tensors (and numpy arrays) inside ``maybe_tensor`` to ``device``.

    Dicts, lists and tuples are rebuilt with their contents moved; numpy arrays
    are converted to contiguous tensors; anything else is returned unchanged.
    """
    if torch.is_tensor(maybe_tensor):
        return maybe_tensor.to(device)
    if isinstance(maybe_tensor, np.ndarray):
        return torch.from_numpy(maybe_tensor).to(device).contiguous()
    if isinstance(maybe_tensor, dict):
        return {key: move_to_device(value, device) for key, value in maybe_tensor.items()}
    if isinstance(maybe_tensor, list):
        return [move_to_device(item, device) for item in maybe_tensor]
    if isinstance(maybe_tensor, tuple):
        return tuple(move_to_device(item, device) for item in maybe_tensor)
    return maybe_tensor


def compute_f_by_tensor(input, target, mask):
    """Compute binary precision/recall/F1 over positions where ``mask != 1``.

    Returns:
        ``(P, R, F)`` floats; all zeros when there is no true positive.
    """
    flat_pred = input.view(-1).tolist()
    flat_gold = target.view(-1).tolist()
    flat_mask = mask.view(-1).tolist()
    tp = fp = tn = fn = 0.
    for pred, gold, masked in zip(flat_pred, flat_gold, flat_mask):
        if masked == 1:
            continue  # masked positions are excluded from the statistics
        if pred == 1:
            if gold == 1:
                tp += 1
            else:
                fp += 1
        elif gold == 1:
            fn += 1
        else:
            tn += 1
    if tp == 0:
        return 0., 0., 0.

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return precision, recall, 2 * precision * recall / (precision + recall)


def gelu_fast(x):
    """Tanh approximation of GELU; caches sqrt(2/pi) on the function object."""
    if not hasattr(gelu_fast, "_a"):
        gelu_fast._a = math.sqrt(2 / math.pi)
    return 0.5 * x * (1 + torch.tanh(gelu_fast._a * (x + 0.044715 * torch.pow(x, 3))))


def gelu(x: torch.Tensor) -> torch.Tensor:
    """Exact GELU via the Gaussian error function."""
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def label_smoothed_nll_loss(log_probs, target, eps):
    """Negative log-likelihood with label smoothing.

    Args:
        log_probs: ``N x C`` log-probabilities.
        target: ``N`` gold class indices.
        eps: Smoothing mass; 0 yields plain NLL.

    Returns:
        Per-example loss tensor of shape ``N``.
    """
    nll_loss = -log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    if eps == 0.:
        return nll_loss
    # Distribute eps uniformly over all classes.
    smooth_loss = -log_probs.sum(dim=-1)
    eps_i = eps / log_probs.size(-1)
    return (1. - eps) * nll_loss + eps_i * smooth_loss
service_tokenizer=service_tokenizer, 43 | models=elit.load(DOC_COREF_SPANBERT_LARGE_EN) 44 | ) 45 | # service_doc_coref = ServiceCoreference( 46 | # service_tokenizer=service_tokenizer, 47 | # models=[elit.load(DOC_COREF_SPANBERT_LARGE_EN, devices=0), 48 | # elit.load(DOC_COREF_SPANBERT_LARGE_EN, devices=1)] 49 | # ) 50 | 51 | service_online_coref = ServiceCoreference( 52 | service_tokenizer=service_tokenizer, 53 | models=elit.load(ONLINE_COREF_SPANBERT_LARGE_EN) 54 | ) 55 | 56 | en_services = BundledServices( 57 | tokenizer=service_tokenizer, 58 | parser=service_parser, 59 | doc_coref=service_doc_coref, 60 | online_coref=service_online_coref 61 | ) 62 | 63 | 64 | def main(): 65 | text = [ 66 | "Emory NLP is a research lab in Atlanta, GA. " 67 | "It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 68 | ] 69 | input = Input(text=text) 70 | input.models = ['lem'] 71 | docs = en_services.parser.parse([input]) 72 | for doc in docs: 73 | print(doc) 74 | 75 | # See elit.client for coreference examples 76 | text = 'Pfizer said last week it may need the U.S. government to help it secure some components needed to ' \ 77 | 'make the vaccine. While the company halved its 2020 production target due to manufacturing issues, ' \ 78 | 'it said last week its manufacturing is running smoothly now. The government also has the option to ' \ 79 | 'acquire up to an additional 400 million doses of the vaccine.' 
80 | input_doc = Input(text=text, models=['dcr']) 81 | doc = service_doc_coref.predict(input_doc) 82 | print(doc) 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /docs/abstract_meaning_parsing.md: -------------------------------------------------------------------------------- 1 | # Abstract Meaning Parsing 2 | 3 | Abstract Meaning Representation parsing is the task of parsing a sentence to a representation of the abstract meaning in a sentence as a rooted, directed, acyclic graph. 4 | 5 | ELIT provides two APIs for parsing a raw sentence into an AMR graph. 6 | 7 | ## State-of-The-Art AMR Parser 8 | 9 | We recommend to use the SOTA AMR parser with `83.3` Smatch score trained on the AMR 3.0 dataset: 10 | 11 | ```python 12 | import elit 13 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 14 | amr = parser('The boy wants the girl to believe him.') 15 | print(amr) 16 | ``` 17 | 18 | Outputs: 19 | 20 | ```text 21 | (z0 / want-01 22 | :ARG0 (z1 / boy) 23 | :ARG1 (z2 / believe-01 24 | :ARG0 (z3 / girl) 25 | :ARG1 z1)) 26 | ``` 27 | 28 | Feel free to parse multiple sentences in one batch to benefit from parallelization. 29 | 30 | ### Evaluation 31 | 32 | Its detailed performance is listed as follows: 33 | 34 | ```bash 35 | $ cat ~/.elit/amr3_bart_large_elit_20211030_173300/test.log 36 | {Smatch P: 84.00% R: 82.60% F1: 83.30%}{Unlabeled P: 86.40% R: 84.90% F1: 85.70%}{No WSD P: 84.50% R: 83.10% F1: 83.80%}{Non_sense_frames P: 91.90% R: 91.30% F1: 91.60%}{Wikification P: 81.70% R: 80.80% F1: 81.20%}{Named Ent. 
P: 89.20% R: 87.00% F1: 88.10%}{Negations P: 71.70% R: 70.90% F1: 71.30%}{IgnoreVars P: 73.80% R: 73.10% F1: 73.50%}{Concepts P: 90.70% R: 89.60% F1: 90.10%}{Frames P: 88.50% R: 87.90% F1: 88.20%}{Reentrancies P: 70.40% R: 71.80% F1: 71.10%}{SRL P: 79.00% R: 79.60% F1: 79.30%} 37 | ``` 38 | 39 | To re-produce this evaluation, run the followling code: 40 | 41 | ```python 42 | import elit 43 | parser = elit.load(elit.pretrained.amr.AMR3_BART_LARGE_EN) 44 | parser.evaluate('data/amr/amr_3.0/test.txt', '/tmp') 45 | ``` 46 | 47 | which will generate the Smatch score `/tmp/test.log` **before** wikification: 48 | 49 | ``` 50 | 81/81 {Smatch P: 83.70% R: 82.40% F1: 83.00%}{Unlabeled P: 86.10% R: 84.60% F1: 85.40%}{No WSD P: 84.20% R: 82.90% F1: 83.50%}{Non_sense_frames P: 91.90% R: 91.30% F1: 91.60%}{Wikification P: 75.00% R: 74.20% F1: 74.60%}{Named Ent. P: 89.20% R: 87.00% F1: 88.10%}{Negations P: 71.70% R: 70.90% F1: 71.30%}{IgnoreVars P: 73.50% R: 72.80% F1: 73.10%}{Concepts P: 90.70% R: 89.60% F1: 90.10%}{Frames P: 88.50% R: 87.90% F1: 88.20%}{Reentrancies P: 70.40% R: 71.80% F1: 71.10%}{SRL P: 79.00% R: 79.60% F1: 79.30%} ET: 8 m 1 s 51 | ``` 52 | 53 | The predictions are saved in `/tmp/test.pred.txt`. To perform wikification, refer to [BLINK](https://github.com/SapienzaNLP/spring/blob/8fad6a4ce59132b22d6bdb4d4eb3a9aa5223ead6/bin/blinkify.py) which will give you the extra `0.3` F1. 54 | 55 | ## AMR Decoder in MTL 56 | 57 | Users can also load a MTL component and specify `tasks=['lem', 'amr'])` to parse a sentence into an AMR graph. 
E.g., 58 | 59 | ```python 60 | import elit 61 | nlp = elit.load('LEM_POS_NER_DEP_SDP_CON_AMR_ROBERTA_BASE_EN') 62 | print(nlp(['Emory','NLP','is','in','Atlanta'], tasks=['lem', 'amr'])) 63 | ``` 64 | 65 | Outputs: 66 | 67 | ```python 68 | { 69 | "tok": [ 70 | ["Emory", "NLP", "is", "in", "Atlanta"] 71 | ], 72 | "lem": [ 73 | ["emory", "nlp", "be", "in", "atlanta"] 74 | ], 75 | "amr": [ 76 | [["c0", "ARG1", "c1"], ["c0", "ARG2", "c2"], ["c0", "instance", "be-located-at-91"], ["c1", "instance", "emory nlp"], ["c2", "instance", "atlanta"]] 77 | ], 78 | } 79 | ``` 80 | 81 | `amr` stores the logical triples of Abstract Meaning Representation in the format of `(source, relation, target)`. Note that the `Document` class will convert it to Penman format when being accessed through code. Thus, you will get Penman `Graph`s by accessing `doc['amr']`. 82 | 83 | -------------------------------------------------------------------------------- /elit/components/ner/rnn_ner.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class RNNNamedEntityRecognizer(RNNTagger):
    """RNN sequence tagger specialized for named entity recognition.

    Extends ``RNNTagger`` with span-level F1 evaluation and decoding of tag
    sequences into entity spans.
    """

    def build_metric(self, **kwargs):
        # Span-level F1 in the tagging scheme of the training data.
        return SpanF1(self.tagging_scheme)

    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, **kwargs):
        loss, metric = super().evaluate_dataloader(data, criterion, logger, ratio_width, **kwargs)
        if logger:
            # Log the detailed span-F1 report produced by the metric.
            logger.info(metric.result(True, False)[-1])
        return loss, metric

    def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
            drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='token', tagging_scheme=None,
            anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
            token_delimiter=None,
            logger=None,
            verbose=True, **kwargs):
        """Train the recognizer; hyper-parameters are forwarded to ``RNNTagger.fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        # Decode logits into tag ids, map them to surface tags and update the metric.
        logits = self.decode_output(logits, mask, batch)
        if isinstance(logits, torch.Tensor):
            logits = logits.tolist()
        metric(self._id_to_tags(logits), batch['tag'])

    def predict(self, tokens: Any, batch_size: int = None, **kwargs):
        return super().predict(tokens, batch_size, **kwargs)

    def predict_data(self, data, batch_size, **kwargs):
        """Predict tag sequences and convert them to ``(text, type, start, end)`` entities.

        ``end`` is exclusive; ``text`` joins the covered tokens with the
        configured ``token_delimiter``.
        """
        outputs = super().predict_data(data, batch_size)
        tagging_scheme = self.tagging_scheme
        # Dispatch on the tagging scheme to recover spans from tag sequences.
        if tagging_scheme == 'IOBES':
            entities = [span_utils.iobes_tags_to_spans(y) for y in outputs]
        elif tagging_scheme == 'BIO':
            entities = [span_utils.bio_tags_to_spans(y) for y in outputs]
        elif tagging_scheme == 'BIOUL':
            entities = [span_utils.bioul_tags_to_spans(y) for y in outputs]
        else:
            raise ValueError(f'Unrecognized tag scheme {tagging_scheme}')
        for i, (tokens, es) in enumerate(zip(data, entities)):
            outputs[i] = [(self.config.token_delimiter.join(tokens[b:e + 1]), t, b, e + 1) for t, (b, e) in es]
        return outputs

    def save_config(self, save_dir, filename='config.json'):
        # Infer a token delimiter before saving: empty for character-level
        # vocabularies (recent vocab entries are all single characters),
        # otherwise a space.
        if self.config.token_delimiter is None:
            self.config.token_delimiter = '' if all(
                [len(x) == 1 for x in self.vocabs[self.config.token_key].idx_to_token[-100:]]) else ' '
        super().save_config(save_dir, filename)
class SpanMetric(Metric):
    """Micro precision/recall/F1 over labeled and unlabeled spans.

    ``preds``/``golds`` are parallel lists of span collections where each span
    is an ``(i, j, label)`` triple. "Unlabeled" statistics ignore the label;
    the headline score is labeled F1. Also tracks complete-match rates per
    sentence (UCM/LCM).
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.reset(eps)

    # noinspection PyAttributeOutsideInit
    def reset(self, eps=1e-12):
        """Clear all counters; ``eps`` guards the denominators against zero."""
        self.n = 0.0      # sentences seen
        self.n_ucm = 0.0  # unlabeled complete matches
        self.n_lcm = 0.0  # labeled complete matches
        self.utp = 0.0    # unlabeled true positives
        self.ltp = 0.0    # labeled true positives
        self.pred = 0.0   # predicted spans
        self.gold = 0.0   # gold spans
        self.eps = eps

    def __call__(self, preds, golds):
        """Accumulate statistics over a batch of (pred, gold) span lists."""
        for hyp, ref in zip(preds, golds):
            # Multiset intersection yields the matched spans, with multiplicity.
            hyp_unlabeled = Counter((i, j) for i, j, label in hyp)
            ref_unlabeled = Counter((i, j) for i, j, label in ref)
            matched_unlabeled = list((hyp_unlabeled & ref_unlabeled).elements())
            matched_labeled = list((Counter(hyp) & Counter(ref)).elements())
            self.n += 1
            # Complete match: every span correct and none missing (bool adds as 0/1).
            self.n_ucm += len(matched_unlabeled) == len(hyp) == len(ref)
            self.n_lcm += len(matched_labeled) == len(hyp) == len(ref)
            self.utp += len(matched_unlabeled)
            self.ltp += len(matched_labeled)
            self.pred += len(hyp)
            self.gold += len(ref)
        return self

    def __repr__(self):
        return (f"UCM: {self.ucm:.2%} LCM: {self.lcm:.2%} "
                f"UP: {self.up:.2%} UR: {self.ur:.2%} UF: {self.uf:.2%} "
                f"LP: {self.lp:.2%} LR: {self.lr:.2%} LF: {self.lf:.2%}")

    @property
    def score(self):
        """Labeled F1, the headline score."""
        return self.lf

    @property
    def ucm(self):
        return self.n_ucm / (self.n + self.eps)

    @property
    def lcm(self):
        return self.n_lcm / (self.n + self.eps)

    @property
    def up(self):
        return self.utp / (self.pred + self.eps)

    @property
    def ur(self):
        return self.utp / (self.gold + self.eps)

    @property
    def uf(self):
        return 2 * self.utp / (self.pred + self.gold + self.eps)

    @property
    def lp(self):
        return self.ltp / (self.pred + self.eps)

    @property
    def lr(self):
        return self.ltp / (self.gold + self.eps)

    @property
    def lf(self):
        return 2 * self.ltp / (self.pred + self.gold + self.eps)
The government also has the option to ' \ 35 | 'acquire up to an additional 400 million doses of the vaccine.' 36 | return text 37 | 38 | def test_doc_coref_sequential(self): 39 | 40 | batch_size = 32 41 | inputs = [Input(text=self.get_sample_text(), models=['dcr'])] * batch_size 42 | 43 | start_time = time.time() 44 | docs = en_services.doc_coref.predict_sequentially(inputs) 45 | end_time = time.time() 46 | print(f'Sequential doc coref time elapse for {batch_size} small documents: {end_time - start_time :.2f}s') 47 | 48 | assert len(docs) == len(inputs) 49 | print(docs[0]) 50 | print(docs[-1]) 51 | 52 | def test_doc_coref_concurrent(self): 53 | 54 | batch_size = 32 55 | inputs = [Input(text=self.get_sample_text(), models=['dcr'])] * batch_size 56 | 57 | start_time = time.time() 58 | docs = en_services.doc_coref.predict(inputs) 59 | end_time = time.time() 60 | print(f'Concurrent doc coref time elapse for {batch_size} small documents: {end_time - start_time :.2f}s') 61 | 62 | assert len(docs) == len(inputs) 63 | print(docs[0]) 64 | print(docs[-1]) 65 | 66 | def test_doc_coref_text(self): 67 | text = "Emory NLP is a research lab in Atlanta, GA. " 68 | "It is founded by Jinho D. Choi in 2014. Dr. Choi is a professor at Emory University." 69 | input_doc = Input(text=text, models=['dcr']) 70 | print(en_services.doc_coref.predict(input_doc)) 71 | 72 | def test_doc_coref_sents(self): 73 | text = ["Emory NLP is a research lab in Atlanta, GA.", 74 | "It is founded by Jinho D. Choi in 2014.", 75 | 'Dr. 
Choi is a professor at Emory University.'] 76 | input_doc = Input(text=text, models=['dcr']) 77 | print(en_services.doc_coref.predict(input_doc)) 78 | 79 | def test_doc_coref_tokens(self): 80 | tokens = [ 81 | ["Emory", "NLP", "is", "a", "research", "lab", "in", "Atlanta", ",", "GA", "."], 82 | ["It", "is", "founded", "by", "Jinho", "D.", "Choi", "in", "2014", ".", "Dr.", "Choi", "is", "a", 83 | "professor", "at", "Emory", "University", "."] 84 | ] 85 | input_doc = Input(tokens=tokens, models=['dcr']) 86 | print(en_services.doc_coref.predict(input_doc)) 87 | 88 | 89 | if __name__ == '__main__': 90 | unittest.main() 91 | -------------------------------------------------------------------------------- /elit/layers/embeddings/util.py: -------------------------------------------------------------------------------- 1 | # ======================================================================== 2 | # Copyright 2020 Emory University 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """Build an embedding matrix for ``vocab`` from a pretrained word2vec file.

    Args:
        filepath: Path to the pretrained word2vec file.
        vocab: Target vocabulary; its indices determine the row order of the
            returned matrix.
        extend_vocab: If True, add every pretrained word into ``vocab``.
        unk: Name of the unknown token inside the pretrained vocabulary; it is
            renamed to ``vocab.safe_unk_token`` so the lookups below find it.
        lowercase: Lowercase pretrained words when extending ``vocab``.
        init: Initialization for words missing from the pretrained file:
            ``'uniform'``, ``'zeros'``, or a falsy value (leave them zeros).
        normalize: ``'norm'`` for L2 row normalization, ``'std'`` to divide by
            the global standard deviation, None for no normalization.

    Returns:
        A ``[len(vocab), dim]`` tensor whose i-th row embeds ``vocab``'s i-th token.

    Raises:
        ValueError: If ``init`` is not one of the supported strategies.
    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        # Rename the pretrained UNK entry to this vocab's UNK token so the
        # vocab's UNK gets a pretrained vector instead of a random one.
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
        vocab.lock()
    ids = []

    # Words absent from the pretrained file get fresh row indices just past
    # the end of the pretrained matrix; the rows themselves are appended
    # below. NOTE(review): this assumes pret_matrix has exactly
    # len(pret_vocab) rows aligned with pret_vocab's order — confirm in
    # load_word2vec_as_vocab_tensor.
    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        # Rows for out-of-pretrained-vocab words: zeros, optionally re-initialized.
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    # Reorder pretrained rows into vocab index order.
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
def build_word2vec_with_vocab(embed: Union[str, int],
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              trainable=False,
                              init='zeros',
                              normalize=None) -> nn.Embedding:
    """Create an ``nn.Embedding`` either from a pretrained file or from scratch.

    Args:
        embed: A word2vec file path (load pretrained vectors) or an int
            (embedding size for a randomly initialized table).
        vocab: The vocabulary the embedding is indexed by.
        extend_vocab: Add pretrained words into ``vocab`` (file case only).
        unk: Name of the unknown token in the pretrained vocabulary.
        lowercase: Lowercase pretrained words when extending the vocab.
        trainable: Whether pretrained vectors are fine-tuned.
        init: Initialization for words missing from the pretrained file.
        normalize: Optional normalization scheme for pretrained vectors.

    Returns:
        A ready-to-use embedding layer with ``vocab.pad_idx`` as padding.

    Raises:
        ValueError: If ``embed`` is neither a str nor an int.
    """
    if isinstance(embed, str):
        matrix = index_word2vec_with_vocab(embed, vocab, extend_vocab, unk, lowercase, init, normalize)
        return nn.Embedding.from_pretrained(matrix, freeze=not trainable, padding_idx=vocab.pad_idx)
    if isinstance(embed, int):
        return nn.Embedding(len(vocab), embed, padding_idx=vocab.pad_idx)
    raise ValueError(f'Unsupported parameter type: {embed}')
15 | # ======================================================================== 16 | 17 | # -*- coding:utf-8 -*- 18 | # Author: Liyan Xu 19 | import unittest 20 | import json 21 | 22 | from elit.server.en import en_services 23 | from elit.server.format import Input, OnlineCorefContext 24 | from elit.components.coref.util import flatten 25 | 26 | 27 | class TestOnlineCoref(unittest.TestCase): 28 | 29 | def setUp(self) -> None: 30 | super().setUp() 31 | 32 | @classmethod 33 | def convert_output_to_context(cls, coref_output): 34 | if coref_output is None: 35 | return None 36 | return OnlineCorefContext( 37 | input_ids=coref_output['input_ids'], 38 | sentence_map=coref_output['sentence_map'], 39 | subtoken_map=coref_output['subtoken_map'], 40 | mentions=[tuple(m) for m in coref_output['mentions']], 41 | uttr_start_idx=coref_output['uttr_start_idx'], 42 | speaker_ids=coref_output['speaker_ids'] 43 | ) 44 | 45 | def test_online_coref(self): 46 | # Test text, sents, tokens input 47 | # utterances = [ 48 | # {'speaker_id': 1, 'text': 'I read an article today. It is about US politics.'}, 49 | # {'speaker_id': 2, 'tokens': [['What', 'does', 'it', 'say', 'about', 'US', 'politics', '?']]}, # Tokens 50 | # {'speaker_id': 1, 'text': 'It talks about the US presidential election.'}, 51 | # {'speaker_id': 2, 'text': ['I am interested to hear.', 'Can you elaborate more?']}, # Sents 52 | # {'speaker_id': 1, 'text': 'Sure! 
The presidential election is indeed interesting.'} 53 | # ] 54 | 55 | utterances = [ 56 | {'speaker_id': 1, 'tokens': [['I', 'read', 'an', 'article', 'today', '.']]}, 57 | {'speaker_id': 2, 'tokens': [['Can', 'you', 'tell', 'me', 'what', 'it', 'is', 'about', '?']]} 58 | ] 59 | 60 | context = None 61 | tokens_to_date = [] 62 | for turn, uttr in enumerate(utterances): 63 | if 'tokens' in uttr: 64 | input_doc = Input(tokens=uttr['tokens'], speaker_ids=uttr['speaker_id'], 65 | coref_context=self.convert_output_to_context(context), models=['ocr']) 66 | else: 67 | input_doc = Input(text=uttr['text'], speaker_ids=uttr['speaker_id'], 68 | coref_context=self.convert_output_to_context(context), models=['ocr']) 69 | output_doc = en_services.online_coref.predict_sequentially(input_doc, check_sanitization=True) 70 | print(json.dumps(output_doc)) 71 | context = output_doc['ocr'] 72 | 73 | # Print cluster text 74 | tokens_to_date += flatten(input_doc.tokens) 75 | for cluster in output_doc['ocr']['clusters']: 76 | for i in range(len(cluster)): 77 | m1, m2 = tuple(cluster[i]) 78 | cluster[i] = (m1, m2, ' '.join(tokens_to_date[m1:m2])) 79 | print(f'cluster text: {output_doc["ocr"]["clusters"]}') 80 | print() 81 | 82 | 83 | if __name__ == '__main__': 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /elit/components/parsers/biaffine/biaffine.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Yu Zhang 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the 
class Biaffine(nn.Module):
    r"""
    Biaffine layer for first-order scoring.

    Holds a weight tensor :math:`W` (one matrix per output channel) and scores
    every vector pair :math:`(x, y)` as :math:`x^T W y`. When ``bias_x`` /
    ``bias_y`` are set, a constant 1 is appended to the corresponding input so
    :math:`W` also captures linear and bias terms.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        n_in (int):
            The size of the input feature.
        n_out (int):
            The number of output channels.
        bias_x (bool):
            If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
        bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # bool biases are added to the dims: (n_in + 1) when the bias is on.
        self.weight = nn.Parameter(torch.Tensor(n_out, n_in + bias_x, n_in + bias_y))

        self.reset_parameters()

    def __repr__(self):
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")
        return f"{self.__class__.__name__}({', '.join(parts)})"

    def reset_parameters(self):
        # Zero initialization: every pair starts with score 0.
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        r"""
        Args:
            x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
            y (torch.Tensor): ``[batch_size, seq_len, n_in]``.

        Returns:
            ~torch.Tensor:
                Scores of shape ``[batch_size, n_out, seq_len, seq_len]``;
                the ``n_out`` dimension is squeezed away when ``n_out=1``.
        """
        if self.bias_x:
            x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # squeeze(1) only removes the channel dim when n_out == 1.
        return scores.squeeze(1)
class FastTextTransform(EmbeddingNamedTransform):
    """Sample-level transform that attaches fastText word vectors.

    The fastText model is loaded once in the constructor; calling the
    transform on a sample dict embeds the value of field ``src`` and stores
    the resulting vector(s) under field ``dst``.
    """

    def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
        """
        Args:
            filepath: Path or resource identifier of a fastText binary model.
            src: Key of the field to embed (a token or a list of tokens).
            dst: Key to store vectors under; defaults to ``src + '_fasttext'``.
        """
        if not dst:
            dst = src + '_fasttext'
        self.filepath = filepath
        flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
        filepath = get_resource(filepath)
        # fastText prints loading noise to stdout; redirect it so it does not
        # pollute this process's standard output.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)
        flash('')
        # Probe an arbitrary word to discover the embedding dimensionality.
        output_dim = self._model['king'].size
        super().__init__(output_dim, src, dst)

    def __call__(self, sample: dict) -> dict:
        """Embed ``sample[self.src]`` in place and return the sample."""
        word = sample[self.src]
        if isinstance(word, str):
            vector = self.embed(word)
        else:
            # A sequence of tokens: stack per-token vectors into one tensor.
            vector = torch.stack([self.embed(each) for each in word])
        sample[self.dst] = vector
        return sample

    def embed(self, word: str) -> torch.Tensor:
        """Return the fastText vector of ``word`` as a torch tensor."""
        return torch.tensor(self._model[word])
class PassThroughModule(torch.nn.Module):
    """Module that simply returns ``batch[key]``, ignoring mask and kwargs."""

    def __init__(self, key) -> None:
        super().__init__()
        self.key = key

    def __call__(self, batch: dict, mask=None, **kwargs):
        return batch[self.key]


class FastTextEmbeddingModule(PassThroughModule):
    """Pads pre-computed fastText vectors into one dense batch tensor."""

    def __init__(self, key, embedding_dim: int) -> None:
        """
        Args:
            key: Batch field holding a list of per-sentence vector sequences.
            embedding_dim: Dimensionality of the fastText vectors.
        """
        super().__init__(key)
        self.embedding_dim = embedding_dim

    def __call__(self, batch: dict, mask=None, **kwargs):
        vectors = super().__call__(batch, **kwargs)
        # Pad variable-length sequences with zeros and move the result onto
        # the same device as the mask.
        return pad_sequence(vectors, True, 0).to(mask.device)

    def __repr__(self):
        return f"{self.__class__.__name__}(key={self.key}, embedding_dim={self.embedding_dim})"

    def get_output_dim(self):
        return self.embedding_dim


class FastTextEmbedding(Embedding, AutoConfigurable):
    """Embedding factory pairing a FastTextTransform with its batching module."""

    def __init__(self, src: str, filepath: str) -> None:
        """
        Args:
            src: Name of the token field to embed.
            filepath: Path to the fastText binary model.
        """
        super().__init__()
        self.src = src
        self.filepath = filepath
        self._fasttext = FastTextTransform(self.filepath, self.src)

    def transform(self, **kwargs) -> Optional[Callable]:
        return self._fasttext

    def module(self, **kwargs) -> Optional[nn.Module]:
        return FastTextEmbeddingModule(self._fasttext.dst, self._fasttext.output_dim)
class LabeledF1(Metric):
    """Labeled/unlabeled F1 over dependency arcs.

    The caller supplies a mask (e.g. excluding punctuation); arcs and
    relations are compared only where both gold and predicted arcs exist.
    """

    def __init__(self):
        super().__init__()
        # Running counts accumulated across batches.
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __repr__(self):
        return f"UF: {self.uf:4.2%} LF: {self.lf:4.2%}"

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate counts from one batch of boolean arc/relation tensors."""
        gold_mask = mask & arc_golds
        pred_mask = mask & arc_preds
        overlap = gold_mask & pred_mask

        arc_hits = (arc_preds == arc_golds)[overlap]
        rel_hits = (rel_preds == rel_golds)[overlap] & arc_hits

        self.sum_gold_arcs_wo_punc += float(gold_mask.sum())
        self.sum_pred_arcs_wo_punc += float(pred_mask.sum())
        self.correct_arcs_wo_punc += float(arc_hits.sum())
        self.correct_rels_wo_punc += float(rel_hits.sum())

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        """Primary score for model selection: labeled F1."""
        return self.las

    @property
    def uas(self):
        return self.uf

    @property
    def las(self):
        return self.lf

    @property
    def ur(self):
        """Unlabeled recall."""
        return self.correct_arcs_wo_punc / self.sum_gold_arcs_wo_punc if self.sum_gold_arcs_wo_punc else .0

    @property
    def up(self):
        """Unlabeled precision."""
        return self.correct_arcs_wo_punc / self.sum_pred_arcs_wo_punc if self.sum_pred_arcs_wo_punc else .0

    @property
    def lr(self):
        """Labeled recall."""
        return self.correct_rels_wo_punc / self.sum_gold_arcs_wo_punc if self.sum_gold_arcs_wo_punc else .0

    @property
    def lp(self):
        """Labeled precision."""
        return self.correct_rels_wo_punc / self.sum_pred_arcs_wo_punc if self.sum_pred_arcs_wo_punc else .0

    @property
    def uf(self):
        """Unlabeled F1 (harmonic mean of up and ur)."""
        denom = self.ur + self.up
        return 2 * self.ur * self.up / denom if denom else .0

    @property
    def lf(self):
        """Labeled F1 (harmonic mean of lp and lr)."""
        denom = self.lr + self.lp
        return 2 * self.lr * self.lp / denom if denom else .0

    def reset(self):
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def to_dict(self) -> dict:
        return {'UF': self.uf, 'LF': self.lf}
def flsw_temperature_scheduler_builder(beta, gamma, eps=1e-4, *args):
    """Build the FLSW dynamic temperature scheduler (adapted from arXiv:1911.07471).

    The returned closure computes a per-example temperature from the cosine
    disagreement between (detached) student and teacher logits.
    """

    def flsw_temperature_scheduler(logits_S, logits_T, base_temperature):
        student = logits_S.detach()
        teacher = logits_T.detach()
        with torch.no_grad():
            # L2-normalize along the class dimension; eps guards zero rows.
            student = student / (torch.norm(student, dim=-1, keepdim=True) + eps)
            teacher = teacher / (torch.norm(teacher, dim=-1, keepdim=True) + eps)
            # Disagreement weight: (1 - cosine similarity) ** gamma.
            w = torch.pow((1 - (student * teacher).sum(dim=-1)), gamma)
            tau = base_temperature + (w.mean() - w) * beta
        return tau

    return flsw_temperature_scheduler


def cwsm_temperature_scheduler_builder(beta, *args):
    """Build the CWSM dynamic temperature scheduler (adapted from arXiv:1911.07471).

    The returned closure weights each example by the inverse of the student's
    maximum class probability (its confidence).
    """

    def cwsm_temperature_scheduler(logits_S, logits_T, base_temperature):
        probs = logits_S.detach()
        with torch.no_grad():
            probs = torch.softmax(probs, dim=-1)
            top = probs.max(dim=-1)[0]
            w = 1 / (top + 1e-3)
            tau = base_temperature + (w.mean() - w) * beta
        return tau

    return cwsm_temperature_scheduler


class LinearTeacherAnnealingScheduler(object):
    """Tracks training progress as a float growing linearly from 0 to 1."""

    def __init__(self, num_training_steps: int) -> None:
        """
        Args:
            num_training_steps: Total number of training steps expected.
        """
        super().__init__()
        self._num_training_steps = num_training_steps
        self._current_training_steps = 0

    def step(self):
        """Advance the schedule by one training step."""
        self._current_training_steps += 1

    def __float__(self):
        """Fraction of training completed so far."""
        return self._current_training_steps / self._num_training_steps
class TemperatureScheduler(ABC, AutoConfigurable):
    """Base class for distillation temperature schedulers.

    A scheduler maps a pair of (student, teacher) logits to the temperature
    used when softening them for the distillation loss.
    """

    def __init__(self, base_temperature) -> None:
        """
        Args:
            base_temperature: The reference temperature dynamic schedulers
                deviate from (and constant schedulers return as-is).
        """
        super().__init__()
        self.base_temperature = base_temperature

    def __call__(self, logits_S, logits_T):
        return self.forward(logits_S, logits_T)

    @abstractmethod
    def forward(self, logits_S, logits_T):
        """Return the temperature for the given student/teacher logits."""
        raise NotImplementedError()

    @staticmethod
    def from_name(name):
        """Instantiate a scheduler by its short name ('constant', 'flsw', 'cwsm')."""
        classes = {
            'constant': ConstantScheduler,
            'flsw': FlswScheduler,
            'cwsm': CwsmScheduler,
        }
        assert name in classes, f'Unsupported temperature scheduler {name}. Expect one from {list(classes.keys())}.'
        return classes[name]()


class FunctionalScheduler(TemperatureScheduler):
    """Scheduler that delegates to a plain scheduling function (closure)."""

    def __init__(self, scheduler_func, base_temperature) -> None:
        super().__init__(base_temperature)
        self._scheduler_func = scheduler_func

    def forward(self, logits_S, logits_T):
        return self._scheduler_func(logits_S, logits_T, self.base_temperature)


class ConstantScheduler(TemperatureScheduler):
    """Always returns the base temperature, ignoring the logits."""

    def forward(self, logits_S, logits_T):
        return self.base_temperature


class FlswScheduler(FunctionalScheduler):
    """FLSW dynamic temperature scheduler (arXiv:1911.07471)."""

    def __init__(self, beta=1, gamma=1, eps=1e-4, base_temperature=8):
        super().__init__(flsw_temperature_scheduler_builder(beta, gamma, eps), base_temperature)
        self.beta = beta
        self.gamma = gamma
        self.eps = eps


class CwsmScheduler(FunctionalScheduler):
    """CWSM dynamic temperature scheduler (arXiv:1911.07471)."""

    def __init__(self, beta=1, base_temperature=8):
        super().__init__(cwsm_temperature_scheduler_builder(beta), base_temperature)
        self.beta = beta
class CoNLLParsingDataset(TransformDataset):
    """Dataset for dependency parsing corpora in CoNLL-X / CoNLL-U format."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 prune: Callable[[Dict[str, List[str]]], bool] = None) -> None:
        """
        Args:
            data: A file path or a list of samples.
            transform: Transform(s) applied to each sample.
            cache: Whether/where to cache the loaded dataset.
            generate_idx: Whether to attach an index to each sample.
            prune: Predicate over a sample dict; samples for which it returns
                True are dropped during loading.
        """
        # _prune must be assigned before super().__init__, which presumably
        # triggers load_file when `data` is a path — confirm in TransformDataset.
        self._prune = prune
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield one sample dict per sentence, keyed by CoNLL column name."""
        if filepath.endswith('.conllu'):
            # See https://universaldependencies.org/format.html
            field_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                           'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            # CoNLL-X style column layout.
            field_names = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                           'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        fp = TimingFileIterator(filepath)
        for idx, sent in enumerate(read_conll(fp)):
            sample = {}
            for i, field in enumerate(field_names):
                # Transpose the sentence: one list of column values per field.
                sample[field] = [cell[i] for cell in sent]
            if not self._prune or not self._prune(sample):
                yield sample
            fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')

    def __len__(self) -> int:
        return len(self.data)
def append_bos(sample: dict, pos_key='CPOS', bos=ROOT) -> dict:
    """Prepend a pseudo root token so index 0 can act as the dependency root.

    Args:
        sample: A CoNLL sample dict of column lists.
        pos_key: Which POS column to expose as ``pos``.
        bos: A special token inserted to the head of tokens.

    Returns:
        The same sample, with ``token``/``pos``/``arc``/``rel`` fields added.
    """
    sample['token'] = [bos] + sample['FORM']
    if pos_key in sample:
        sample['pos'] = [ROOT] + sample[pos_key]
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD']
        # Position 0 borrows the first token's relation as a placeholder.
        sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL']
    return sample


def append_bos_eos(sample: dict) -> dict:
    """Wrap the sentence with BOS and EOS pseudo tokens (both headed by 0)."""
    sample['token'] = [BOS] + sample['FORM'] + [EOS]
    if 'CPOS' in sample:
        sample['pos'] = [BOS] + sample['CPOS'] + [EOS]
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD'] + [0]
        # Both pseudo positions borrow the first relation as a placeholder.
        sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL'] + sample['DEPREL'][:1]
    return sample


def get_sibs(sample: dict) -> dict:
    """Attach each token's nearest sibling index as ``sib_id``.

    Two tokens are siblings when they share a head and lie on the same side
    of it; each token keeps only its closest sibling (0 when none exists).
    """
    heads = sample.get('arc', None)
    if heads:
        siblings = [-1] * len(heads)
        for left in range(1, len(heads)):
            head_l = heads[left]
            for right in range(left + 1, len(heads)):
                head_r = heads[right]
                dist_l, dist_r = head_l - left, head_r - right
                # Same head and same side of it -> sibling pair; keep the
                # nearer token of the two and stop scanning for `left`.
                if head_l >= 0 and head_r >= 0 and head_l == head_r and dist_l * dist_r > 0:
                    if abs(dist_l) > abs(dist_r):
                        siblings[left] = right
                    else:
                        siblings[right] = left
                    break
        sample['sib_id'] = [0] + siblings[1:]
    return sample
class CharRNN(nn.Module, EmbeddingDim):
    """Character-level BiLSTM producing one vector per token.

    Runs a bidirectional LSTM over the characters of every token and uses the
    concatenated final hidden states of both directions as the token's
    character representation.
    """

    def __init__(self,
                 field,
                 vocab_size,
                 embed: Union[int, nn.Embedding],
                 hidden_size):
        """
        Args:
            field: Name of the token field; char ids are read from
                ``f'{field}_char_id'`` in the batch.
            vocab_size: Character vocabulary size (used when ``embed`` is an int).
            embed: Either an embedding size or a pre-built embedding module.
            hidden_size: Hidden size per direction; the output size is twice this.

        Raises:
            ValueError: If ``embed`` is neither an int nor an ``nn.Module``.
        """
        super(CharRNN, self).__init__()
        self.field = field
        # the embedding layer
        if isinstance(embed, int):
            self.embed = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed)
        elif isinstance(embed, nn.Module):
            self.embed = embed
            embed = embed.embedding_dim
        else:
            raise ValueError(f'Unrecognized type for {embed}')
        # the lstm layer
        self.lstm = nn.LSTM(input_size=embed,
                            hidden_size=hidden_size,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, batch, mask, **kwargs):
        x = batch[f'{self.field}_char_id']
        # [batch_size, seq_len, fix_len]; char id 0 is padding
        mask = x.ne(0)
        # [batch_size, seq_len] number of characters per token
        lens = mask.sum(-1)
        char_mask = lens.gt(0)

        # [n, fix_len, n_embed] flatten all non-empty tokens into one batch
        x = self.embed(x[char_mask])
        # FIX: pack_padded_sequence requires the lengths tensor on the CPU;
        # lens may live on the GPU, so move it explicitly.
        x = pack_padded_sequence(x, lens[char_mask].cpu(), True, False)
        x, (h, _) = self.lstm(x)
        # [n, fix_len, n_out] concat the final hidden states of both directions
        h = torch.cat(torch.unbind(h), -1)
        # [batch_size, seq_len, n_out] scatter token vectors back into place
        embed = h.new_zeros(*lens.shape, h.size(-1))
        embed = embed.masked_scatter_(char_mask.unsqueeze(-1), h)

        return embed

    @property
    def embedding_dim(self) -> int:
        """Output size: one hidden state per LSTM direction, concatenated."""
        return self.lstm.hidden_size * 2


class CharRNNEmbedding(Embedding, AutoConfigurable):
    """Embedding factory wiring a ToChar transform to a CharRNN module."""

    def __init__(self,
                 field,
                 embed,
                 hidden_size,
                 max_word_length=None) -> None:
        """
        Args:
            field: The token field whose characters are embedded.
            embed: Character embedding size or a nested Embedding config.
            hidden_size: Hidden size per LSTM direction.
            max_word_length: Truncate words longer than this (None = unlimited).
        """
        super().__init__()
        self.field = field
        self.hidden_size = hidden_size
        self.embed = embed
        self.max_word_length = max_word_length

    def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
        """Ensure the char vocabulary exists and return the ToChar transform."""
        if isinstance(self.embed, Embedding):
            self.embed.transform(vocabs=vocabs)
        vocab_name = self.vocab_name
        if vocab_name not in vocabs:
            vocabs[vocab_name] = Vocab()
        return ToChar(self.field, vocab_name, max_word_length=self.max_word_length)

    @property
    def vocab_name(self):
        """Name of the character vocabulary for this field."""
        vocab_name = f'{self.field}_char'
        return vocab_name

    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        """Build the CharRNN module backed by the char vocabulary."""
        embed = self.embed
        if isinstance(self.embed, Embedding):
            embed = self.embed.module(vocabs=vocabs)
        return CharRNN(self.field, len(vocabs[self.vocab_name]), embed, self.hidden_size)