├── examples
│   ├── load_commoncrawl_tranformers.py
│   └── load_wikipedia_tranformers.py
├── LICENSE
├── hubconf.py
├── .gitignore
└── README.md

/examples/load_commoncrawl_tranformers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) Musixmatch, spa
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from transformers import pipeline
8 | 
9 | # Fill-mask pipeline with umberto-commoncrawl-cased
10 | fill_mask = pipeline(
11 |     "fill-mask",
12 |     model="Musixmatch/umberto-commoncrawl-cased-v1",
13 |     tokenizer="Musixmatch/umberto-commoncrawl-cased-v1"
14 | )
15 | 
16 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
17 | print(result)
18 | 
--------------------------------------------------------------------------------
/examples/load_wikipedia_tranformers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) Musixmatch, spa
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from transformers import pipeline
8 | 
9 | # Fill-mask pipeline with umberto-wikipedia-uncased
10 | fill_mask = pipeline(
11 |     "fill-mask",
12 |     model="Musixmatch/umberto-wikipedia-uncased-v1",
13 |     tokenizer="Musixmatch/umberto-wikipedia-uncased-v1"
14 | )
15 | 
16 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
17 | print(result)
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Musixmatch Research
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Musixmatch, spa. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | 6 | dependencies = [ 7 | 'fairseq', 8 | 'sentencepiece', 9 | 'torch', 10 | ] 11 | 12 | def umberto_commoncrawl_cased(**kwargs): 13 | from fairseq import hub_utils 14 | from fairseq.models.roberta.hub_interface import RobertaHubInterface 15 | x = hub_utils.from_pretrained( 16 | model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.commoncrawl.cased.tar.gz', 17 | checkpoint_file='model.pt', 18 | data_name_or_path='.', 19 | bpe='sentencepiece', 20 | load_checkpoint_heads=True, 21 | **kwargs, 22 | ) 23 | return RobertaHubInterface(x['args'], x['task'], x['models'][0]) 24 | 25 | 26 | def umberto_wikipedia_uncased(**kwargs): 27 | from fairseq import hub_utils 28 | from fairseq.models.roberta.hub_interface import RobertaHubInterface 29 | x = hub_utils.from_pretrained( 30 | model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.wikipedia.uncased.tar.gz', 31 | checkpoint_file='model.pt', 32 | data_name_or_path='.', 33 | bpe='sentencepiece', 34 | load_checkpoint_heads=True, 35 | **kwargs, 36 | ) 37 | return RobertaHubInterface(x['args'], x['task'], x['models'][0]) 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 
3 | [Image: Marco Lodola, Monument to Umberto Eco, Alessandria 2019]
4 | 
5 | 
6 | # UmBERTo: an Italian Language Model trained with Whole Word Masking
7 | 
8 | UmBERTo is a RoBERTa-based language model trained on large Italian corpora.
9 | This implementation is based on the Facebook AI Research code (https://github.com/pytorch/fairseq).
10 | 
11 | # Description
12 | 
13 | UmBERTo inherits the RoBERTa base architecture, which improves on the original BERT by identifying key hyperparameters and training choices that lead to better results.
14 | UmBERTo extends RoBERTa with two approaches: ***SentencePiece*** and ***Whole Word Masking***.
15 | SentencePiece Model (**SPM**) is a language-independent subword tokenizer and detokenizer designed for neural text processing; it creates subword units tailored to the chosen vocabulary size and to the language of the corpus.
16 | Whole Word Masking (**WWM**) masks an entire word whenever at least one of the SentencePiece tokens that make it up is selected for masking, so whole words are masked rather than individual subwords (see the tokenizer sketch in the Examples section below).
17 | 
18 | Two models are released:
19 | - **umberto-wikipedia-uncased-v1**, an uncased model trained on a relatively small corpus (~7 GB) extracted from
20 | [Wikipedia-ITA](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/).
21 | - **umberto-commoncrawl-cased-v1**, a cased model trained on the Italian portion of Common Crawl via [OSCAR](https://traces1.inria.fr/oscar/) (Open Super-large Crawled ALMAnaCH coRpus), a large corpus of ~69 GB.
22 | 
23 | Both models are BASE-sized: 12 layers, hidden size 768, 12 attention heads, 110M parameters.
24 | 
25 | 
26 | | Model | WWM | CASED | TOKENIZER | VOCAB SIZE | TRAIN STEPS | FAIRSEQ | TRANSFORMERS |
27 | | ------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ |
28 | | `umberto-wikipedia-uncased-v1` | YES | NO | SPM | 32K | 100k | [Link](http://bit.ly/2s7JmXh) | [Link](http://bit.ly/35wbSj6) |
29 | | `umberto-commoncrawl-cased-v1` | YES | YES | SPM | 32K | 125k | [Link](http://bit.ly/2TakHfJ) | [Link](http://bit.ly/35zO7GH) |
30 | 
31 | We trained both models for 4 days on 8 Nvidia V100 GPUs (p2.8xlarge P2 EC2 instance) on [AWS SageMaker](https://aws.amazon.com/it/sagemaker/).
32 | 
33 | # Installation
34 | 
35 | ### Dependencies
36 | ```
37 | torch >= 1.3.1
38 | sentencepiece
39 | transformers
40 | fairseq
41 | ```
42 | 
43 | 
44 | #### Transformers
45 | 
46 | 
47 | ```pip install transformers```
48 | 
49 | To install `transformers` from the original repo (tested):
50 | ```bash
51 | git clone https://github.com/huggingface/transformers.git
52 | cd transformers
53 | pip install .
54 | ```
55 | 
56 | #### Fairseq
57 | 
58 | To use a version of `fairseq` with UmBERTo support, build it from source with these steps:
59 | ```bash
60 | git clone https://github.com/musixmatchresearch/fairseq
61 | cd fairseq
62 | pip install .
63 | ```
64 | 
65 | 
66 | # Examples
67 | 
68 | ### Transformers
69 | 
70 | From the official [HuggingFace](https://github.com/huggingface/transformers) code.
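
#### Inspecting the SentencePiece tokenizer

Before running the pipelines below, it can help to see how SentencePiece splits Italian text into subword units, since Whole Word Masking operates on these pieces. A minimal sketch, assuming only that `transformers` is installed and using the model names above; the printed subword splits are illustrative and depend on the learned vocabulary:

```python
from transformers import AutoTokenizer

# Load the UmBERTo SentencePiece tokenizer (Common Crawl, cased variant).
tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")

sentence = "Umberto Eco è stato un grande scrittore"

# SentencePiece segments the sentence into subword pieces ('▁' marks the start of a word).
print(tokenizer.tokenize(sentence))

# The mask token used by the fill-mask examples below.
print(tokenizer.mask_token)
```

During pre-training, Whole Word Masking replaces every subword piece of a selected word with this mask token, instead of masking isolated pieces.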
71 | 
72 | #### With the Transformers fill-mask pipeline
73 | 
74 | ```python
75 | 
76 | from transformers import pipeline
77 | 
78 | fill_mask = pipeline(
79 |     "fill-mask",
80 |     model="Musixmatch/umberto-commoncrawl-cased-v1",
81 |     tokenizer="Musixmatch/umberto-commoncrawl-cased-v1"
82 | )
83 | 
84 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
85 | 
86 | # [{'sequence': ' Umberto Eco è considerato un grande scrittore', 'score': 0.1859988570213318, 'token': 5032},
87 | #  {'sequence': ' Umberto Eco è stato un grande scrittore', 'score': 0.1781671643257141, 'token': 471},
88 | #  {'sequence': ' Umberto Eco è sicuramente un grande scrittore', 'score': 0.16565577685832977, 'token': 2654},
89 | #  {'sequence': ' Umberto Eco è indubbiamente un grande scrittore', 'score': 0.09328985959291458, 'token': 17908},
90 | #  {'sequence': ' Umberto Eco è certamente un grande scrittore', 'score': 0.05470150709152222, 'token': 5269}]
91 | ```
92 | 
93 | #### With Transformers AutoTokenizer and AutoModel
94 | ```python
95 | import torch
96 | from transformers import AutoTokenizer, AutoModel
97 | 
98 | tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
99 | umberto = AutoModel.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
100 | 
101 | encoded_input = tokenizer.encode("Umberto Eco è stato un grande scrittore")
102 | input_ids = torch.tensor(encoded_input).unsqueeze(0)  # Batch size 1
103 | outputs = umberto(input_ids)
104 | last_hidden_states = outputs[0]  # The last hidden states are the first element of the output tuple
105 | ```
106 | 
107 | ### Fairseq
108 | 
109 | ```python
110 | import torch
111 | 
112 | umberto = torch.hub.load('musixmatchresearch/umberto', 'umberto_commoncrawl_cased')
113 | 
114 | assert isinstance(umberto.model, torch.nn.Module)
115 | umberto.eval()  # disable dropout (or leave in train mode to finetune)
116 | 
117 | # Masked LM inference
118 | masked_line = 'Umberto Eco è <mask> un grande scrittore'
119 | result = umberto.fill_mask(masked_line, topk=20)
120 | # Output:
121 | # ('Umberto Eco è considerato un grande scrittore', 0.19939924776554108, ' considerato'),
122 | # ('Umberto Eco è sicuramente un grande scrittore', 0.1669664829969406, ' sicuramente'),
123 | # ('Umberto Eco è stato un grande scrittore', 0.16225320100784302, ' stato'),
124 | # ('Umberto Eco è indubbiamente un grande scrittore', 0.09528309106826782, ' indubbiamente')
125 | # ...
126 | ```
127 | 
128 | 
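
#### Feature extraction with Fairseq

Beyond masked-word prediction, the same hub interface can be used to extract contextual features, for example as input to a downstream classifier. A minimal sketch, assuming the torch.hub entry point defined in `hubconf.py` above and the standard fairseq RoBERTa hub-interface methods `encode` and `extract_features`:

```python
import torch

# Load UmBERTo through the torch.hub entry point defined in hubconf.py.
umberto = torch.hub.load('musixmatchresearch/umberto', 'umberto_commoncrawl_cased')
umberto.eval()  # disable dropout

# Encode with the SentencePiece BPE and extract contextual token features.
tokens = umberto.encode('Umberto Eco è stato un grande scrittore')  # 1-D tensor of token ids
features = umberto.extract_features(tokens)                         # shape: (1, sequence_length, 768)
print(features.shape)
```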
129 | 
130 | # Results
131 | We obtained state-of-the-art results for POS tagging, confirming that cased models trained with WWM perform better than uncased ones.
132 | Our uncased model `Umberto-Wikipedia-Uncased`, trained with WWM on a smaller dataset, achieves results comparable to those of the cased models.
133 | 
134 | ### Umberto-Wikipedia-Uncased
135 | These results refer to the umberto-wikipedia-uncased model.
136 | 
137 | #### Part of Speech (POS)
138 | 
139 | | Dataset | F1 | Precision | Recall | Accuracy |
140 | | ------ | ------ | ------ | ------ | ------ |
141 | | **UD_Italian-ISDT** | 98.563 | 98.508 | 98.618 | **98.717** |
142 | | **UD_Italian-ParTUT** | 97.810 | 97.835 | 97.784 | **98.060** |
143 | 
144 | #### Named Entity Recognition (NER)
145 | 
146 | | Dataset | F1 | Precision | Recall | Accuracy |
147 | | ------ | ------ | ------ | ------ | ------ |
148 | | **ICAB-EvalITA07** | **86.240** | 85.939 | 86.544 | 98.534 |
149 | | **WikiNER-ITA** | **90.483** | 90.328 | 90.638 | 98.661 |
150 | 
151 | ### Umberto-Commoncrawl-Cased
152 | 
153 | These results refer to the umberto-commoncrawl-cased model.
154 | 
155 | #### Part of Speech (POS)
156 | 
157 | | Dataset | F1 | Precision | Recall | Accuracy |
158 | | ------ | ------ | ------ | ------ | ------ |
159 | | **UD_Italian-ISDT** | 98.870 | 98.861 | 98.879 | **98.977** |
160 | | **UD_Italian-ParTUT** | 98.786 | 98.812 | 98.760 | **98.903** |
161 | 
162 | #### Named Entity Recognition (NER)
163 | 
164 | | Dataset | F1 | Precision | Recall | Accuracy |
165 | | ------ | ------ | ------ | ------ | ------ |
166 | | **ICAB-EvalITA07** | **87.565** | 86.596 | 88.556 | 98.690 |
167 | | **WikiNER-ITA** | **92.531** | 92.509 | 92.553 | 99.136 |
168 | 
169 | 
170 | 
171 | ## References
172 | * BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding [Paper](https://arxiv.org/abs/1810.04805), [Github](https://github.com/google-research/bert)
173 | * CamemBERT: a Tasty French Language Model [Paper](https://www.researchgate.net/publication/337183733_CamemBERT_a_Tasty_French_Language_Model), [Page](https://camembert-model.fr/)
174 | * GilBERTo: An Italian pretrained language model based on RoBERTa [Github](https://github.com/idb-ita/GilBERTo)
175 | * RoBERTa: A Robustly Optimized BERT Pretraining Approach [Paper](https://arxiv.org/abs/1907.11692), [Github](https://github.com/pytorch/fairseq/tree/master/fairseq/models)
176 | * SentencePiece: A simple and language independent subword tokenizer and detokenizer for neural text processing [Paper](https://www.aclweb.org/anthology/D18-2012/), [Github](https://github.com/google/sentencepiece)
177 | * Asynchronous Pipeline for Processing Huge Corpora on Medium to Low Resource Infrastructures [Paper](https://hal.inria.fr/hal-02148693)
178 | * Italy goes to Stanford: a collection of CoreNLP modules for Italian (TINT) [Paper](https://arxiv.org/abs/1609.06204), [Github](https://github.com/dhfbk/tint), [Page](https://dh.fbk.eu/technologies/tint-italian-nlp-tool)
179 | 
180 | 
181 | ## Credits
182 | All of the original datasets are publicly available or were released with the owners' permission. The datasets are all released under a CC0 or CC BY license.
183 | 
184 | * UD Italian-ISDT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ISDT)
185 | * UD Italian-ParTUT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ParTUT)
186 | * WIKINER [Page](https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500), [Paper](https://www.sciencedirect.com/science/article/pii/S0004370212000276?via%3Dihub)
187 | * I-CAB (Italian Content Annotation Bank), EvalITA [Page](http://www.evalita.it/)
188 | ```
189 | @inproceedings{magnini2006annotazione,
190 |   title = {Annotazione di contenuti concettuali in un corpus italiano: I-CAB},
191 |   author = {Magnini, Bernardo and Cappelli, Amedeo and Pianta, Emanuele and Speranza, Manuela and Bartalesi Lenzi, Valentina and Sprugnoli, Rachele and Romano, Lorenza and Girardi, Christian and Negri, Matteo},
192 |   booktitle = {Proc. of SILFI 2006},
193 |   year = {2006}
194 | }
195 | @inproceedings{magnini2006cab,
196 |   title = {I-CAB: the Italian Content Annotation Bank},
197 |   author = {Magnini, Bernardo and Pianta, Emanuele and Girardi, Christian and Negri, Matteo and Romano, Lorenza and Speranza, Manuela and Lenzi, Valentina Bartalesi and Sprugnoli, Rachele},
198 |   booktitle = {LREC},
199 |   pages = {963--968},
200 |   year = {2006},
201 |   organization = {Citeseer}
202 | }
203 | ```
204 | 
205 | ## Acknowledgments
206 | Special thanks to the I-CAB (Italian Content Annotation Bank) and [EvalITA](http://www.evalita.it/) authors for providing the datasets as part of a Master's Thesis research project with the [School of Engineering, University of Bologna](https://www.unibo.it/en/university/campuses-and-structures/schools/school-of-engineering).
207 | 
208 | ## Authors
209 | 
210 | **Loreto Parisi**: `loreto at musixmatch dot com`, [loretoparisi](https://github.com/loretoparisi)
211 | **Simone Francia**: `simone.francia at musixmatch dot com`, [simonefrancia](https://github.com/simonefrancia)
212 | **Paolo Magnani**: `paul.magnani95 at gmail dot com`, [paulthemagno](https://github.com/paulthemagno)
213 | 
214 | ## Citation
215 | 
216 | Cite this work with:
217 | ```bibtex
218 | @misc{musixmatch-2020-umberto,
219 |   author = {Loreto Parisi and Simone Francia and Paolo Magnani},
220 |   title = {UmBERTo: an Italian Language Model trained with Whole Word Masking},
221 |   year = {2020},
222 |   publisher = {GitHub},
223 |   journal = {GitHub repository},
224 |   howpublished = {\url{https://github.com/musixmatchresearch/umberto}}
225 | }
226 | ```
227 | 
228 | ## About Musixmatch AI
229 | ![Musixmatch AI mac app icon](https://user-images.githubusercontent.com/163333/72244273-396aa380-35ee-11ea-894b-4ea48230c02b.png)
230 | We do Machine Learning and Artificial Intelligence @[musixmatch](https://twitter.com/Musixmatch)
231 | Follow us on [Twitter](https://twitter.com/musixmatchai) and [GitHub](https://github.com/musixmatchresearch)
232 | 
233 | 
234 | 

--------------------------------------------------------------------------------