├── examples
│   ├── load_commoncrawl_tranformers.py
│   └── load_wikipedia_tranformers.py
├── LICENSE
├── hubconf.py
├── .gitignore
└── README.md

/examples/load_commoncrawl_tranformers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) Musixmatch, spa
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from transformers import pipeline
8 | 
9 | # Fill-mask pipeline with umberto-commoncrawl-cased
10 | fill_mask = pipeline(
11 |     "fill-mask",
12 |     model="Musixmatch/umberto-commoncrawl-cased-v1",
13 |     tokenizer="Musixmatch/umberto-commoncrawl-cased-v1"
14 | )
15 | 
16 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
17 | print(result)
18 | 
--------------------------------------------------------------------------------
/examples/load_wikipedia_tranformers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) Musixmatch, spa
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | from transformers import pipeline
8 | 
9 | # Fill-mask pipeline with umberto-wikipedia-uncased
10 | fill_mask = pipeline(
11 |     "fill-mask",
12 |     model="Musixmatch/umberto-wikipedia-uncased-v1",
13 |     tokenizer="Musixmatch/umberto-wikipedia-uncased-v1"
14 | )
15 | 
16 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
17 | print(result)
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Musixmatch Research
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Musixmatch, spa. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | 6 | dependencies = [ 7 | 'fairseq', 8 | 'sentencepiece', 9 | 'torch', 10 | ] 11 | 12 | def umberto_commoncrawl_cased(**kwargs): 13 | from fairseq import hub_utils 14 | from fairseq.models.roberta.hub_interface import RobertaHubInterface 15 | x = hub_utils.from_pretrained( 16 | model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.commoncrawl.cased.tar.gz', 17 | checkpoint_file='model.pt', 18 | data_name_or_path='.', 19 | bpe='sentencepiece', 20 | load_checkpoint_heads=True, 21 | **kwargs, 22 | ) 23 | return RobertaHubInterface(x['args'], x['task'], x['models'][0]) 24 | 25 | 26 | def umberto_wikipedia_uncased(**kwargs): 27 | from fairseq import hub_utils 28 | from fairseq.models.roberta.hub_interface import RobertaHubInterface 29 | x = hub_utils.from_pretrained( 30 | model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.wikipedia.uncased.tar.gz', 31 | checkpoint_file='model.pt', 32 | data_name_or_path='.', 33 | bpe='sentencepiece', 34 | load_checkpoint_heads=True, 35 | **kwargs, 36 | ) 37 | return RobertaHubInterface(x['args'], x['task'], x['models'][0]) 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 
3 | [Image: Marco Lodola, Monument to Umberto Eco, Alessandria 2019]
4 | 
5 | 
6 | # UmBERTo: an Italian Language Model trained with Whole Word Masking
7 | 
8 | UmBERTo is a RoBERTa-based language model trained on large Italian corpora.
9 | This implementation is based on the Facebook AI Research code (https://github.com/pytorch/fairseq).
10 | 
11 | # Description
12 | 
13 | UmBERTo inherits the RoBERTa base architecture, which improves on the original BERT by identifying key hyperparameters and training choices that lead to better results.
14 | UmBERTo extends RoBERTa with two approaches: ***SentencePiece*** and ***Whole Word Masking***.
15 | SentencePiece Model (**SPM**) is a language-independent subword tokenizer and detokenizer designed for neural text processing; it creates subword units tailored to the chosen vocabulary size and to the language of the corpus.
16 | Whole Word Masking (**WWM**) masks an entire word whenever at least one of the SentencePiece tokens that make it up is selected for masking, so whole words are masked rather than individual subwords (see the tokenizer sketch in the Examples section below).
17 | 
18 | Two models are released:
19 | - **umberto-wikipedia-uncased-v1**, an uncased model trained on a relatively small corpus (~7 GB) extracted from
20 | [Wikipedia-ITA](https://linguatools.org/tools/corpora/wikipedia-monolingual-corpora/).
21 | - **umberto-commoncrawl-cased-v1**, a cased model trained on the Italian portion of Common Crawl via [OSCAR](https://traces1.inria.fr/oscar/) (Open Super-large Crawled ALMAnaCH coRpus), a large corpus of ~69 GB.
22 | 
23 | Both models are BASE-sized: 12 layers, hidden size 768, 12 attention heads, 110M parameters.
24 | 
25 | 
26 | | Model | WWM | CASED | TOKENIZER | VOCAB SIZE | TRAIN STEPS | FAIRSEQ | TRANSFORMERS |
27 | | ------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ |
28 | | `umberto-wikipedia-uncased-v1` | YES | NO | SPM | 32K | 100k | [Link](http://bit.ly/2s7JmXh) | [Link](http://bit.ly/35wbSj6) |
29 | | `umberto-commoncrawl-cased-v1` | YES | YES | SPM | 32K | 125k | [Link](http://bit.ly/2TakHfJ) | [Link](http://bit.ly/35zO7GH) |
30 | 
31 | We trained both models for 4 days on 8 Nvidia V100 GPUs (p2.8xlarge P2 EC2 instance) on [AWS SageMaker](https://aws.amazon.com/it/sagemaker/).
32 | 
33 | # Installation
34 | 
35 | ### Dependencies
36 | ```
37 | torch >= 1.3.1
38 | sentencepiece
39 | transformers
40 | fairseq
41 | ```
42 | 
43 | 
44 | #### Transformers
45 | 
46 | 
47 | ```pip install transformers```
48 | 
49 | To install `transformers` from the original repo (tested):
50 | ```bash
51 | git clone https://github.com/huggingface/transformers.git
52 | cd transformers
53 | pip install .
54 | ```
55 | 
56 | #### Fairseq
57 | 
58 | To use a version of `fairseq` with UmBERTo support, build it from source with these steps:
59 | ```bash
60 | git clone https://github.com/musixmatchresearch/fairseq
61 | cd fairseq
62 | pip install .
63 | ```
64 | 
65 | 
66 | # Examples
67 | 
68 | ### Transformers
69 | 
70 | From the official [HuggingFace](https://github.com/huggingface/transformers) code.
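
#### Inspecting the SentencePiece tokenizer

Before running the pipelines below, it can help to see how SentencePiece splits Italian text into subword units, since Whole Word Masking operates on these pieces. A minimal sketch, assuming only that `transformers` is installed and using the model names above; the printed subword splits are illustrative and depend on the learned vocabulary:

```python
from transformers import AutoTokenizer

# Load the UmBERTo SentencePiece tokenizer (Common Crawl, cased variant).
tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")

sentence = "Umberto Eco è stato un grande scrittore"

# SentencePiece segments the sentence into subword pieces ('▁' marks the start of a word).
print(tokenizer.tokenize(sentence))

# The mask token used by the fill-mask examples below.
print(tokenizer.mask_token)
```

During pre-training, Whole Word Masking replaces every subword piece of a selected word with this mask token, instead of masking isolated pieces.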
71 | 
72 | #### With the Transformers fill-mask pipeline
73 | 
74 | ```python
75 | 
76 | from transformers import pipeline
77 | 
78 | fill_mask = pipeline(
79 |     "fill-mask",
80 |     model="Musixmatch/umberto-commoncrawl-cased-v1",
81 |     tokenizer="Musixmatch/umberto-commoncrawl-cased-v1"
82 | )
83 | 
84 | result = fill_mask("Umberto Eco è <mask> un grande scrittore")
85 | 
86 | # [{'sequence': ' Umberto Eco è considerato un grande scrittore', 'score': 0.1859988570213318, 'token': 5032},
87 | #  {'sequence': ' Umberto Eco è stato un grande scrittore', 'score': 0.1781671643257141, 'token': 471},
88 | #  {'sequence': ' Umberto Eco è sicuramente un grande scrittore', 'score': 0.16565577685832977, 'token': 2654},
89 | #  {'sequence': ' Umberto Eco è indubbiamente un grande scrittore', 'score': 0.09328985959291458, 'token': 17908},
90 | #  {'sequence': ' Umberto Eco è certamente un grande scrittore', 'score': 0.05470150709152222, 'token': 5269}]
91 | ```
92 | 
93 | #### With Transformers AutoTokenizer and AutoModel
94 | ```python
95 | import torch
96 | from transformers import AutoTokenizer, AutoModel
97 | 
98 | tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
99 | umberto = AutoModel.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")
100 | 
101 | encoded_input = tokenizer.encode("Umberto Eco è stato un grande scrittore")
102 | input_ids = torch.tensor(encoded_input).unsqueeze(0)  # Batch size 1
103 | outputs = umberto(input_ids)
104 | last_hidden_states = outputs[0]  # The last hidden states are the first element of the output tuple
105 | ```
106 | 
107 | ### Fairseq
108 | 
109 | ```python
110 | import torch
111 | 
112 | umberto = torch.hub.load('musixmatchresearch/umberto', 'umberto_commoncrawl_cased')
113 | 
114 | assert isinstance(umberto.model, torch.nn.Module)
115 | umberto.eval()  # disable dropout (or leave in train mode to finetune)
116 | 
117 | # Masked LM inference
118 | masked_line = 'Umberto Eco è <mask> un grande scrittore'
119 | result = umberto.fill_mask(masked_line, topk=20)
120 | # Output:
121 | # ('Umberto Eco è considerato un grande scrittore', 0.19939924776554108, ' considerato'),
122 | # ('Umberto Eco è sicuramente un grande scrittore', 0.1669664829969406, ' sicuramente'),
123 | # ('Umberto Eco è stato un grande scrittore', 0.16225320100784302, ' stato'),
124 | # ('Umberto Eco è indubbiamente un grande scrittore', 0.09528309106826782, ' indubbiamente')
125 | # ...
126 | ```
127 | 
128 | 
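
#### Feature extraction with Fairseq

Beyond masked-word prediction, the same hub interface can be used to extract contextual features, for example as input to a downstream classifier. A minimal sketch, assuming the torch.hub entry point defined in `hubconf.py` above and the standard fairseq RoBERTa hub-interface methods `encode` and `extract_features`:

```python
import torch

# Load UmBERTo through the torch.hub entry point defined in hubconf.py.
umberto = torch.hub.load('musixmatchresearch/umberto', 'umberto_commoncrawl_cased')
umberto.eval()  # disable dropout

# Encode with the SentencePiece BPE and extract contextual token features.
tokens = umberto.encode('Umberto Eco è stato un grande scrittore')  # 1-D tensor of token ids
features = umberto.extract_features(tokens)                         # shape: (1, sequence_length, 768)
print(features.shape)
```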
129 | 
130 | # Results
131 | We obtained state-of-the-art results for POS tagging, confirming that cased models trained with WWM perform better than uncased ones.
132 | Our uncased model `Umberto-Wikipedia-Uncased`, trained with WWM on a smaller dataset, achieves results comparable to those of the cased models.
133 | 
134 | ### Umberto-Wikipedia-Uncased
135 | These results refer to the umberto-wikipedia-uncased model.
136 | 
137 | #### Part of Speech (POS)
138 | 
139 | | Dataset | F1 | Precision | Recall | Accuracy |
140 | | ------ | ------ | ------ | ------ | ------ |
141 | | **UD_Italian-ISDT** | 98.563 | 98.508 | 98.618 | **98.717** |
142 | | **UD_Italian-ParTUT** | 97.810 | 97.835 | 97.784 | **98.060** |
143 | 
144 | #### Named Entity Recognition (NER)
145 | 
146 | | Dataset | F1 | Precision | Recall | Accuracy |
147 | | ------ | ------ | ------ | ------ | ------ |
148 | | **ICAB-EvalITA07** | **86.240** | 85.939 | 86.544 | 98.534 |
149 | | **WikiNER-ITA** | **90.483** | 90.328 | 90.638 | 98.661 |
150 | 
151 | ### Umberto-Commoncrawl-Cased
152 | 
153 | These results refer to the umberto-commoncrawl-cased model.
154 | 
155 | #### Part of Speech (POS)
156 | 
157 | | Dataset | F1 | Precision | Recall | Accuracy |
158 | | ------ | ------ | ------ | ------ | ------ |
159 | | **UD_Italian-ISDT** | 98.870 | 98.861 | 98.879 | **98.977** |
160 | | **UD_Italian-ParTUT** | 98.786 | 98.812 | 98.760 | **98.903** |
161 | 
162 | #### Named Entity Recognition (NER)
163 | 
164 | | Dataset | F1 | Precision | Recall | Accuracy |
165 | | ------ | ------ | ------ | ------ | ------ |
166 | | **ICAB-EvalITA07** | **87.565** | 86.596 | 88.556 | 98.690 |
167 | | **WikiNER-ITA** | **92.531** | 92.509 | 92.553 | 99.136 |
168 | 
169 | 
170 | 
171 | ## References
172 | * BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding [Paper](https://arxiv.org/abs/1810.04805), [Github](https://github.com/google-research/bert)
173 | * CamemBERT: a Tasty French Language Model [Paper](https://www.researchgate.net/publication/337183733_CamemBERT_a_Tasty_French_Language_Model), [Page](https://camembert-model.fr/)
174 | * GilBERTo: An Italian pretrained language model based on RoBERTa [Github](https://github.com/idb-ita/GilBERTo)
175 | * RoBERTa: A Robustly Optimized BERT Pretraining Approach [Paper](https://arxiv.org/abs/1907.11692), [Github](https://github.com/pytorch/fairseq/tree/master/fairseq/models)
176 | * SentencePiece: A simple and language independent subword tokenizer and detokenizer for neural text processing [Paper](https://www.aclweb.org/anthology/D18-2012/), [Github](https://github.com/google/sentencepiece)
177 | * Asynchronous Pipeline for Processing Huge Corpora on Medium to Low Resource Infrastructures [Paper](https://hal.inria.fr/hal-02148693)
178 | * Italy goes to Stanford: a collection of CoreNLP modules for Italian (TINT) [Paper](https://arxiv.org/abs/1609.06204), [Github](https://github.com/dhfbk/tint), [Page](https://dh.fbk.eu/technologies/tint-italian-nlp-tool)
179 | 
180 | 
181 | ## Credits
182 | All of the original datasets are publicly available or were released with the owners' permission. The datasets are all released under a CC0 or CC BY license.
183 | 
184 | * UD Italian-ISDT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ISDT)
185 | * UD Italian-ParTUT Dataset [Github](https://github.com/UniversalDependencies/UD_Italian-ParTUT)
186 | * WIKINER [Page](https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500), [Paper](https://www.sciencedirect.com/science/article/pii/S0004370212000276?via%3Dihub)
187 | * I-CAB (Italian Content Annotation Bank), EvalITA [Page](http://www.evalita.it/)
188 | ```
189 | @inproceedings{magnini2006annotazione,
190 |   title = {Annotazione di contenuti concettuali in un corpus italiano: I-CAB},
191 |   author = {Magnini, Bernardo and Cappelli, Amedeo and Pianta, Emanuele and Speranza, Manuela and Bartalesi Lenzi, Valentina and Sprugnoli, Rachele and Romano, Lorenza and Girardi, Christian and Negri, Matteo},
192 |   booktitle = {Proc. of SILFI 2006},
193 |   year = {2006}
194 | }
195 | @inproceedings{magnini2006cab,
196 |   title = {I-CAB: the Italian Content Annotation Bank},
197 |   author = {Magnini, Bernardo and Pianta, Emanuele and Girardi, Christian and Negri, Matteo and Romano, Lorenza and Speranza, Manuela and Lenzi, Valentina Bartalesi and Sprugnoli, Rachele},
198 |   booktitle = {LREC},
199 |   pages = {963--968},
200 |   year = {2006},
201 |   organization = {Citeseer}
202 | }
203 | ```
204 | 
205 | ## Acknowledgments
206 | Special thanks to the I-CAB (Italian Content Annotation Bank) and [EvalITA](http://www.evalita.it/) authors for providing the datasets as part of a Master's Thesis research project with the [School of Engineering, University of Bologna](https://www.unibo.it/en/university/campuses-and-structures/schools/school-of-engineering).
207 | 
208 | ## Authors
209 | 
210 | **Loreto Parisi**: `loreto at musixmatch dot com`, [loretoparisi](https://github.com/loretoparisi)
211 | **Simone Francia**: `simone.francia at musixmatch dot com`, [simonefrancia](https://github.com/simonefrancia)
212 | **Paolo Magnani**: `paul.magnani95 at gmail dot com`, [paulthemagno](https://github.com/paulthemagno)
213 | 
214 | ## Citation
215 | 
216 | Cite this work with:
217 | ```bibtex
218 | @misc{musixmatch-2020-umberto,
219 |   author = {Loreto Parisi and Simone Francia and Paolo Magnani},
220 |   title = {UmBERTo: an Italian Language Model trained with Whole Word Masking},
221 |   year = {2020},
222 |   publisher = {GitHub},
223 |   journal = {GitHub repository},
224 |   howpublished = {\url{https://github.com/musixmatchresearch/umberto}}
225 | }
226 | ```
227 | 
228 | ## About Musixmatch AI
229 | ![Musixmatch AI mac app icon](https://user-images.githubusercontent.com/163333/72244273-396aa380-35ee-11ea-894b-4ea48230c02b.png)
230 | We do Machine Learning and Artificial Intelligence @[musixmatch](https://twitter.com/Musixmatch)
231 | Follow us on [Twitter](https://twitter.com/musixmatchai) and [GitHub](https://github.com/musixmatchresearch)
232 | 
233 | 
234 | 

--------------------------------------------------------------------------------