├── .gitignore ├── LICENSE ├── README.md ├── data ├── GoogleRE_objects │ ├── af.json │ ├── ar.json │ ├── az.json │ ├── be.json │ ├── bg.json │ ├── bn.json │ ├── ca.json │ ├── ceb.json │ ├── cs.json │ ├── cy.json │ ├── da.json │ ├── de.json │ ├── el.json │ ├── en.json │ ├── es.json │ ├── et.json │ ├── eu.json │ ├── fa.json │ ├── fi.json │ ├── fr.json │ ├── ga.json │ ├── gl.json │ ├── he.json │ ├── hi.json │ ├── hr.json │ ├── hu.json │ ├── hy.json │ ├── id.json │ ├── it.json │ ├── ja.json │ ├── ka.json │ ├── ko.json │ ├── la.json │ ├── lt.json │ ├── lv.json │ ├── ms.json │ ├── nl.json │ ├── pl.json │ ├── pt.json │ ├── ro.json │ ├── ru.json │ ├── sk.json │ ├── sl.json │ ├── sq.json │ ├── sr.json │ ├── sv.json │ ├── ta.json │ ├── th.json │ ├── tr.json │ ├── uk.json │ ├── ur.json │ ├── vi.json │ └── zh.json └── TREx_multilingual_objects │ ├── af.json │ ├── an.json │ ├── ar.json │ ├── ast.json │ ├── az.json │ ├── azb.json │ ├── ba.json │ ├── bar.json │ ├── be.json │ ├── bg.json │ ├── bn.json │ ├── br.json │ ├── bs.json │ ├── ca.json │ ├── ce.json │ ├── ceb.json │ ├── cs.json │ ├── cv.json │ ├── cy.json │ ├── da.json │ ├── de.json │ ├── el.json │ ├── en.json │ ├── es.json │ ├── et.json │ ├── eu.json │ ├── fa.json │ ├── fi.json │ ├── fr.json │ ├── ga.json │ ├── gl.json │ ├── gu.json │ ├── he.json │ ├── hi.json │ ├── hr.json │ ├── ht.json │ ├── hu.json │ ├── hy.json │ ├── id.json │ ├── io.json │ ├── is.json │ ├── it.json │ ├── ja.json │ ├── jv.json │ ├── ka.json │ ├── kk.json │ ├── kn.json │ ├── ko.json │ ├── ky.json │ ├── la.json │ ├── lb.json │ ├── lmo.json │ ├── lt.json │ ├── lv.json │ ├── mg.json │ ├── min.json │ ├── mk.json │ ├── ml.json │ ├── mn.json │ ├── mr.json │ ├── ms.json │ ├── my.json │ ├── nds.json │ ├── ne.json │ ├── new.json │ ├── nl.json │ ├── nn.json │ ├── no.json │ ├── oc.json │ ├── pa.json │ ├── pl.json │ ├── pms.json │ ├── pnb.json │ ├── pt.json │ ├── ro.json │ ├── ru.json │ ├── scn.json │ ├── sco.json │ ├── sh.json │ ├── sk.json │ ├── sl.json │ ├── sq.json │ ├── sr.json │ ├── su.json │ ├── sv.json │ ├── sw.json │ ├── ta.json │ ├── te.json │ ├── tg.json │ ├── th.json │ ├── tl.json │ ├── tr.json │ ├── tt.json │ ├── uk.json │ ├── ur.json │ ├── uz.json │ ├── vi.json │ ├── vo.json │ ├── war.json │ ├── yo.json │ └── zh.json ├── dataset ├── cleanup.py ├── download_trexentities.py ├── download_wikidata.py ├── mbertlangs.txt ├── mlama.sh ├── reader.py ├── relations.py ├── requirements.txt ├── translate_googlere.py ├── translate_templates.py ├── translate_trex.py └── utils.py ├── mlama ├── __init__.py ├── build_encoded_dataset.py ├── eval_generation.py ├── evaluation_metrics_ranked.py ├── get_contextual_embeddings.py ├── modules │ ├── __init__.py │ ├── base_connector.py │ ├── bert_connector.py │ └── bert_connector_.py ├── options.py ├── utils.py └── vocab_intersection.py ├── requirements.txt └── scripts ├── batch_eval_KB_completion_mBERT_ranked.py ├── eval.py └── run_experiments_mBERT_ranked.py /.gitignore: -------------------------------------------------------------------------------- 1 | RelationExtraction/emnlp2017-relation-extraction-master/resources/glove/glove.6B.50d.txt 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are 
written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # IPython checkpoints 62 | .ipynb_checkpoints 63 | 64 | # Mac os x stuff 65 | .DS_Store 66 | 67 | pre-trained_language_models/ 68 | src/ 69 | .idea 70 | */.mypy_cache 71 | LAMA-Internal/ 72 | data/ 73 | last_results.csv 74 | output/ 75 | 76 | last_* 77 | .nfs* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mLAMA: multilingual LAnguage Model Analysis 2 | 3 | This repository contains code for the EACL 2021 paper ["Multilingual LAMA: Investigating Knowledge in Multilingual Pretrained Language Models"](https://arxiv.org/abs/2102.00894). 4 | It extends the original LAMA probe to the multilingual setting, i.e., it probes knowledge in pre-trained language models across multiple languages. 5 | 6 | The repository is forked from https://github.com/facebookresearch/LAMA and adapted accordingly. 7 | 8 | ## The mLAMA probe 9 | 10 | To reproduce our results: 11 | 12 | ### 1. Create conda environment and install requirements 13 | 14 | (optional) It might be a good idea to use a separate conda environment. It can be created by running: 15 | ``` 16 | conda create -n mlama -y python=3.7 && conda activate mlama 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | Add the project to the `PYTHONPATH`: 21 | 22 | export PYTHONPATH=${PYTHONPATH}:/path-to-project 23 | 24 | ### 2. Download the data 25 | 26 | 27 | ```bash 28 | wget http://cistern.cis.lmu.de/mlama/mlama1.1.zip 29 | unzip mlama1.1.zip 30 | rm mlama1.1.zip 31 | mv mlama1.1 data/mlama1.1/ 32 | ``` 33 | 34 | ### 3. Run the experiments 35 | 36 | ```bash 37 | python scripts/run_experiments_mBERT_ranked.py --lang "fr" 38 | python scripts/eval.py 39 | ``` 40 | 41 | ## The dataset 42 | 43 | Code to recreate the dataset can be found in the folder `dataset`. 44 | 45 | We provide a class to read in the dataset in `dataset/reader.py`. Example for reading the data: 46 | ```python 47 | ml = MLama("data/mlama/") 48 | ml.load() 49 | ``` 50 | 51 | ## Reference: 52 | 53 | ```bibtex 54 | @inproceedings{kassner2021multilingual, 55 | title = "Multilingual {LAMA}: Investigating Knowledge in Multilingual Pretrained Language Models", 56 | author = {Kassner, Nora and 57 | Dufter, Philipp and 58 | Sch{\"u}tze, Hinrich}, 59 | booktitle = "to appear in Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics", 60 | year = "2021", 61 | address = "Online", 62 | publisher = "Association for Computational Linguistics", 63 | } 64 | 65 | @inproceedings{petroni2019language, 66 | title={Language Models as Knowledge Bases?}, 67 | author={F. Petroni, T. Rockt{\"{a}}schel, A. H. Miller, P. Lewis, A. Bakhtin, Y. Wu and S. 
Riedel}, 68 | booktitle={In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2019}, 69 | year={2019} 70 | } 71 | ``` 72 | 73 | ## Acknowledgements 74 | 75 | * [https://github.com/huggingface/pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT) 76 | * [https://github.com/allenai/allennlp](https://github.com/allenai/allennlp) 77 | * [https://github.com/pytorch/fairseq](https://github.com/pytorch/fairseq) 78 | * https://github.com/facebookresearch/LAMA 79 | 80 | ## Licence 81 | 82 | mLAMA is licensed under the CC-BY-NC 4.0 license. The text of the license can be found [here](LICENSE). 83 | -------------------------------------------------------------------------------- /data/GoogleRE_objects/af.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/az.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/be.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ceb.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/cs.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["\u0158\u00edm", "Koda\u0148", "Praha", "Mnichov", "Sydney", "Var\u0161ava", "Burlington", "Antverpy", "Riga", "Pa\u0159\u00ed\u017e", "Stade", "Havana", "Jamajka", "Iowa", "Lond\u00fdn", "Madrid", "Sydney", "Mil\u00e1n", "Heidelberg", "Montr\u00e9al", "Var\u0161ava", "N\u011bmecko", "Pa\u0159\u00ed\u017e", "Bow", "Lipsko", "Dayton", "Sevilla", "Wellington", "Neapol", "Anglie", "\u0158\u00edm", "Lond\u00fdn", "Neapol", "Bradford", "Havana", "Stockholm", "Melbourne", "Var\u0161ava", "Anglie", "Lond\u00fdn", "Praha", "Montr\u00e9al", "Gloucester", "Budape\u0161\u0165", "Praha", "Jamajka", "Pa\u0159\u00ed\u017e", "Florencie", "Moskva", "Koda\u0148", "Anglie", "Praha", "Budape\u0161\u0165", "Tur\u00edn", "Sydney", "Anglie", "Praha", "Sydney", "Mil\u00e1n", "Aston", "Praha", "Ben\u00e1tky", "Toronto", "Nanking", "Tbilisi", "Francie", "\u0158\u00edm", "Sydney", "Filadelfie", "Cleveland", "Siena", "Stockholm", "Al\u017e\u00edr", "Fairfield", "Neapol", "Detroit", "Watford", "Liverpool", "Nevada", "Waterford", "Berl\u00edn", "Stuttgart", "Barcelona", "Polsko", "Pa\u0159\u00ed\u017e", "Pa\u0159\u00ed\u017e", "\u0158\u00edm", "Lond\u00fdn", "Wilmington", "Austin", "Seattle", "Vancouver", 
"Anglie", "Pa\u0159\u00ed\u017e", "Buckinghamshire", "Praha", "Melbourne", "It\u00e1lie", "Seattle", "Manchester", "Limerick", "Pa\u0159\u00ed\u017e", "Z\u00e1h\u0159eb", "Preston", "Oslo", "Al\u017e\u00edr", "Manchester", "Pa\u0159\u00ed\u017e", "Anglie", "Florencie", "Nottingham", "Pa\u0159\u00ed\u017e", "Z\u00e1h\u0159eb", "Alb\u00e1nie", "Praha", "Praha", "Brandon", "Boston", "\u0158\u00edm", "Ipswich", "Prefektura Tokio", "B\u011blehrad", "Neapol", "Riga", "Lipsko", "Barcelona", "\u0158\u00edm", "Praha", "Westminster", "Split", "Lisabon", "Split", "Moskva", "Edinburgh", "\u0160v\u00e9dsko", "Macon", "Bukure\u0161\u0165", "Kalifornie", "Madrid", "Dublin", "Columbus", "Lyon", "Pa\u0159\u00ed\u017e", "Francie", "Praha", "Var\u0161ava", "Brooklyn", "Como", "Lond\u00fdn", "Montr\u00e9al", "Lond\u00fdn", "Kalifornie", "Tur\u00edn", "Palermo", "V\u00edde\u0148", "Oslo", "Praha", "Polsko", "\u0160pan\u011blsko", "J\u00e1va", "V\u00edde\u0148", "Rochester", "Tur\u00edn", "Lvov", "Ben\u00e1tky", "Massachusetts", "Devon", "Pa\u0159\u00ed\u017e", "Berl\u00edn", "Chicago", "Toledo", "Surrey", "Anglie", "B\u011blehrad", "Praha", "Hamburk", "Slovensko", "Lipsko", "Rusko", "Finsko", "Victoria", "Hol\u0161t\u00fdnsko", "Berl\u00edn", "Weston", "Neapol", "Lyon", "Mil\u00e1n", "Amsterdam", "Pa\u0159\u00ed\u017e", "Mexiko", "It\u00e1lie", "Stuttgart", "Lond\u00fdn", "Praha", "Berl\u00edn", "Casablanca", "Tampa", "Belgie", "Jeruzal\u00e9m", "Lond\u00fdn", "Janov", "Anglie", "Lipsko", "Mil\u00e1n", "Vancouver", "Lond\u00fdn", "Tours", "Newport", "Springfield", "Austr\u00e1lie", "Mil\u00e1n", "Berl\u00edn", "Tottenham", "Brooklyn", "Borneo", "Berl\u00edn", "Ontario", "Filadelfie", "Mil\u00e1n", "Pa\u0159\u00ed\u017e", "\u0158\u00edm", "Nassau", "Bukure\u0161\u0165", "Lipsko", "V\u00edde\u0148", "Norwich", "Amsterdam", "Bratislava", "Moskva", "Neapol", "Pittsburgh", "Como", "Detroit", "Stuttgart", "Tbilisi", "Praha"], "subjects": ["Eduard Ender", "Eyolf Kleven", "Alois Wachsman", "Marcus Junkelmann", "Julia Wilson", "Zenon Nowosz", "Paul Daniels", "Peeter van Bredael", "Igors Vihrovs", "Renaud Gagneux", "Peter Ording", "Yanitzia Canetti", "Bernard Wright", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Clive Brooks", "Maja Tucholke", "Kelley Deal", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Leo Abrahams", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Aliuska L\u00f3pezov\u00e1", "Staffan de Mistura", "Brett Hayman", "Romuald Giegiel", "Colin Groves", "Benjamin Brecknell Turner", "Rudolf K\u0159es\u0165an", "David Atkinson", "Edward Gardner", "Andrea M\u00e1tayov\u00e1", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Herv\u00e9 Alphand", "Filippo Soffici", "Jelena Beljakovov\u00e1", "Victor Borge", "Stephen Fox", "Pavel \u017d\u00e1\u010dek", "Katalin Kar\u00e1dy", "Felice Giordano", "Danielle McGrath", "John Mundy", "Milan Orlowski", "Stephen Carr", "Guiniforte Solari", "Trevor Burton", "Jan Anton\u00edn Duchoslav", "Giovanni Francesco Commendone", "Jack Blum", "Kuo \u0164in-lung", "\u017dores Medved\u011bv", "Gratien Ga\u00ebl Suares", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Mark Buchanan", "Alessandro Frosini", "Ellen Gulbransonov\u00e1", "Paul Belmondo", "Richard Bayley", "Carlo Silipo", "Lawrence Kushner", "George Randolph Pearkes", "Dylan Taite", "Pat Nixonov\u00e1", 
"Michael Carney", "Guy De Saint Cyr", "Wilhelm Friedrich Boger", "Felipe Alfau", "Li'on Dici'an", "Marc Sangnier", "Corn\u00e9lie Falcon", "Stefano Nolfi", "Alaric Alexander Watts", "Collins J. Seitz", "Sahara Smith", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molyn", "Gabriel Bertrand", "John Borlase", "Petr Kroutil", "Michael Guider", "Tancr\u00e8de Dumas", "Jeff Simmons", "Matt O'Connor", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Snorre Valen", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Edward Locke", "Enrico Toselli", "Barry Howard", "Alain Ehrenberg", "Aleksandra Romani\u0107", "Thomas Nassi", "Martin Kratochv\u00edl", "Joseph Wilhelm Swoboda", "Tim Long", "Frederick Lewis Allen", "Anastasius I.", "Jamie Moses", "Take\u0161i Maeda", "Andrea Leki\u0107ov\u00e1", "Andrea Giani", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Miguel Garc\u00eda", "Augusto De Marsanich", "Franti\u0161ek Neuwirth", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Jennifer Smith", "Petar \u010culi\u0107", "Lev Le\u0161\u010denko", "Georgina Kennard", "Staffan de Mistura", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Catherine Pakenham", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "Lud\u011bk Fr\u00fdbort", "Jan Szyszko", "Joe Ascione", "Luca Princiotta", "Malcolm Cecil", "William Reed", "Nigel Preston", "Jimmy Greenspoon", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "Regina Mar\u0161\u00edkov\u00e1", "Gosia Piotrowska", "Manola Saavedra", "Ien Angov\u00e1", "Ernst Florian Winter", "Diane Greene", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Giulio Carpioni", "Sarah Stiles", "Neil Doncaster", "Charles Nicholas Aub\u00e9", "Meike Evers", "James Burnham", "Francisco Cervantes de Salazar", "Rob Heanley", "Barry Palmer", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Pavol Polakovi\u010d", "Johann Friedrich Schleusner", "Rosabelle Sinclair", "Sami Hinkka", "Murray Hocking", "Louis Gurlitt", "Heinz Oestergaard", "Jared Cohen", "Giuseppe de Majo", "Ernest Lafont", "Matteo Salvini", "Theodor Holman", "Ren\u00e9 Mayer", "Leopoldo Gout", "Giancarlo Primo", "Bernhard R\u00fchling", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Michel Lafosse", "Josef Chari\u0161", "Charles Forbes Ren\u00e9 de Montalembert", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Abbondio Sangiorgio", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Richard Hudson", "Borah Bergman", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Leon Bass", "Marisa Masullo", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Moritz Wilhelm Drobisch", "Adolf Patera", "Warren Carlyle", "Gijs Vermeulen", "Andrej \u0160eban", "Andrej Maratovi\u010d Babickij", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli", "Dave Marsh", "Kim Bauermeister", "Giorgi Ketojev", "Jan \u010cul\u00edk"]}, "place_of_death": {"objects": ["Stockholm", "Cambridge", "Konstantinopol", "Havaj", "Edinburgh", "Neapol", "Lyon", "V\u00edde\u0148", "\u0158\u00edm", "Var\u0161ava", "Belgie", "Cambridge", "Lond\u00fdn", "Amsterdam", "Lubla\u0148", "Exeter", "Florencie", "Jeruzal\u00e9m", "Litva", "Buffalo", "Vilnius", "Manhattan", "Cincinnati", "\u0158\u00edm", "Florida", "Tunisko", "Gent", 
"Kalifornie", "Sussex", "Manhattan", "Jerevan", "Helsinky", "Oxford", "Montr\u00e9al", "Florencie", "Lond\u00fdn", "Praha", "Pa\u0159\u00ed\u017e", "Madrid", "Lond\u00fdn", "Var\u0161ava", "Liverpool", "Z\u00e1h\u0159eb", "Perth", "Var\u0161ava", "Lipsko", "Praha", "Toronto", "Jokohama", "Siena", "Atlanta", "Berl\u00edn", "Detroit", "Birmingham", "Madrid", "Var\u0161ava", "Dover", "Lille", "Scarborough", "Konstantinopol", "Berkeley", "Sevilla", "Moskva", "Amsterdam", "Janov", "Filadelfie", "Praha"], "subjects": ["Johann Gustaf Sandberg", "Simon Greenleaf", "Jan I. Dukas", "Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Johan Stephan Decker", "Gioseppe Agnelli", "Adolf Dygasi\u0144ski", "Mark\u00e9ta Anglick\u00e1", "Grahame Clark", "Joshua Cristall", "Norbert van Bloemen", "Janez Bleiweis", "John Flavel", "Domenico Passignano", "Wolf Gold", "Simonas Daukantas", "Rose Clark", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Gotthard Deutsch", "Giovanni Battista Caccini", "Jim Chapin", "Georges Madon", "Robert van Audenaerd", "Ethel Catherwoodov\u00e1", "Thomas Slingsby Duncombe", "Der Scutt", "Toros Toramanian", "Olavi Paavolainen", "Homer Hasenpflug Dubs", "Edouard Gagnon", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Alexandr Gu\u010dkov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Frane Buli\u0107", "Vivian Bullwinkelov\u00e1", "Ludwika J\u0119drzejewicz", "Paul Luther", "Wilhelm Elsner", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Christa Wolfov\u00e1", "Orestes Brownson", "Martin Laroche", "Enrique Sarasola", "Wincenty Krasi\u0144ski", "Nathaniel William Wraxall", "John Shortland", "Stephen Joseph", "Abd\u00fclmecid I.", "Franti\u0161ek Wolf", "Felipe de Le\u00f3n", "Nikolaj Strunnikov", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/cy.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/et.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/eu.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhage", "Munich", "Sydney", "Milwaukee", "Dallas", "Stade", "Habana", "Livingston", "Varsovia", "Tacoma", "Londres", "Madril", "Amsterdam", "Sydney", "Milan", "Heidelberg", "Montreal", "Oklahoma", "Varsovia", "Alemania", "Leipzig", "Wellington", "Ingalaterra", "Lublin", "Hartford", "Erroma", "Londres", "Nebraska", "Bradford", "Melbourne", "Grezia", "Filadelfia", "Ingalaterra", "Shelby", "Montreal", "Praga", "Chicago", "Florentzia", "Kopenhage", "Praga", "Sydney", "Londres", "Ingalaterra", "Paris", "Sydney", "Boston", "Cornish", "Tbilisi", "Erroma", "Sydney", "Filadelfia", "Siena", "Aljer", "Chicago", 
"Nevada", "Waterford", "Stuttgart", "Bartzelona", "Sevilla", "Aljer", "Paris", "Polonia", "Pittsburgh", "Seattle", "Vancouver", "Utah", "Paris", "Pomerania", "Buckinghamshire", "Paris", "Paris", "Melbourne", "Manhattan", "Bergen", "Hollywood", "Limerick", "Paris", "Zagreb", "Preston", "Chicago", "Manchester", "Ingalaterra", "Nottingham", "Kalifornia", "Zagreb", "Albania", "Praga", "Erroma", "Orlando", "Riga", "Leipzig", "Erroma", "Westminster", "Split", "Chicago", "Split", "Macon", "Bukarest", "Kalifornia", "Espainia", "Columbus", "Lyon", "Paris", "Tallinn", "Frantzia", "Como", "Cardiff", "Paris", "Montreal", "Wilmington", "Turin", "Palermo", "Paris", "Viena", "Oslo", "Turin", "Lviv", "Paris", "Kensington", "Massachusetts", "Devon", "Berlin", "Surrey", "Hanburgo", "Oslo", "Budapest", "Errusia", "Victoria", "Holstein", "Paris", "Pennsylvania", "Milan", "Italia", "Lima", "Stuttgart", "Macon", "Praga", "Berlin", "Casablanca", "Tampa", "Genova", "Ingalaterra", "Leipzig", "Vancouver", "Londres", "Tours", "Newport", "Springfield", "Australia", "Milan", "Berlin", "Hartford", "Malta", "Borneo", "Berlin", "Cardiff", "Ontario", "Mississippi", "Tallinn", "Milan", "Erroma", "Bukarest", "Norwich", "Tallinn", "Amsterdam", "Baltimore", "Filadelfia", "Pittsburgh", "Chicago"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Robert Daniel Murphy", "Shannon Emerick", "Peter Ording", "Yanitzia Canetti", "Ken Niles", "Gaba Kulka", "Michael Manuel", "James William Wallack", "Fina de Calder\u00f3n", "Johan de Graeff", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Charles Kemper", "Stanis\u0142aw Urban", "Aurel Codoban", "Maja Tucholke", "Barry Mitcalfe", "Leo Abrahams", "J\u00f3zef Wieniawski", "Ann Corio", "Enrico Montesano", "Cliff Jones", "Edwin Sutherland", "Christfried Burmeister", "Brett Hayman", "Nektaria Karantzi", "Susan Denin", "Colin Groves", "Nina Repeta", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Byron Morrow", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Danielle McGrath", "David Parry", "John Mundy", "No\u00ebl Gallon", "Stephen Carr", "John Snyder", "Julie Duncan", "Zhores Medvedev", "Luzio Vero", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Paul Belmondo", "No I.D.", "Patricia Ryan Nixon", "Michael Carney", "Wilhelm Boger", "Felipe Alfau", "Cipriano de Valera", "Alain Dorval", "Marc Sangnier", "Eva Maria Zuk", "Buzzy Linhart", "Shyril O'Steen", "Peter Dembicki", "Leonard Strong", "Gabriel Bertrand", "Martin Kosleck", "John Borlase", "Jean Gallon", "Pierre Joxe", "Michael Guider", "Dennis Davis", "Kjersti Elvik", "Amy Chance", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Kip King", "John Mundy", "Edward Locke", "Barry Howard", "John Friedrich", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Anastasio I.a", "Davis Gaines", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Dino Wells", "Petar \u010culi\u0107", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "\u00c1ngel Garma", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Michael Armstrong", "Luca Princiotta", "Herbert Bowden, Baron Aylestone", "\u00c9mile L\u00e9vy", "William Reed", "Margaret Gwenver", "Alessio Secco", "Francesco Musotto", "Pierre Cartellier", "Marion Stein", "Erik Dammann", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Armand Toussaint", "Nigel Tangye", "Sarah Stiles", "Neil 
Doncaster", "Meike Evers", "Rob Heanley", "Frederick Franklin Schrader", "Jon Elster", "\u00c1kos Cs\u00e1sz\u00e1r", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Bill Dillard", "Matteo Salvini", "Giancarlo Primo", "Antonio Ruiz de Montoya", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Nick Karner", "Gabriel Caruana", "Michael Matus", "Erich Werdermann", "Simon Bowman", "Elizabeth Hess", "Judy Dunaway", "Aarne Ruben", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Alwina Valleria", "Bernie Lowe", "David Scott Milton", "Paul Willis"]}, "place_of_death": {"objects": ["Hawaii", "Edinburgh", "Richmond", "Lyon", "Varsovia", "Colchester", "Hollywood", "Filadelfia", "Cambridge", "Kingston", "Londres", "Carlisle", "Vilnius", "Manhattan", "Paris", "Florida", "Sussex", "Manhattan", "Londres", "Jerusalem", "Kalifornia", "Helsinki", "Oxford", "Houston", "Florentzia", "Londres", "Praga", "Paris", "Madril", "Londres", "Varsovia", "Liverpool", "Sevilla", "Leipzig", "Cheyenne", "Newark", "Toronto", "Yokohama", "Siena", "Atlanta", "Berlin", "Hollywood", "Detroit", "Berkeley", "Lille", "Scarborough", "Hollywood", "Boston", "Konstantinopla", "Paris", "Sevilla", "Bolonia", "Amsterdam", "Genova", "Filadelfia", "Praga"], "subjects": ["Donn Lewin", "Henry Siddons", "Alexander William Doniphan", "Joseph Jean-Baptiste Xavier Fournet", "Adolf Dygasi\u0144ski", "Hugh Iorys Hughes", "Romaine Fielding", "Coral Lansbury", "Grahame Clark", "Mariana Grajales", "Joshua Cristall", "Molly Pitcher", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Andr\u00e9 Chamson", "Jim Chapin", "Thomas Slingsby Duncombe", "Der Scutt", "Pauline Joran", "Elisha Netanyahu", "Anthony George", "Olavi Paavolainen", "Homer Hasenpflug Dubs", "Tommy Leonetti", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Alexander Gutxkov", "Juan P\u00e9rez de Montalv\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Floridablancako kondea", "Paul Luther", "William Pleater Davidge", "Jerry Damon", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Christa Wolf", "Gloria Grey", "Orestes Brownson", "Egon Petri", "John Shortland", "Stephen Joseph", "Anita King", "Robert Hazard", "Abdulmezid I.a", "Sibyl Sanderson", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/gl.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["M\u00fanic", "Sydney", "Dallas", "Stade", "Casablanca", "A Habana", "Suecia", "Iowa", "Londres", "Madrid", "Sydney", "Mil\u00e1n", "Heidelberg", "Arxentina", "Montreal", "Toronto", "Varsovia", "Alema\u00f1a", "Bow", "Leipzig", "Wellington", "Roma", "Londres", "Melbourne", "Inglaterra", "Montreal", "Praga", "Xamaica", "Montreal", "Florencia", "Colonia", "Praga", "Sydney", "Londres", "Manchester", "Inglaterra", "Sydney", "Belgrado", "Alema\u00f1a", "Sydney", "Filadelfia", "Melbourne", 
"Siena", "Escocia", "Chicago", "Waterford", "Berl\u00edn", "Stuttgart", "Par\u00eds", "Madrid", "Seattle", "Vancouver", "Inglaterra", "Par\u00eds", "Buckinghamshire", "Melbourne", "Manhattan", "Limerick", "Par\u00eds", "Honduras", "Zagreb", "Alxer", "Manchester", "Par\u00eds", "Miami", "Inglaterra", "Zagreb", "Albania", "Praga", "Roma", "Riga", "Glasgow", "Bristol", "Roma", "Split", "Edimburgo", "Split", "Londres", "Macon", "Londres", "Bucarest", "California", "Madrid", "Li\u00f3n", "Par\u00eds", "Tal\u00edn", "Francia", "Como", "Cardiff", "Montreal", "Tur\u00edn", "Palermo", "Viena", "Oslo", "Manchester", "Viena", "Sheffield", "Rochester", "Tur\u00edn", "Lviv", "Massachusetts", "Devon", "Berl\u00edn", "Surrey", "Hamburgo", "Rusia", "Victoria", "Berl\u00edn", "Par\u00eds", "Mil\u00e1n", "Italia", "Stuttgart", "Macon", "Praga", "Berl\u00edn", "Casablanca", "Tampa", "X\u00e9nova", "Inglaterra", "Leipzig", "Vancouver", "Londres", "Tours", "Newport", "Australia", "Mil\u00e1n", "Berl\u00edn", "Borneo", "Berl\u00edn", "Caracas", "Cardiff", "Ontario", "Tal\u00edn", "Mil\u00e1n", "Roma", "Bucarest", "Par\u00eds", "Tal\u00edn", "\u00c1msterdam", "Filadelfia", "Alxer", "Roma", "Lisboa"], "subjects": ["Marcus Junkelmann", "Julia Wilson", "Tom Jones", "Peter Ording", "Jos\u00e9 B\u00e9naz\u00e9raf", "Yanitzia Canetti", "Leonard Gyllenhaal", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Horatio Luro", "Wayne Eagling", "Tony Mitchell", "Stanis\u0142aw Urban", "Aurel Codoban", "Clive Brooks", "Maja Tucholke", "Barry Mitcalfe", "Enrico Montesano", "Cliff Jones", "Brett Hayman", "Colin Groves", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Lionel Tiger", "Filippo Soffici", "Josef Metternich", "Pavel \u017d\u00e1\u010dek", "Danielle McGrath", "David Parry", "Alfred Ollivant", "John Mundy", "Stephen Carr", "Aco Petrovi\u0107", "Antonio Ciacca", "Elizabeth Kell", "Francis Davis", "Justine Smethurst", "Alessandro Frosini", "John McHale", "No_ID", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Marc Sangnier", "Eduardo Lago", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molijn", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Dennis Davis", "Sam Lynch", "Claude Piel", "Vicente G\u00f3mez", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Alejandro Gonz\u00e1lez Trujillo", "Edward Locke", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Anastasio I, papa", "Mordehajs Dubins", "Ian Steel", "William Child", "Augusto De Marsanich", "Tomislav Smoljanovi\u0107", "Walter Elliot", "Petar \u010culi\u0107", "Andrew Bell", "Laurence Stallings", "Charles Dixon", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Michael Armstrong", "Luca Princiotta", "Herbert Bowden, Baron Aylestone", "William Reed", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "John Owens", "Ernst Florian Winter", "Trevor Taylor", "Diane Greene", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "Rob Heanley", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Heinz Oestergaard", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Matteo Salvini", "Giancarlo Primo", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, 
Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Michael Matus", "Erich Werdermann", "Rosario Marciano", "Simon Bowman", "Elizabeth Hess", "Aarne Ruben", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Elsa Lunghini", "Martin Zobel", "Gijs Vermeulen", "Bernie Lowe", "Hakim Toumi", "Luis Simarro", "Rui Tavares"]}, "place_of_death": {"objects": ["Hawai", "Edimburgo", "Francia", "Li\u00f3n", "Colombia", "Cambridge", "Londres", "Vilnius", "Manhattan", "Florida", "Madrid", "Manhattan", "Par\u00eds", "Oxford", "Florencia", "Londres", "Praga", "Madrid", "Londres", "Varsovia", "Leiden", "Liverpool", "Dunedin", "Toronto", "Iocoama", "Siena", "Berkeley", "Atlanta", "Berl\u00edn", "Detroit", "Madrid", "Lille", "Boston", "Sevilla", "Bolo\u00f1a", "\u00c1msterdam", "X\u00e9nova", "Filadelfia", "Praga"], "subjects": ["Donn Lewin", "Henry Siddons", "Walt Hansgen", "Joseph Jean-Baptiste Xavier Fournet", "Rafael Pombo", "Grahame Clark", "Joshua Cristall", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Jim Chapin", "Esperanza P\u00e9rez Labrador", "Der Scutt", "Jules Quicherat", "Homer Hasenpflug Dubs", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalv\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Tiberius Hemsterhuis", "Jefferson Lowndes", "Kurt Baier", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Andrew Imbrie", "Cesare Siepi", "Christa Wolf", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Robert Hazard", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/he.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/hr.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhagen", "M\u00fcnchen", "Sydney", "Stade", "Havana", "Iowa", "London", "Madrid", "Zagreb", "Sydney", "Heidelberg", "Montr\u00e9al", "London", "Var\u0161ava", "Njema\u010dka", "Leipzig", "Turska", "Sarajevo", "Sevilla", "Wellington", "Napulj", "Bologna", "London", "Napulj", "Bradford", "Melbourne", "Engleska", "Montr\u00e9al", "Zagreb", "Prag", "Turska", "Kopenhagen", "Prag", "Torino", "Sydney", "Engleska", "Sydney", "Toronto", "Sydney", "Philadelphia", "Siena", "Split", "Nevada", "Waterford", "Berlin", "Barcelona", "Pariz", "Turska", "Seattle", "Vancouver", "Pariz", "Buckinghamshire", "Melbourne", "Limerick", "Pariz", "Zagreb", "Al\u017eir", "Manchester", "Engleska", "Nottingham", "London", "Zagreb", "Albanija", "Prag", "Riga", "Leipzig", "Westminster", "Split", "Sarajevo", "Split", "Macon", "Bukure\u0161t", "Kalifornija", "Madrid", "Lyon", "Pariz", "Francuska", "Montr\u00e9al", "London", "Palermo", "Sarajevo", "Be\u010d", "Oslo", "\u0160panjolska", "Sheffield", "Lavov", "Massachusetts", "Devon", "Arkansas", "Berlin", "Toledo", "Beograd", "Hamburg", "Rusija", "Finska", "Victoria", "Holstein", "Gvatemala", "Milano", "London", "Prag", "Berlin", "Engleska", 
"Pasadena", "Leipzig", "Vancouver", "London", "Tours", "Newport", "Frederick", "Australija", "Berlin", "London", "Zagreb", "Borneo", "Berlin", "Ontario", "Pariz", "Amsterdam", "Bratislava", "Pittsburgh", "Como", "Dublin"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Peter Ording", "Yanitzia Canetti", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Gordan Ko\u017eulj", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "John Barry", "Stanis\u0142aw Urban", "Aurel Codoban", "Maja Tucholke", "Serdar Apayd\u0131n", "Viktor Ivan\u010di\u0107", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Ottaviano Mascherino", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Colin Groves", "David Atkinson", "Marija Lugari\u0107", "Ivo Luka\u010dovi\u010d", "Murat Evliyao\u011flu", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "Jack Blum", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Tino Vegar", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Felipe Alfau", "Marc Sangnier", "Tolga Tekinalp", "Shyril O'Steen", "Peter Dembicki", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Sam Lynch", "Claude Piel", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Edward Locke", "Barry Howard", "Olivia Poulet", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Davor Su\u010di\u0107", "Petar \u010culi\u0107", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "William Reed", "Nigel Preston", "Francesco Musotto", "Kemal Alispahi\u0107", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Trevor Taylor", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Keena Rothhammer", "Meike Evers", "Francisco Cervantes de Salazar", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Sami Hinkka", "Murray Hocking", "Louis Gurlitt", "Franz Galich Mazariegos", "Matteo Salvini", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "John Joseph Braham, Sr.", "Gordon Copley", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Scott Ambush", "Claire Baxter", "Peter Lachmann", "Reginald Baliot Brett", "Lovro Artukovi\u0107", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Fran\u00e7ois Maspero", "Gijs Vermeulen", "Andrej \u0160eban", "David Scott Milton", "Giambattista Nolli", "John O'Conor"]}, "place_of_death": {"objects": ["Carigrad", "Havaji", "Edinburgh", "Francuska", "Bologna", "Napulj", "Lyon", "Cambridge", "London", "Litva", "Vilnius", "Manhattan", "Pariz", "Florida", "Beograd", "Sussex", "Manhattan", "Oxford", "London", "Lisabon", "Prag", "Madrid", "London", "Var\u0161ava", "Liverpool", "Zagreb", "Pariz", "Toronto", "Yokohama", "Hollywood", "Detroit", "Madrid", "Lille", "Carigrad", "Sevilla", "Bologna", "Amsterdam", "Philadelphia", "Prag"], "subjects": ["Ivan I. 
Duka", "Donn Lewin", "Henry Siddons", "Walt Hansgen", "Odofredus Denari", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Grahame Clark", "Joshua Cristall", "Simonas Daukantas", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Andr\u00e9 Chamson", "Jim Chapin", "Florijan Matekalo", "Thomas Slingsby Duncombe", "Der Scutt", "Homer Hasenpflug Dubs", "David Merrick", "Jos\u00e9 Manuel Soares", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Frane Buli\u0107", "Marcel Oopa", "Pauline Mills McGibbon", "Vito Positano", "Gloria Grey", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Abdul Med\u017eid I.", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/hy.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["\u0539\u0580\u0565\u0576\u0569\u0578\u0576", "\u054d\u057f\u0578\u056f\u0570\u0578\u056c\u0574", "\u054c\u056b\u0563\u0561", "\u0555\u0576\u057f\u0561\u0580\u056b\u0578", "\u054f\u0561\u056f\u0578\u0574\u0561", "\u0539\u0578\u0582\u056c\u0578\u0582\u0566", "\u054f\u0578\u0580\u0578\u0576\u057f\u0578", "\u0555\u0564\u0565\u057d\u0561", "\u0532\u0578\u0582\u0564\u0561\u057a\u0565\u0577\u057f", "\u0535\u0563\u056b\u057a\u057f\u0578\u057d", "\u0544\u0578\u057d\u056f\u057e\u0561", "\u0532\u0578\u0582\u0564\u0561\u057a\u0565\u0577\u057f", "\u053f\u0561\u057d\u0561\u0562\u056c\u0561\u0576\u056f\u0561", "\u054a\u0580\u0561\u0570\u0561", "\u0540\u057c\u0578\u0574", "\u0531\u056c\u056a\u056b\u0580", "\u0546\u0587\u0561\u0564\u0561", "\u054e\u0561\u0576", "\u0553\u0561\u0580\u056b\u0566", "\u0540\u0561\u0580\u0569\u0586\u0578\u0580\u0564", "\u0540\u057c\u0578\u0574", "\u054e\u056b\u0580\u057b\u056b\u0576\u056b\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0544\u0565\u0575\u056f\u0578\u0576", "\u0555\u0584\u057d\u0586\u0578\u0580\u0564", "\u054e\u0561\u0580\u0577\u0561\u057e\u0561", "\u0555\u057d\u056c\u0578", "\u0535\u0580\u0587\u0561\u0576", "\u0555\u057d\u056c\u0578", "\u054f\u056b\u0580\u0561\u0576\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0535\u0580\u0587\u0561\u0576", "\u0540\u0578\u0582\u0576\u0563\u0561\u0580\u056b\u0561", "\u0539\u0562\u056b\u056c\u056b\u057d\u056b", "\u0539\u0562\u056b\u056c\u056b\u057d\u056b", "\u0537\u0564\u056b\u0576\u0562\u0578\u0582\u0580\u0563"], "subjects": ["\u0544\u0561\u0580\u057f\u056b\u0576 \u053f\u0578\u0576\u0578\u0580", "\u0533\u0580\u0565\u057f\u0561 \u053f\u0576\u0578\u0582\u057f\u057d\u0578\u0576", "\u053b\u0563\u0578\u0580 \u054e\u056b\u0570\u0580\u0578\u057e", "\u0548\u0582\u0578\u056c\u0569\u0565\u0580 \u0533\u0580\u0565\u0581\u056f\u056b", "\u0544\u0561\u0575\u0584\u056c \u0544\u0561\u0576\u0578\u0582\u0565\u056c", "\u054c\u0578\u056a\u0565 \u0532\u0580\u0575\u0578\u0582\u0576\u0565", "\u0539\u0578\u0576\u056b \u0544\u056b\u057f\u0579\u0565\u056c", "\u054d\u0561\u0574\u057e\u0565\u056c \u0533\u0575\u0578\u0566\u0561\u056c\u0575\u0561\u0576", "\u053c\u0561\u057d\u056c\u0578 \u0532\u056b\u057f\u0578", "\u0540\u0561\u056f\u0578\u0562 \u0540\u0561\u056f\u0578\u0562\u0575\u0561\u0576", "\u0531\u0576\u0561\u057f\u0578\u056c\u056b \u0531\u056c\u0565\u0584\u057d\u056b\u0576", "\u053f\u0561\u057f\u0561\u056c\u056b\u0576 \u053f\u0561\u0580\u0561\u0564\u056b", 
"\u053c\u0561\u0570\u057d\u0565\u0576 \u0531\u0562\u0580\u0561\u0574\u056b", "\u0545\u0561\u0576 \u0531\u0576\u057f\u0578\u0576\u056b\u0576 \u0534\u0578\u0582\u056d\u0578\u057d\u056c\u0561\u057e", "\u053c\u0578\u0582\u0581\u056b\u0578\u057d \u054e\u0565\u0580\u0578\u057d", "\u054a\u0578\u056c \u0532\u0565\u056c\u0574\u0578\u0576\u0564\u0578", "\u0553\u0565\u0569 \u0546\u056b\u0584\u057d\u0578\u0576", "\u0531\u0572\u0561\u057d\u056b \u053d\u0561\u0576\u057b\u0575\u0561\u0576", "\u053a\u0561\u056f \u0564'\u0531\u0563\u0561\u0580", "\u053c\u0578\u0582\u057d\u056b\u0576 \u0531\u0574\u0561\u0580\u0561", "\u0531\u0576\u0561\u057d\u057f\u0561\u057d I", "\u054b\u0578\u0576 \u0537\u057e\u0561\u0576\u057d", "\u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580 \u0534\u0565\u0563\u0578\u0586", "\u053c\u0578\u0578\u0582\u0580\u0565\u0576\u057d \u054d\u0569\u0561\u056c\u056b\u0576\u0563\u057d", "\u0540\u0565\u056c\u0565\u0576 \u0534\u0580\u0561\u0576\u0563\u0561", "\u0537\u0564\u0574\u0578\u0582\u0576\u0564 \u0556\u0565\u057f\u057f\u056b\u0576\u0563", "\u0537\u0580\u056b\u056f \u0534\u0561\u0574\u0561\u0576", "\u0546\u0565\u0580\u057d\u0565\u057d \u0535\u0580\u056b\u0581\u0575\u0561\u0576", "\u0545\u0578\u0582\u0576 \u0537\u056c\u057d\u0569\u0565\u0580", "\u0545\u0578\u0582\u0574\u0565\u0580 \u054a\u0561\u0574\u057a\u0578\u0582\u0580\u056b", "\u0537\u056c\u0565\u0576 \u053f\u0561\u0580\u0580\u0565\u0580 \u0534\u0561\u0576\u056f\u0578\u057d", "\u0531\u0580\u0574\u0565\u0576 \u0531\u0575\u057e\u0561\u0566\u0575\u0561\u0576", "\u054c\u0578\u0562\u0565\u0580\u057f \u0540\u0565\u0581\u0580\u0578\u0576", "\u053c\u0561\u057e\u0580\u0565\u0576\u057f\u056b \u0531\u0580\u0564\u0561\u0566\u056b\u0561\u0576\u056b", "\u0533\u0565\u0578\u0580\u0563\u056b \u053f\u0565\u057f\u0578\u0587", "\u054d\u0584\u0578\u0569 \u053f\u056c\u0587\u0565\u0580\u0564\u0578\u0576"]}, "place_of_death": {"objects": ["\u0544\u0578\u057d\u056f\u057e\u0561", "\u0544\u0561\u0576\u0570\u0565\u0569\u0565\u0576", "\u0553\u0561\u0580\u056b\u0566", "\u0554\u056b\u0576\u0563\u057d\u057f\u0578\u0576", "\u053c\u056b\u057f\u057e\u0561", "\u0556\u056b\u056c\u0561\u0564\u0565\u056c\u0586\u056b\u0561", "\u0553\u0561\u057d\u0561\u0564\u0565\u0576\u0561", "\u0532\u0578\u0582\u056d\u0561\u0580\u0565\u057d\u057f", "\u0534\u0565\u057f\u0580\u0578\u0575\u0569", "\u0553\u0561\u0580\u056b\u0566", "\u054a\u0565\u056f\u056b\u0576", "\u0535\u0580\u0587\u0561\u0576", "\u0544\u0578\u057d\u056f\u057e\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0556\u056c\u0578\u0580\u0565\u0576\u0581\u056b\u0561", "\u0545\u0578\u056f\u0578\u0570\u0561\u0574\u0561", "\u053f\u056b\u0578\u057f\u0578", "\u053f\u0578\u057d\u057f\u0561\u0576\u0564\u0576\u0578\u0582\u057a\u0578\u056c\u056b\u057d", "\u0546\u056b\u057d", "\u0544\u0561\u0564\u0580\u056b\u0564"], "subjects": ["\u054e\u0561\u0580\u057e\u0561\u057c\u0561 \u0544\u0561\u057d\u0561\u056c\u056b\u057f\u056b\u0576\u0578\u057e\u0561", "\u0531\u0564\u0565\u056c \u0544\u0561\u0580\u056f\u0578\u0582\u057d", "\u0531\u056c\u0586\u0580\u0565\u0564 \u054c\u0561\u0574\u0562\u0578", "\u0544\u0561\u0580\u056b\u0561\u0576\u0561 \u0533\u0580\u0561\u056d\u0561\u056c\u0565\u057d \u0544\u0561\u057d\u0565\u0578", "\u054d\u056b\u0574\u0578\u0576\u0561\u057d \u0534\u0561\u0578\u0582\u056f\u0561\u0576\u057f\u0561\u057d", "\u0533\u0578\u0580\u057b \u054e\u0578\u056c\u0565\u057d \u0544\u0565\u056c\u057e\u056b\u056c", "\u054b\u0578\u0576 \u054f\u0578\u0564\u0564", "\u054b\u0578\u0580\u057b\u0565 \u054b\u0578\u0580\u057b\u0565\u057d\u056f\u0578\u0582", 
"\u0533\u0578\u0582\u0580\u0563\u0565\u0576 \u0531\u056c\u0565\u0574\u0577\u0561\u0570", "Andr\u00e9 Jean", "\u0549\u0565\u0576 \u0545\u0561\u0576\u0581\u0575\u0578\u0582", "\u0539\u0578\u0580\u0578\u057d \u0539\u0578\u0580\u0561\u0574\u0561\u0576\u0575\u0561\u0576", "\u054d\u057f\u0565\u057a\u0561\u0576 \u0537\u0580\u0566\u056b\u0561", "\u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580 \u0533\u0578\u0582\u0579\u056f\u0578\u057e", "\u0556\u0580\u0565\u0576\u057d\u056b\u057d \u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580", "\u054a\u0578\u0566\u056b\u057f\u0561\u0576\u0578 \u054e\u056b\u057f\u0578", "\u0548\u0582\u0565\u0564\u0561 \u0531\u056f\u056b\u0576\u0561\u0580\u056b", "\u0531\u0562\u0564\u0578\u0582\u056c \u0544\u0565\u057b\u056b\u0564", "\u0533\u0565\u0578\u0580\u0563\u056b \u0531\u0564\u0561\u0574\u0578\u057e\u056b\u0579", "\u0532\u0565\u0580\u0576\u0561\u0580\u0564\u0578 \u053c\u0578\u057a\u0565\u0566 \u054a\u056b\u056f\u0565\u0580"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/id.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhagen", "Sydney", "Australia", "Burlington", "Amsterdam", "Stade", "Havana", "Oxford", "Iowa", "London", "Damaskus", "Wina", "Sydney", "Milan", "Heidelberg", "Montreal", "Toulouse", "Warsawa", "Leipzig", "Wellington", "Portland", "Napoli", "Roma", "London", "Napoli", "Chennai", "Melbourne", "Inggris", "Montreal", "Praha", "Firenze", "Kopenhagen", "Mobile", "Sydney", "London", "Yorkshire", "Inggris", "Praha", "Sydney", "Tbilisi", "Roma", "Sydney", "Siena", "Napoli", "Nevada", "Waterford", "Berlin", "Stuttgart", "Bangkok", "Paris", "Reading", "Paris", "Seattle", "Istanbul", "Vancouver", "Paris", "Canberra", "Melbourne", "Leiden", "Connecticut", "Limerick", "Paris", "Zagreb", "Preston", "Blackburn", "Guangzhou", "Manchester", "Inggris", "Firenze", "Nottingham", "Roma", "Tokyo", "Napoli", "Roma", "Westminster", "Split", "Cincinnati", "Macon", "Bukares", "California", "Paris", "Adelaide", "Amsterdam", "Como", "Brunei Darussalam", "Cardiff", "Istanbul", "Montreal", "Paris", "London", "Torino", "Palermo", "Wina", "Chicago", "Oslo", "Melbourne", "Adelaide", "Beijing", "Torino", "Melbourne", "Massachusetts", "Bordeaux", "Berlin", "Mumbai", "Garland", "Oxford", "Surrey", "Beograd", "Hamburg", "Oslo", "Rusia", "Victoria", "Holstein", "Baltimore", "Napoli", "Chicago", "Milan", "Italia", "Stuttgart", "India", "Casablanca", "Genova", "Inggris", "Leipzig", "Vancouver", "London", "Gent", "Tours", "Newport", "Hamilton, Selandia Baru", "Springfield", "Australia", "Milan", "Berlin", "Jakarta", "Berlin", "Prancis", "Ankara", "Kalimantan", "Cardiff", "Ontario", "Tallinn", "Milan", "Colorado", "Roma", "Queens", "Bukares", "Norwich", "Bretagne", "Amsterdam", "Manila", "Napoli", "Pittsburgh", "Kentucky", "Detroit"], "subjects": ["Eyolf Kleven", "Julia Wilson", "John Seru", "Paul Daniels", "Bernard de Wolff", "Peter Ording", "Yanitzia Canetti", "Thomas Godfrey Faussett", "Eric Ziebold", "James William Wallack", "Bachar Kouatly", "Norbert Balatsch", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Roger Brunet", "Stanis\u0142aw Urban", "Maja Tucholke", "Barry Mitcalfe", "Steve Sundholm", "Renato Caccioppoli", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "K. 
Bhaskaran", "Brett Hayman", "Colin Groves", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Filippo Soffici", "Victor Borge", "Darnell Kennedy", "Danielle McGrath", "David Parry", "Timothy Drever", "John Mundy", "Milan Orlowski", "Stephen Carr", "Zhores Medvedev", "Lucius Verus", "Elizabeth Kell", "Alessandro Frosini", "Carlo Silipo", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Sandrina Malakiano", "Marc Sangnier", "Denys Page", "Hugues Krafft", "Shyril O'Steen", "Do\u011fa Bekleriz", "Peter Dembicki", "Gabriel Bertrand", "Queenie van de Zandt", "Michael Guider", "Simon Binnendijk", "Charles H. Kraft", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Jimmy Brown", "George Kitching", "John Mundy", "Edward Locke", "Enrico Toselli", "Barry Howard", "Paus Anastasius I", "Takeshi Maeda", "Andrea Giani", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Kay Lahusen", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "Henri de Contenson", "Ben Nicholas", "Albertus Jonas Brandt", "Luca Princiotta", "Paula Malai Ali", "Herbert Bowden, Baron Aylestone", "Fuat G\u00fcner", "William Reed", "Eug\u00e8ne Brieux", "Thomas Taylor", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Tim McGill", "Erik Dammann", "Maggie Fitzgibbon", "Janet Ramsey Johnson", "Zhang Xueling", "Nicola Campogrande", "Nicholas Colla", "Sarah Stiles", "Jean Baptiste Rives", "Meike Evers", "Siddhant Karnick", "Nick Richmond", "Edward Stransham", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Jon Elster", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Myra Sklarew", "Giuseppe de Majo", "Raymond R. Schumacher", "Matteo Salvini", "Giancarlo Primo", "Bernhard R\u00fchling", "Brihaspati Dev Triguna", "Migidio Bourifa", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Marc Van Montagu", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Matthew Walker", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Pierre Rolland", "Otto Eugen Schulz", "J. B. Jackson", "Serhat", "Michael Matus", "Simon Bowman", "Elizabeth Hess", "Aarne Ruben", "Marisa Masullo", "Tom Maniatis", "Tommaso Marconi", "Stephen K. Benjamin", "Loredana Errore", "Warren Carlyle", "Lo\u00efc Jouannigot", "Gijs Vermeulen", "Elizabeth Cooper", "Diego Nargiso", "David Scott Milton", "George Jewett", "Dave Marsh"]}, "place_of_death": {"objects": ["Santiago de Chile", "Hawaii", "Edinburgh", "Lyon", "Belgia", "Colchester", "Cambridge", "Tbilisi", "London", "Yerusalem", "Manhattan", "Florida", "Inggris", "Australia", "Manhattan", "Amsterdam", "Roma", "Oxford", "Montreal", "Edinburgh", "Mumbai", "London", "Mumbai", "Leiden", "Lansing", "Liverpool", "Leipzig", "Toronto", "Karachi", "Karachi", "Yokohama", "Siena", "Atlanta", "Detroit", "Hollywood", "Kairo", "Dover", "Lille", "Scarborough, Yorkshire Utara", "Vancouver", "Konstantinopel", "Melbourne", "Sevilla", "Bologna", "Amsterdam", "Genova"], "subjects": ["Marta Canales", "Donn Lewin", "Henry Siddons", "Joseph Jean-Baptiste Xavier Fournet", "Margaret, Istri Adipati Brabant", "Hugh Iorys Hughes", "Grahame Clark", "Guram Sharadze", "Joshua Cristall", "Wolf Gold", "Arthur Siegel", "Jim Chapin", "Margo McLennan", "Bettina Welch", "Der Scutt", "Albertus Jonas Brandt", "Sebastiano Baggio", "Homer H. 
Dubs", "\u00c9douard Gagnon", "William Roxburgh", "Sultan Khan", "David Merrick", "Ali Sardar Jafri", "Tiberius Hemsterhuis", "Geraldine Doyle", "Jefferson Lowndes", "Paul Luther", "Pauline Mills McGibbon", "Abdur Rab Nishtar", "Khursheed Bano", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Andreas Dippel", "Lotfia El Nady", "Nathaniel Wraxall", "John Shortland", "Stephen Joseph", "Jan Hulsker", "Abd-ul-Mejid I", "Alwyn Kurts", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ka.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/la.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/lt.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhaga", "Miunchenas", "Sidn\u0117jus", "Berlingtonas", "Stad\u0117", "Ajova", "Londonas", "Madridas", "Roma", "Sidn\u0117jus", "Heidelbergas", "Monrealis", "Var\u0161uva", "Vokietija", "Pary\u017eius", "Vestfalija", "Leipcigas", "Sevilija", "Velingtonas", "Neapolis", "Bolonija", "At\u0117nai", "Londonas", "Neapolis", "Viena", "Bradfordas", "Melburnas", "Monrealis", "Praha", "Florencija", "Kopenhaga", "Praha", "Turinas", "Sidn\u0117jus", "Anglija", "Sidn\u0117jus", "Tbilisis", "Viena", "Roma", "Sidn\u0117jus", "Filadelfija", "Berlynas", "\u0160tutgartas", "Barselona", "Sevilija", "Pary\u017eius", "Roma", "Vankuveris", "Anglija", "Pary\u017eius", "Bakingam\u0161yras", "Pary\u017eius", "Melburnas", "Pary\u017eius", "Zagrebas", "Prestonas", "Pary\u017eius", "Man\u010desteris", "Melburnas", "Anglija", "Notingamas", "Zagrebas", "Albanija", "Praha", "Ryga", "Pary\u017eius", "Vestminsteris", "Splitas", "Splitas", "Maskva", "Bagdadas", "Bukare\u0161tas", "Madridas", "Lionas", "Pary\u017eius", "Talinas", "Vilnius", "Pranc\u016bzija", "Monrealis", "Londonas", "Viena", "Aleksandrija", "Oslas", "Ispanija", "Masa\u010dusetsas", "Devonas", "Berlynas", "\u010cikaga", "Toledas", "Sur\u0117jus", "Belgradas", "Hamburgas", "Rusija", "Viktorija", "Hol\u0161teinas", "Lionas", "Pary\u017eius", "Kopenhaga", "\u0160tutgartas", "Londonas", "Praha", "Berlynas", "Tampa", "Londonas", "Pary\u017eius", "Anglija", "Leipcigas", "Vankuveris", "Londonas", "Niuportas", "Australija", "Pary\u017eius", "Ankara", "Borneo", "Berlynas", "Ontarijas", "Talinas", "Norid\u017eas", "Talinas", "Amsterdamas", "Maskva", "Pitsbergas", "Komas", "\u0160tutgartas"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Paul Daniels", "Peter Ording", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Anna Maria Villani Scicolone", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Werner M\u00fcnch", "Maja Tucholke", "Francisco de Osuna", 
"Barry Mitcalfe", "Renato Caccioppoli", "Ottaviano Mascherino", "Argiris Pedoulakis", "Cliff Jones", "Warington Wilkinson Smyth", "Ludwig von Wohlgemuth", "Christfried Burmeister", "Brett Hayman", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "\u017doresas Medvedevas", "Julius Goldzier", "Lucijus Verus", "Elizabeth Kell", "Francis Davis", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Cipriano de Valera", "Marc Sangnier", "Stefano Nolfi", "Peter Dembicki", "Pieter de Molijn", "Gabriel Bertrand", "John Borlase", "Pierre Joxe", "Michael Guider", "Claude Piel", "Luka Grubor", "Helen Longworth", "Martin Malvy", "John Mundy", "Diana Trask", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Mordehajs Dubins", "Robert Lecou", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Levas Le\u0161\u010denko", "Jamal Jum\u00e1", "Drago\u0219 Neagu", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Petras Geniu\u0161as", "Michael Armstrong", "William Reed", "Nigel Preston", "Marion Stein", "Maurice Maunoury", "Erik Dammann", "Manola Saavedra", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "James Burnham", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Ernest Lafont", "Ren\u00e9 Mayer", "Ulla Pia", "Bernhard R\u00fchling", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Dave Steele", "Charles de Montalembert", "Ren\u00e9 Renoult", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Bernard Lloyd", "Claire Baxter", "\u017diulis Gotje", "Serhat", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Aarne Ruben", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Andrejus Babickis", "David Scott Milton", "Giambattista Nolli", "Kim Bauermeister"]}, "place_of_death": {"objects": ["Havajai", "Edinburgas", "Neapolis", "Kembrid\u017eas", "Milanas", "Florencija", "Lietuva", "Vilnius", "Manhatanas", "Roma", "Florida", "Manhatanas", "Oksfordas", "Maskva", "Florencija", "Londonas", "Praha", "Hamburgas", "Madridas", "Londonas", "Var\u0161uva", "Liverpulis", "Var\u0161uva", "Pary\u017eius", "Pary\u017eius", "Londonas", "Torontas", "Jokohama", "Madridas", "Lilis", "Skarboras", "Sevilija", "Bolonija", "Filadelfija", "Praha"], "subjects": ["Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Grehemas Klarkas", "Gino Penno", "Domenico Cresti", "Simonas Daukantas", "Karolis Pod\u010da\u0161inskis", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Der Scutt", "Homer Hasenpflug Dubs", "Stepanas Erzia", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Algirdas Klimaitis", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Ludwika J\u0119drzejewicz", "Marcel Pouvanaa Oopa", "Robert Lindet", "Tzvi Hirsch Ferber", "Pauline Mills McGibbon", "Vito Positano", "Enrique Sarasola", "John Shortland", "Stephen Joseph", "Felipe de Le\u00f3n", "Thomas Dempster", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/lv.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, 
"place_of_birth": {"objects": ["Kopenh\u0101gena", "Minhene", "R\u012bga", "Aiova", "Londona", "Heidelberga", "Zviedrija", "Neapole", "Londona", "Pr\u0101ga", "Par\u012bze", "Braz\u012blija", "Kopenh\u0101gena", "Anglija", "Filadelfija", "R\u012bga", "Nevada", "Berl\u012bne", "Par\u012bze", "Prestona", "Man\u010destra", "Anglija", "Notingema", "Roma", "R\u012bga", "Vestminstera", "R\u012bga", "Maskava", "Par\u012bze", "Makona", "Bukareste", "Kolumbusa", "Par\u012bze", "Sidneja", "V\u012bne", "Boldera", "Oslo", "R\u012bga", "Masa\u010d\u016bsetsa", "Belgrada", "Hamburga", "Krievija", "Viktorija", "Hol\u0161teina", "Tir\u0101na", "Tampa", "Anglija", "Londona", "\u0145\u016bporta", "Springf\u012blda", "Austr\u0101lija", "Frezno", "Ankara", "Kalimant\u0101na", "Ont\u0101rio", "Tallina", "Maskava", "Norid\u017ea", "Maskava", "Pitsburga", "\u0160tutgarte"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Igors Vihrovs", "Eric Ziebold", "James William Wallack", "Eug\u00e9nie S\u00f6derberg", "Leo \u0112rnr\u016bts", "Renato Caccioppoli", "Cliff Jones", "Ivo Luka\u010dovi\u010d", "Luijs God\u0113ns", "Ruta Kardoso", "Victor Borge", "John Mundy", "Francis Davis", "Aigars V\u012btols", "Peta Niksone", "Guy De Saint Cyr", "Gabriel Bertrand", "Helen Longworth", "John Mundy", "Edward Locke", "Barry Howard", "Anastasijs I", "Mordehajs Dubins", "Henry Bentley", "Juris Sokolovskis", "\u013bevs \u013be\u0161\u010denko", "Aleksandrs Degofs", "Laurence Stallings", "Drago\u0219 Neagu", "Sumalee Montano", "Henri de Contenson", "Tom Kazas", "Marion Stein", "P\u012bters Stetina", "Erik Dammann", "Bruno Rubess", "Sarah Stiles", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Imers Pampuri", "Dave Steele", "John Joseph Braham, Sr.", "Alisa Arnah", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Toraijs Bragss", "Serhats", "Michael Matus", "Elizabeth Hess", "Aarne Ruben", "Anna Ah\u0161arumova", "Warren Carlyle", "Andrejs Babickis", "David Scott Milton", "Kim Bauermeister"]}, "place_of_death": {"objects": ["Edinburga", "Lietuva", "Pasad\u012bna", "Manhetena", "Florida", "Manhetena", "Maskava", "Toronto", "Berl\u012bne", "Kaira", "Konstantinopole", "Bolo\u0146a", "Filadelfija"], "subjects": ["Henry Siddons", "Simons Daukants", "D\u017eons Tods", "Arthur Siegel", "Jim Chapin", "Der Scutt", "Stepans Erzja", "Pauline Mills McGibbon", "Krista Volfa", "Lotfia El Nadi", "Abdulmed\u017eids I", "Thomas Dempster", "William More Gabb"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ms.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/pl.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Boston", "Kopenhaga", "Belgia", "Boston", "Luksemburg", "Monachium", "Belgrad", "Sydney", "Tirana", "Budapeszt", "Londyn", "Warszawa", "Sztokholm", "Burlington", "Dallas", "Salzburg", "Ryga", "Berkeley", "Stade", "Serbia", "Wellington", "Warszawa", "Hawana", "Livingston", "Jamajka", "Iowa", "Warszawa", "Londyn", "Warszawa", "Londyn", "Madryt", "Zagrzeb", "Izrael", "Damaszek", "Kij\u00f3w", "Polska", 
"Wiede\u0144", "Sacramento", "Londyn", "Genua", "Sydney", "Warszawa", "Szkocja", "Moskwa", "Mediolan", "Heidelberg", "Budapeszt", "Pary\u017c", "Montreal", "Tuluza", "Mediolan", "Warszawa", "Niemcy", "Westfalia", "Carlisle", "Bow", "Lipsk", "Kensington", "Ukraina", "Milwaukee", "Lw\u00f3w", "Mediolan", "Sewilla", "Warren", "Wellington", "Budapeszt", "Neapol", "Sofia", "Lublin", "Bolonia", "Warszawa", "Rzym", "Londyn", "Neapol", "Alexandria", "Indianapolis", "Nebraska", "Manchester", "Warszawa", "Lublin", "Budapeszt", "Bradford", "Hawana", "Melbourne", "Christchurch", "Filadelfia", "Warszawa", "Anglia", "Ankara", "Rzym", "Shelby", "Praga", "Montreal", "Londyn", "Pary\u017c", "Budapeszt", "Grenada", "Praga", "Salford", "Birmingham", "Warszawa", "Nankin", "Barcelona", "Belgia", "Tulsa", "Louisville", "Tuluza", "Florencja", "Moskwa", "Coventry", "Kopenhaga", "Cumberland", "Praga", "Dublin", "Moskwa", "Budapeszt", "Turyn", "Sofia", "Sydney", "Londyn", "Anglia", "Wiede\u0144", "Praga", "Casablanca", "Salem", "Sydney", "Mediolan", "Bukareszt", "Austria", "Wenecja", "Sofia", "Tbilisi", "Rzym", "Sydney", "Berlin", "Filadelfia", "Cleveland", "Siena", "Baltimore", "Portugalia", "Tokio", "Boulder", "Split", "Neapol", "Frankfurt nad Menem", "Kopenhaga", "Chicago", "Gandawa", "Ateny", "Nevada", "Preston", "Warszawa", "Waterford", "Anglia", "Berlin", "Stuttgart", "Barcelona", "Pary\u017c", "Polska", "Pary\u017c", "Chicago", "Rzym", "Albany", "Austin", "Warszawa", "Szwecja", "Seattle", "Wan", "Toronto", "Vancouver", "Anglia", "Filadelfia", "Pary\u017c", "Buckinghamshire", "Pensylwania", "Pary\u017c", "Barcelona", "Melbourne", "Budapeszt", "Manhattan", "Kopenhaga", "Polska", "Bergen", "Limerick", "Niemcy", "Lima", "Pary\u017c", "Victoria", "Tokio", "Zagrzeb", "Preston", "Pary\u017c", "Algier", "Manchester", "Lw\u00f3w", "Ryga", "Seattle", "Pune", "Rotterdam", "Anglia", "Buffalo", "Florencja", "Nottingham", "Warszawa", "Middlesex", "Moskwa", "Zagrzeb", "Albania", "Praga", "Sofia", "Rzym", "Ipswich", "Rzym", "Stambu\u0142", "Ottawa", "Belgrad", "Portugalia", "Sztokholm", "Neapol", "Ryga", "Best", "Glasgow", "Pary\u017c", "Bristol", "Rzym", "Westminster", "Ryga", "Oslo", "Split", "Madryt", "Split", "Moskwa", "Genua", "Warszawa", "Budapeszt", "Birmingham", "Macon", "Brze\u015b\u0107", "Bukareszt", "Kalifornia", "Madryt", "Columbus", "Lyon", "Pary\u017c", "Neapol", "Budapeszt", "Antwerpia", "Kilkenny", "Pary\u017c", "Francja", "Warszawa", "Portsmouth", "Londyn", "Warszawa", "Watford", "City of Salford", "Everett", "Kij\u00f3w", "Brooklyn", "Sydney", "Dallas", "Wichita", "Exeter", "Como", "Warszawa", "Londyn", "Bari", "Czechy", "Cardiff", "Montreal", "Londyn", "Lublin", "Kaza\u0144", "Turyn", "Warszawa", "Palermo", "Sarajewo", "Wiede\u0144", "Aleksandria", "Florencja", "Boulder", "Oslo", "Fleet", "Kuba", "Polska", "Hiszpania", "Warszawa", "Sheffield", "Cumberland", "York", "Turyn", "Lw\u00f3w", "Kij\u00f3w", "Massachusetts", "Warszawa", "Devon", "Niemcy", "Arkansas", "Baltimore", "Luksemburg", "Berlin", "Chicago", "Oslo", "Toledo", "Oksford", "Surrey", "Anglia", "Belgrad", "Praga", "Hamburg", "Anglia", "Edmonton", "Oslo", "Sydney", "Moskwa", "Filadelfia", "Rosja", "Mediolan", "Wiktoria", "Berlin", "Holsztyn", "Cincinnati", "Oslo", "Neapol", "Wilno", "Tirana", "Edynburg", "Lyon", "Budapeszt", "Pary\u017c", "Mediolan", "Pary\u017c", "Londyn", "W\u0142ochy", "Kopenhaga", "Stuttgart", "Londyn", "Macon", "Dayton", "Bukareszt", "Praga", "Berlin", "Casablanca", "Wiede\u0144", "Tampa", "Lublin", "Bolonia", "Londyn", 
"Genua", "Pary\u017c", "Anglia", "Lipsk", "Croydon", "Belfast", "Berlin", "Vancouver", "Londyn", "Swindon", "Tours", "Newport", "Springfield", "Australia", "Mediolan", "Berlin", "Warszawa", "Phoenix", "Montreal", "Polska", "Lyon", "Lw\u00f3w", "Ankara", "Ankara", "Borneo", "Berlin", "Baltimore", "Ontario", "Madryt", "Moskwa", "Mediolan", "Montgomery", "Cambridge", "Pary\u017c", "Rzym", "Nassau", "Bukareszt", "Lipsk", "Pary\u017c", "Middlesex", "Budapeszt", "Albany", "Norwich", "Lyon", "Londyn", "Amsterdam", "Bratys\u0142awa", "Kopenhaga", "Toledo", "Szwecja", "Filadelfia", "Neapol", "Pittsburgh", "Lizbona", "Como", "Filadelfia", "Stuttgart", "Tbilisi", "Londyn", "Glasgow", "Edynburg", "Budapeszt", "Pary\u017c"], "subjects": ["Lucy Toulmin Smith", "Eyolf Kleven", "Marvano", "Douglas Fry", "Jean Hamilius", "Marcus Junkelmann", "Mihael Brejc", "Julia Wilson", "Ylli Bufi", "Ferenc Sipos", "Arthur Harold Stone", "Zenon Nowosz", "Greta Knutson", "Paul Daniels", "Tom Jones", "Eugen Enderlen", "Igors Vihrovs", "Patrick Daughters", "Peter Ording", "Branko Ra\u0161i\u0107", "Paula Tesoriero", "Witold Nazarewicz", "Yanitzia Canetti", "Ken Niles", "Bernard Wright", "Eric Ziebold", "Adam Buszko", "Joan Vincent Murray", "Gaba Kulka", "James William Wallack", "Fina de Calder\u00f3n", "Gordan Ko\u017eulj", "Avi Bortnick", "Bachar Kouatly", "Rusia", "Marcin Gawron", "Norbert Balatsch", "Michael Urbano", "Alain de Cadenet", "Giacomo Luigi Brignole", "Alastair Gordon", "Dariusz Lipi\u0144ski", "Michael Gallagher", "Siergiej Ord\u017conikidze", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "George Clifford Sziklai", "Marthe Chenal", "Wayne Eagling", "Roger Brunet", "Celestino Sfondrati", "Stanis\u0142aw Urban", "Aurel Codoban", "Werner M\u00fcnch", "Samuel Sterett", "Clive Brooks", "Maja Tucholke", "Arthur Kinnaird", "Cippora Laskow", "Terry Zahn", "Anna Kurska", "Paul Zuccarelli", "Francisco de Osuna", "Michael Shine", "Barry Mitcalfe", "Gy\u00f6rgy Br\u00f3dy", "Renato Caccioppoli", "Mire\u0142a Iwanowa", "J\u00f3zef Wieniawski", "Ottaviano Mascherino", "Leszek Korzeniowski", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "John Carlyle Herbert", "John Boling", "Edwin Hardin Sutherland", "Brian Callison", "Hanna O\u017cogowska", "Andrzej Ma\u0144ka", "L\u00e1szl\u00f3 Bit\u00f3", "Christfried Burmeister", "Aliuska L\u00f3pez", "Brett Hayman", "Gilbert de Clare", "Harrison Allen", "Romuald Giegiel", "Colin Groves", "Ahmet G\u00fclhan", "Mario Theodoli", "Nina Repeta", "Rudolf K\u0159es\u0165an", "David Atkinson", "Johnny Mowlem", "Patrick Lemari\u00e9", "Andrea M\u00e1tay", "Francisca Pleguezuelos", "Ivo Luka\u010dovi\u010d", "Gary Titley", "Brian Manning", "Micha\u0142 Tober", "Michael Anti", "Jacques Mehler", "Adam Gierek", "George Clark", "Alexander Pope Field", "Christine de Veyrac", "Filippo Soffici", "Jelena Bielakowa", "Martin Jacques", "Victor Borge", "Thomas Johns Perry", "Pavel \u017d\u00e1\u010dek", "Leslie Paul", "Anatolij Aleksin", "Katalin Kar\u00e1dy", "Felice Giordano", "Lubomir Iwanow", "Danielle McGrath", "David Parry", "John Mundy", "Leopold Alexander", "Milan Orlowski", "Lahcen Abrami", "James Henry Emerton", "Stephen Carr", "Guiniforte Solari", "Valeriu Stoica", "Lucas Auer", "Giovanni Francesco Commendone", "Miglena Markowa", "\u017bores Miedwiediew", "Lucjusz Werus", "Elizabeth Kell", "Leopold Casper", "Francis Davis", "Mark Buchanan", "Alessandro Frosini", "William Samuel Booze", "Jo\u00e3o de Souza Mendes", "Eugene Tzigane", "Alex Figge", "Tino Vegar", "Carlo 
Silipo", "Karl Chmielewski", "Niels Bjerrum", "No I.D.", "Johan Daisne", "Antigoni Goni", "Pat Nixon", "Robert Holden", "Pawe\u0142 Zalewski", "Michael Carney", "Donald Appleyard", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Charles Coll\u00e9", "Leon Dycian", "Marc Sangnier", "Leslie Allen", "Stefano Nolfi", "Stephen Levine", "Sahara Smith", "Aleksander \u017babczy\u0144ski", "Emma Ejwertz", "Shyril O'Steen", "Aghasi Chand\u017cian", "David Hackl", "Peter Dembicki", "Pieter de Molyn", "Jeff Chandler", "Gabriel Bertrand", "John Borlase", "Leonard Bosack", "Pierre Joxe", "Salvador Cristau Coll", "Michael Guider", "Csaba \u0150ry", "Dennis Davis", "Thomas Dausgaard", "Artur Zawisza", "Kjersti Elvik", "Sam Lynch", "Ornella Oettl Reyes", "Gunnar Samuelsson", "Claude Piel", "Roland Green", "Masataka Yanagida", "Luka Grubor", "Helen Longworth", "Martin Malvy", "Maurice Va\u00efsse", "John Mundy", "Tadeusz Browicz", "Nico Gardener", "Graham Ackerman", "Abhijit Kunte", "Mark Koevermans", "Edward Locke", "David Marusek", "Enrico Toselli", "Barry Howard", "Binem Heller", "Nathaniel Culverwel", "Jakow Murej", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "W\u0142adimir Georgiew", "Anastazy I", "Jamie Moses", "Antonio Tosti", "Aleksander Hangerli", "Alejandro Abellan", "Andrea Leki\u0107", "Francisco Roxo", "Tommy Waidelich", "Andrea Giani", "Mordehajs Dubins", "Eric Swinkels", "Ian Steel", "Robert Lecou", "William Child", "Augusto De Marsanich", "Henry Bentley", "Juris Sokolovskis", "Tommy Rustad", "Tomislav Smoljanovi\u0107", "Alonso del Arco", "Petar \u010culi\u0107", "Lew Leszczenko", "Carlo Fatuzzo", "Stanis\u0142awa Nowicka", "K\u00e1roly Varga", "Bo Weavil Jackson", "Laurence Stallings", "Natalla Hielach", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Carlo Emery", "Zolt\u00e1n K\u00f3sz", "Hendrick Andriessen", "Eileen O\u2019Keeffe", "Andr\u00e9 L\u00e9ri", "Michael Armstrong", "Jan Szyszko", "John Randall Reding", "Bertie Felstead", "Kazimierz Flatau", "Alan MacDonald", "Michelle Rogers", "Benjamin Castleman", "Wiktor Krasin", "Joe Ascione", "Tom Kazas", "Clinton D. McKinnon", "Roger Mears", "Chris Welsby", "Luca Princiotta", "Karolina Kosi\u0144ska", "Edward Alfred Cowper", "Marcello Vernola", "Jan Beer", "Herbert Bowden", "William Reed", "Nigel Preston", "Marek Muszy\u0144ski", "Rem Urasin", "Alessio Secco", "Edmund Fetting", "Francesco Musotto", "Kemal Alispahi\u0107", "Marion Stein", "Maurice Maunoury", "Valdo Spini", "Peter Stetina", "Erik Dammann", "Gordon Coppuck", "Juan Carlos Gonz\u00e1lez Zamora", "Ma\u0142gorzata Piotrowska", "Manola Saavedra", "Andrzej Kunert", "Trevor Taylor", "George Alexander Pearre", "Adam Kowalczyk", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Abraham Mintchine", "Sarah Stiles", "Piotr Buciarski", "Neil Doncaster", "Imanu\u2019el Szefer", "Keena Rothhammer", "John Ambrose Meyer", "Claude Wiseler", "Meike Evers", "James Burnham", "Johannes Falkenberg", "Francisco Cervantes de Salazar", "Edward Stransham", "Rob Heanley", "Barry Palmer", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Janek Schaefer", "Chris Woodhead", "Jon Elster", "Joan Hartigan", "Galina Fokina", "Freddy Winnai", "Rosabelle Sinclair", "Luca Bottale", "Murray Hocking", "Max Lehmann", "Louis Gurlitt", "Tom Luken", "Erik Willoch", "Giuseppe de Majo", "Jacek Sauk", "Ymer Pampuri", "Gillian Cooke", "Ernest Lafont", "Ferenc A. 
V\u00e1li", "H\u00e9l\u00e8ne Carr\u00e8re d\u2019Encausse", "Matteo Salvini", "Ren\u00e9 Mayer", "Alice Temple", "Giancarlo Primo", "Ulla Pia", "Bernhard R\u00fchling", "William Main Page", "Lisa Sheridan", "Len Zengel", "Nicolae Herlea", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Georg Hellmesberger Jr.", "Dave Steele", "Alina Gut", "Max Angelelli", "Charles de Montalembert", "Monica Esposito", "Ren\u00e9 Renoult", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Anne Clark", "John Garland", "Awner W. Less", "Heather Davis", "Alisa Arnah", "Crowther Charlesworth", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Miros\u0142aw Maliszewski", "Jerry Pettis", "Ira Vail", "Jerzy Adamuszek", "Martine Roure", "Jerzy Lerski", "Serhat", "Ayta\u00e7 Biter", "Michael Matus", "Erich Werdermann", "John Lewis Thomas Jr.", "Elizabeth Hess", "Emilio Men\u00e9ndez del Valle", "Anna Achszarumowa", "Marisa Masullo", "Bill Endicott", "Aleksander Whitaker", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Moritz Wilhelm Drobisch", "Elsa Lunghini", "Geoffrey Alderman", "B\u00e9la Glattfelder", "Jane Stanford", "Warren Carlyle", "Didier Andr\u00e9", "Keith Wiggins", "Gijs Vermeulen", "Andrej \u0160eban", "Ludwig Drescher", "Cliff Bergere", "Kajsa Kling", "Bernie Lowe", "Diego Nargiso", "David Scott Milton", "Rui Tavares", "Giambattista Nolli", "Spencer Wishart", "Kim Bauermeister", "Gieorgij Kietojew", "Adrian Bowyer", "John Cameron", "Scott Cleverdon", "Imre Zach\u00e1r", "Marcel Bertrand"]}, "place_of_death": {"objects": ["Moskwa", "Sztokholm", "Konstantynopol", "Hawaje", "Vancouver", "Edynburg", "Francja", "Marquette", "Neapol", "Richmond", "Lyon", "Pary\u017c", "Warszawa", "Florencja", "Bolonia", "Londyn", "Warszawa", "Kolumbia", "Bejrut", "Cambridge", "Cambridge", "Turcja", "Londyn", "Caldwell", "Kalifornia", "Exeter", "Wellington", "Lyon", "Litwa", "Milton", "Bukareszt", "Wilno", "Manhattan", "Utrecht", "Rzym", "Floryda", "Tunezja", "Japonia", "Sparta", "Neapol", "Kalifornia", "Sussex", "Cambridge", "Pary\u017c", "Manhattan", "Cambridge", "Rzym", "Berlin", "Moskwa", "Helsinki", "Pary\u017c", "Oksford", "Moskwa", "Montreal", "Chicago", "Florencja", "Massachusetts", "Londyn", "Praga", "Londyn", "Pary\u017c", "Norwich", "Madryt", "Londyn", "Warszawa", "Londyn", "Lejda", "Pary\u017c", "Portugalia", "Cumberland", "Liverpool", "Sewilla", "Warszawa", "Pary\u017c", "Lipsk", "Pary\u017c", "Baltimore", "Florencja", "Warszawa", "Toronto", "Wiede\u0144", "Jokohama", "Baltimore", "Siena", "Manhattan", "Kioto", "Atlanta", "Berlin", "Edynburg", "Detroit", "Pary\u017c", "Madryt", "Warszawa", "Kair", "Berkeley", "Dover", "Lille", "Scarborough", "Vancouver", "Boston", "Konstantynopol", "Melbourne", "Sewilla", "Manila", "Nicea", "Szanghaj", "Moskwa", "Mediolan", "Dublin", "Londyn", "Bolonia", "Amsterdam", "Genua", "Filadelfia", "Aleppo", "Praga"], "subjects": ["Varvara Massalitinova", "Johan Gustaf Sandberg", "Jan I Angelos", "Donn Lewin", "Bent Peder Rasch", "Henry Siddons", "Walt Hansgen", "Frederic Baraga", "Raimondo Guarini", "Alexander William Doniphan", "Joseph Jean-Baptiste Xavier Fournet", "Alfred Nicolas Rambaud", "Adolf Dygasi\u0144ski", "Giovanni Durando", "Carlo Emery", "Pearl Richards Craigie", "Antoni Adam Piotrowski", "Rafael Pombo", "Lidija Lipkowska", "Daniel Maynadier Henry", "John Grahame Douglas Clark", "Ola Hansson", "Joshua Cristall", "Sarah Morgan Bryan Piatt", "Charles 
Constantine", "John Flavel", "Ernest Beaglehole", "Teresa Couderc", "Simonas Daukantas", "Adeline Dutton Train Whitney", "George Georgescu", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Adriaan Reland", "Giovanni Battista Caccini", "Jim Chapin", "Georges Madon", "Masanobu Okumura", "Jezus Jazon", "Giovanni Pontano", "Ethel Catherwood", "Thomas Slingsby Duncombe", "James Augustus Stewart", "Robert Desoille", "Der Scutt", "Ihor \u0160ev\u010denko", "Sebastiano Baggio", "Moritz Land\u00e9", "Niko\u0142aj Milutin", "Olavi Paavolainen", "Moshe Lewin", "Homer Hasenpflug Dubs", "Stiepan Erzia", "Edouard Gagnon", "Battling Nelson", "Philipp von Stosch", "Ferenc A. V\u00e1li", "David Merrick", "V\u00e1clav Havel", "Lucy Faithfull, Baroness Faithfull", "Aleksandr Guczkow", "Eugene Mallove", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Romola de Pulszky", "Tiberius Hemsterhuis", "Gabriel Nicolas de La Reynie", "David Croft", "George Alexander Pearre", "Jefferson Lowndes", "Jos\u00e9 Mo\u00f1ino y Redondo", "Ludwika Chopin", "Marcel Pouvanaa Oopa", "Paul Luter", "Robert Lindet", "Frank Charles Wachter", "Francis Alexander", "Stanis\u0142aw Grzesiuk", "Pauline Mills McGibbon", "Alfred von Henikstein", "Vito Positano", "William Purington Cole Jr.", "Francesco Vanni", "Enrico Donati", "Akinari Ueda", "Cesare Siepi", "Christa Wolf", "John Gillies", "Orestes Brownson", "W\u0142adys\u0142aw \u017bele\u0144ski", "Enrique Sarasola", "Wincenty Krasi\u0144ski", "Lotfia El Nadi", "Egon Petri", "Nathaniel William Wraxall", "John Shortland", "Stephen Joseph", "Jan Hulsker", "Robert Hazard", "Abd\u00fclmecid I", "Alwyn Kurts", "Felipe de Le\u00f3n", "Joaquina Maria Mercedes Barcelo Pages", "Gieorgij Adamowicz", "Liang Shiyi", "Niko\u0142aj Strunnikow", "Maria Pierina de Micheli", "Robert MacBryde", "Benjamin Dale", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Ibrahim Hananu", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ro.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Copenhaga", "Belgia", "Bucure\u0219ti", "Luxemburg", "M\u00fcnchen", "Belgrad", "Sydney", "Tirana", "Burlington", "Dallas", "Budapesta", "Stade", "Havana", "Hamburg", "Oxford", "Suedia", "Iowa", "Londra", "Madrid", "Viena", "Roma", "Sydney", "Milano", "Heidelberg", "Montr\u00e9al", "Toulouse", "Portsmouth", "Germania", "Cheshire", "Helsinki", "Bow", "Leipzig", "Kensington", "Ungaria", "Sevilla", "Wellington", "Portland", "Napoli", "Bologna", "Roma", "Londra", "Napoli", "Viena", "Bradford", "Melbourne", "Germania", "Anglia", "Montr\u00e9al", "Granada", "Praga", "Salford", "Strand", "Belgia", "Toulouse", "Anglia", "Floren\u021ba", "Copenhaga", "Praga", "Budapesta", "Torino", "Sydney", "Londra", "Yorkshire", "Anglia", "Sydney", "Bucure\u0219ti", "Austria", "Tbilisi", "Roma", "Sydney", "Madison", "Philadelphia", "Siena", "Alger", "Napoli", "Chicago", "Gent", "Bucure\u0219ti", "Nevada", "Berlin", "Stuttgart", "Barcelona", "Rom\u00e2nia", "Paris", "Roma", "Seattle", "Vancouver", "Paris", "Buckinghamshire", "Melbourne", "Budapesta", "Manhattan", "Paris", "Polonia", "Limerick", "Paris", "Zagreb", "Preston", "Alger", "Manchester", "Anglia", "Buffalo", "Nottingham", "Londra", "Columbus", "Zagreb", "Albania", "Praga", "Roma", "Paris", "Istanbul", "Belgrad", "Napoli", "Riga", 
"Leipzig", "Roma", "Cipru", "Westminster", "Split", "Split", "Moscova", "Genova", "Macon", "Bucure\u0219ti", "California", "Madrid", "Dublin", "Paris", "Tallinn", "Vilnius", "Como", "Rom\u00e2nia", "Bari", "Montr\u00e9al", "Londra", "Torino", "Palermo", "Brooklyn", "Viena", "Bucure\u0219ti", "Oslo", "Cuba", "Spania", "Berlin", "Torino", "Liov", "Massachusetts", "Devon", "Germania", "Paris", "Berlin", "Toledo", "Oxford", "Surrey", "Belgrad", "Hamburg", "Rusia", "Victoria", "Holstein", "Bucure\u0219ti", "Napoli", "Tirana", "Paris", "Milano", "Amsterdam", "Italia", "Bucure\u0219ti", "Stuttgart", "Londra", "Bucure\u0219ti", "Praga", "Berlin", "Casablanca", "Tampa", "Genova", "Anglia", "Leipzig", "Amsterdam", "Milano", "Vancouver", "Londra", "Tours", "Newport", "Berlin", "Australia", "Milano", "Berlin", "Londra", "Ankara", "Borneo", "Berlin", "Ontario", "Tallinn", "Madrid", "Milano", "Roma", "Bucure\u0219ti", "Leipzig", "Hamilton", "Budapesta", "Bournemouth", "Norwich", "Tallinn", "Amsterdam", "Philadelphia", "Napoli", "Pittsburgh", "Como", "Detroit", "Bucure\u0219ti", "Dublin", "Auckland"], "subjects": ["Eyolf Kleven", "Marvano", "Eugenia Popa", "Jean Hamilius", "Marcus Junkelmann", "Mihael Brejc", "Julia Wilson", "Ylli Bufi", "Paul Daniels", "Tom Jones", "Ilus Vay", "Peter Ording", "Yanitzia Canetti", "Caroline Beil", "Thomas Godfrey Faussett", "Leonard Gyllenhaal", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Norbert Balatsch", "Anna Maria Villani Scicolone", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Roger Brunet", "Andrew O'Neill", "Aurel Codoban", "Susan Bullock", "Taavi Vartia", "Clive Brooks", "Maja Tucholke", "Arthur Kinnaird", "Laszlo Gardony", "Francisco de Osuna", "Barry Mitcalfe", "Steve Sundholm", "Renato Caccioppoli", "Ottaviano Mascherino", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Ludwig von Wohlgemuth", "Christfried Burmeister", "Brett Hayman", "Walter Hilgers", "Colin Groves", "David Atkinson", "Francisca Pleguezuelos", "Ivo Luka\u010dovi\u010d", "Gary Titley", "Hallgeir Langeland", "Adam Gierek", "Christine de Veyrac", "Ian Hancock", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Katalin Kar\u00e1dy", "Felice Giordano", "Danielle McGrath", "David Parry", "Timothy Drever", "John Mundy", "Stephen Carr", "Valeriu Stoica", "Lucas Auer", "Zhores Medvedev", "Lucius Verus", "Elizabeth Kell", "Bunita Marcus", "Francis Davis", "Alessandro Frosini", "Paul Belmondo", "Carlo Silipo", "No I.D.", "Johan Daisne", "Mircea Florian", "Pat Nixon", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Romeo Niram", "Marc Sangnier", "Stefano Nolfi", "Shyril O'Steen", "Peter Dembicki", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Csaba \u0150ry", "Dennis Davis", "Jacques d'Agar", "Artur Zawisza", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Maurice Va\u00efsse", "John Mundy", "Edward Locke", "David Marusek", "Barry Howard", "Olivia Poulet", "Michael Shank", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Papa Anastasie I", "Adolphe Cohn", "Alexandru Hangerli", "Andrea Leki\u0107", "Andrea Giani", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Augusto De Marsanich", "Chad Hartigan", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Lev Le\u0219cenco", "Carlo Fatuzzo", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Catherine Wellesley", "Henri de Contenson", "Martin Jervan", 
"Petras Geniu\u0161as", "Luca Princiotta", "Ephraim Hertzano", "Marcello Vernola", "William Reed", "Nigel Preston", "Alessio Secco", "Francesco Musotto", "Ralph Schoenman", "Marion Stein", "Vlad Georgescu", "Erik Dammann", "Juan Carlos Gonz\u00e1lez Zamora", "Manola Saavedra", "Ralf Wadephul", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Emmanuel Scheffer", "Charles Nicolas Aub\u00e9", "Meike Evers", "Francisco Cervantes de Salazar", "Edward Stransham", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Alma Redlinger", "Giuseppe de Majo", "Ymer Pampuri", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Matteo Salvini", "Theodor Holman", "Giancarlo Primo", "Christian Wilhelm Berger", "Bernhard R\u00fchling", "William Main Page", "Nicolae Herlea", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Nyncke Beekhuyzen", "Abbondio Sangiorgio", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Albert Heinrich Brendel", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Reginald Brett, viconte de Esher", "Serhat", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Aarne Ruben", "Emilio Men\u00e9ndez", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Moritz Wilhelm Drobisch", "Linda Crockett", "B\u00e9la Glattfelder", "Craig Richards", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Bernie Lowe", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli", "Dave Marsh", "Niculae Conovici", "John O'Conor", "Cherry Wilder"]}, "place_of_death": {"objects": ["Moscova", "Santiago de Chile", "M\u00fcnchen", "Hawaii", "Edinburgh", "Napoli", "Londra", "Beirut", "Cambridge", "Turcia", "Londra", "Bucure\u0219ti", "Vilnius", "Manhattan", "Cheltenham", "Paris", "Roma", "Florida", "Sussex", "Paris", "Manhattan", "Oxford", "Moscova", "Lisabona", "Floren\u021ba", "Londra", "Praga", "Vancouver", "Paris", "Madrid", "Londra", "Londra", "Paris", "Roma", "Toronto", "Liverpool", "Zagreb", "Manhattan", "Leipzig", "Toronto", "Bucure\u0219ti", "Yokohama", "Siena", "Atlanta", "Detroit", "Madrid", "Lille", "Scarborough", "Boston", "Constantinopol", "Sevilla", "Nisa", "Bologna", "Amsterdam", "Genova", "Philadelphia", "Praga"], "subjects": ["Varvara Massalitinova", "Marta Canales", "Tobias Andreae", "Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Craigie, Pearl Mary Teresa Richards,", "Lidia Lipkovskaia", "Grahame Clark", "Ola Hansson", "Joshua Cristall", "George Georgescu", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Charles Barton", "Andr\u00e9 Chamson", "Giovanni Battista Caccini", "Jim Chapin", "Thomas Slingsby Duncombe", "Robert Desoille", "Der Scutt", "Homer Hasenpflug Dubs", "Stepan Erzia", "Murilo Mendes", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Wells Wintemute Coates", "Aleksandr Gucikov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Romola de Pulszky", "Gabriel Nicolas de la Reynie", "Johann Konrad Dorner", "Margaret Marshall Saunders", "Jefferson Lowndes", "Frane Buli\u0107", "Lois Gould", "Paul Luther", "Pauline Mills McGibbon", "Paul Everac", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Stephen Joseph", "Robert Hazard", "Abdul-Medjid", "Felipe de Le\u00f3n", "Gheorghi Adamovici", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William 
More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/sk.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["R\u00edm", "Koda\u0148", "Praha", "Mn\u00edchov", "Sydney", "Par\u00ed\u017e", "Stade", "Havana", "Budape\u0161\u0165", "Iowa", "Lond\u00fdn", "Madrid", "Izrael", "Sydney", "Mil\u00e1no", "Heidelberg", "Montreal", "Ben\u00e1tky", "Var\u0161ava", "Nemecko", "Par\u00ed\u017e", "Lipsko", "Kensington", "Dayton", "Sevilla", "Wellington", "Neapol", "R\u00edm", "Lond\u00fdn", "Neapol", "Bradford", "Melbourne", "Anglicko", "Praha", "Montreal", "Praha", "Jamajka", "Florencia", "Koda\u0148", "Anglicko", "Praha", "Tur\u00edn", "Sydney", "Anglicko", "Sydney", "Praha", "G\u00f6teborg", "R\u00edm", "Sydney", "Philadelphia", "Siena", "Neapol", "Nevada", "Waterford", "Berl\u00edn", "Barcelona", "Par\u00ed\u017e", "R\u00edm", "Wilmington", "Seattle", "Vancouver", "Anglicko", "Par\u00ed\u017e", "Buckinghamshire", "Melbourne", "Limerick", "Par\u00ed\u017e", "Z\u00e1hreb", "Al\u017e\u00edr", "Manchester", "Par\u00ed\u017e", "Rotterdam", "Anglicko", "Nottingham", "Z\u00e1hreb", "Alb\u00e1nsko", "Praha", "Praha", "R\u00edm", "Neapol", "Riga", "R\u00edm", "Westminster", "Split", "Split", "Moskva", "Bukure\u0161\u0165", "Kalifornia", "Madrid", "Lyon", "Par\u00ed\u017e", "Franc\u00fazsko", "Como", "Montreal", "Lond\u00fdn", "Kalifornia", "Tur\u00edn", "Palermo", "Viede\u0148", "Oslo", "\u0160panielsko", "J\u00e1va", "Rochester", "Edinburgh", "Tur\u00edn", "\u013dvov", "Ben\u00e1tky", "Massachusetts", "Devon", "Par\u00ed\u017e", "Berl\u00edn", "Toledo", "Surrey", "Belehrad", "Praha", "Hamburg", "Slovensko", "Rusko", "Vikt\u00f3ria", "Neapol", "Mil\u00e1no", "Par\u00ed\u017e", "Taliansko", "Lond\u00fdn", "Praha", "Berl\u00edn", "Casablanca", "Tampa", "Janov", "Anglicko", "Lipsko", "Vancouver", "Lond\u00fdn", "Tours", "Newport", "Austr\u00e1lia", "Mil\u00e1no", "Berl\u00edn", "Afrika", "Borneo", "Berl\u00edn", "Ont\u00e1rio", "Philadelphia", "Mil\u00e1no", "Par\u00ed\u017e", "R\u00edm", "Nassau", "Bukure\u0161\u0165", "Norwich", "Amsterdam", "Bratislava", "Neapol", "Pittsburgh", "Como"], "subjects": ["Eduard Ender", "Eyolf Kleven", "Alois Wachsman", "Marcus Junkelmann", "Julia Wilson", "Renaud Gagneux", "Peter Ording", "Yanitzia Canetti", "\u00c1rp\u00e1d So\u00f3s", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Avi Bortnick", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Rosina Storchio", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Maja Tucholke", "Arthur Kinnaird", "Kelley Deal", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Colin Groves", "Rudolf K\u0159es\u0165an", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Filippo Soffici", "Victor Borge", "Stephen Fox", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "Jan Anton\u00edn Duchoslav", "Peter Nyborg", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Carlo Silipo", "Pat Nixonov\u00e1", "Michael Carney", "Guy De Saint Cyr", "Felipe Alfau", "Marc Sangnier", "Stefano Nolfi", "Collins J. 
Seitz", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molyn", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Sam Lynch", "Claude Piel", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Mark Koevermans", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "Martin Kratochv\u00edl", "Joseph Wilhelm Swoboda", "Anast\u00e1z I.", "Andrea Giani", "Mordehajs Dubins", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Lev Le\u0161\u010denko", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "Luca Princiotta", "William Reed", "Nigel Preston", "Jimmy Greenspoon", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Ien Angov\u00e1", "Diane Greene", "Marcus Dods", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Giulio Carpioni", "Sarah Stiles", "Neil Doncaster", "Charles Nicolas Aub\u00e9", "Meike Evers", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Pavol Polakovi\u010d", "Rosabelle Sinclair", "Murray Hocking", "Giuseppe de Majo", "Matteo Salvini", "Ren\u00e9 Mayer", "Giancarlo Primo", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Publius Annius Florus", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Leon Bass", "Marisa Masullo", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Warren Carlyle", "Gijs Vermeulen", "Andrej \u0160eban", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli"]}, "place_of_death": {"objects": ["\u0160tokholm", "Edinburgh", "Neapol", "Lyon", "Viede\u0148", "Cambridge", "Lond\u00fdn", "Jeruzalem", "Vilnius", "Manhattan", "R\u00edm", "Florida", "Gent", "Sussex", "Manhattan", "Jerevan", "Oxford", "Florencia", "Lond\u00fdn", "Praha", "Madrid", "Lond\u00fdn", "Var\u0161ava", "Liverpool", "Lipsko", "Toronto", "Jokohama", "Siena", "Atlanta", "Detroit", "Madrid", "K\u00e1hira", "Lille", "Kon\u0161tant\u00ednopol", "Sevilla", "Nice", "Amsterdam", "Janov", "Philadelphia", "Praha"], "subjects": ["Johann Gustaf Sandberg", "Henry Siddons", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Johan Stephan Decker", "Grahame Clark", "Joshua Cristall", "Wolf Gold", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Robert van Audenaerd", "Thomas Slingsby Duncombe", "Der Scutt", "Toros Toramanjan", "Homer Hasenpflug Dubs", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Paul Luther", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Enrique Sarasola", "Lotfia ElNadi", "John Shortland", "Abd\u00fclmecid I.", "Felipe de Le\u00f3n", "Georgij Viktorovi\u010d Adamovi\u010d", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ta.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": 
[]}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/tr.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Boston", "Kopenhag", "M\u00fcnih", "T\u00fcrkiye", "Tiran", "Budape\u015fte", "Burlington", "\u0130stanbul", "Stade", "Iowa", "Londra", "Madrid", "Kiev", "Milano", "Heidelberg", "Montreal", "Oslo", "Var\u015fova", "Edinburgh", "Almanya", "Konstantiniyye", "\u0130stanbul", "Leipzig", "Kensington", "T\u00fcrkiye", "Sevilla", "Wellington", "Napoli", "Belfast", "Atina", "Londra", "Napoli", "Bradford", "Melbourne", "Yunanistan", "\u0130ngiltere", "Ankara", "Shelby", "Montreal", "Prag", "Durham", "T\u00fcrkiye", "Portland", "Nankin", "Paris", "Floransa", "Coventry", "Kopenhag", "Prag", "\u0130stanbul", "Boston", "Budape\u015fte", "Londra", "Londra", "Yorkshire", "\u0130ngiltere", "Tiflis", "Roma", "Philadelphia", "Arnavutluk", "\u015eikago", "Nevada", "Waterford", "Berlin", "Stuttgart", "Barselona", "T\u00fcrkiye", "Kahire", "Seattle", "\u0130stanbul", "Vancouver", "Paris", "Pomeranya", "Buckinghamshire", "Paris", "Melbourne", "\u0130svi\u00e7re", "Manhattan", "\u0130talya", "Limerick", "Paris", "Zagreb", "Preston", "Paris", "Manchester", "\u0130ngiltere", "Nottingham", "Zagreb", "Arnavutluk", "Almanya", "Prag", "Brandon", "Roma", "\u0130stanbul", "Riga", "Westminster", "Split", "Charlotte", "\u0130stanbul", "Split", "\u0130sve\u00e7", "\u0130sve\u00e7", "Birmingham", "Cincinnati", "Ba\u011fdat", "Macon", "B\u00fckre\u015f", "Madrid", "Suriye", "Columbus", "Lyon", "Paris", "\u0130ngiltere", "Fransa", "Brooklyn", "\u0130svi\u00e7re", "\u0130stanbul", "Montreal", "Kaliforniya", "Viyana", "Oslo", "\u0130spanya", "Berlin", "Lviv", "Venedik", "Leeds", "Massachusetts", "Devon", "Berlin", "\u0130stanbul", "Toledo", "Surrey", "Belgrad", "Hamburg", "Oslo", "Boston", "Brooklyn", "Rusya", "Victoria", "Holstein", "Tiran", "M\u00fcnih", "Glasgow", "Paris", "Kopenhag", "Stuttgart", "Macon", "Prag", "Berlin", "Tampa", "Ankara", "Bangkok", "\u0130ngiltere", "Leipzig", "Amsterdam", "Budape\u015fte", "\u0130stanbul", "Vancouver", "Londra", "Newport", "\u0130stanbul", "Springfield", "Avustralya", "Paris", "\u015eikago", "Ankara", "Ankara", "Borneo", "Berlin", "Ontario", "Paris", "Norwich", "Amsterdam", "Philadelphia", "Pittsburgh", "Stuttgart", "Paris", "Atina"], "subjects": ["Lucy Toulmin Smith", "Eyolf Kleven", "Marcus Junkelmann", "Cenk Renda", "Ylli Bufi", "Ferenc Sipos", "Paul Daniels", "Azra Erhat", "Peter Ording", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Rusya", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Jan Jakob T\u00f8nseth", "Stanis\u0142aw Urban", "Robert Hamilton Paterson", "Aurel Codoban", "\u0130ngiliz Kemal", "Necmi S\u00f6nmez", "Maja Tucholke", "Arthur Kinnaird", "Serdar Apayd\u0131n", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Hamilton Sloan", "Argiris Pedulakis", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Nektaria Karantzi", "Colin Groves", "Ahmet G\u00fclhan", "Nina Repeta", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Neil Fingleton", "Murat Evliyao\u011flu", "Jesse A. 
Hamilton", "Michael Anti", "Herv\u00e9 Alphand", "Filippo Soffici", "Martin Jacques", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Tayyar Yalaz", "Francis J. Ricciardone, Jr.", "Katalin Kar\u00e1dy", "Coral Amiga", "David Parry", "Tim Robinson", "John Mundy", "Jores Medvedev", "Lucius Verus", "Francis Davis", "Hasna Xhuki\u00e7i", "No I.D.", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Tolga Tekinalp", "Richard Anthony", "Shyril O'Steen", "Do\u011fa Bekleriz", "Peter Dembicki", "Gabriel Bertrand", "Martin Kosleck", "John Borlase", "Pierre Joxe", "Michael Guider", "Raymond Meier", "Dennis Davis", "Tancr\u00e8de Dumas", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Martin Malvy", "John Mundy", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "G\u00f6khan Bozkaya", "Joseph Wilhelm Swoboda", "Tim Long", "I. Anastasius", "Mehmet Ali \u0130rtem\u00e7elik", "Mordehajs Dubins", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Claire Ritter", "\u00d6mer Kaner", "Petar \u010culi\u0107", "Martin Henriksson", "Staffan de Mistura", "Sam Butler", "Kay Lahusen", "Cemal Cuma", "Laurence Stallings", "Drago\u0219 Neagu", "M\u00f3nica Estarreado", "Mohammed Loay Bayazid", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "John Abram", "Michael Armstrong", "Joe Ascione", "Martin Gero", "Fuat G\u00fcner", "William Reed", "Jimmy Greenspoon", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Ralf Wadephul", "Witold Rodzi\u0144ski", "Giulio Carpioni", "John Buckley", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "M\u00fcnir G\u00f6le", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Jon Elster", "Sara Agnes Mclaughlin Conboy", "Stacy Barthe", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Ymer Pampuri", "Eduard von Weber", "Lynn Faulds Wood", "Ren\u00e9 Mayer", "Ulla Pia", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Dave Steele", "O\u011fuz Abadan", "Kraisak Choonhavan", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Nyncke Beekhuyzen", "Adrienn Bende", "Orhan Demir", "Heather Davis", "Alisa Arnah", "Bernard Lloyd", "\u015eahan \u015eahnur", "Homer Curran", "Claire Baxter", "Jules de Gautier", "Johanna Meier", "Serhat", "Ayta\u00e7 Biter", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Elsa Lunghini", "Warren Carlyle", "Gijs Vermeulen", "Bernie Lowe", "David Scott Milton", "Kim Bauermeister", "Jane Bathori", "Lukas Sideras"]}, "place_of_death": {"objects": ["Hawaii", "Arizona", "Edinburgh", "Kud\u00fcs", "Philadelphia", "Vilnius", "Manhattan", "Roma", "Florida", "Sussex", "Paris", "Detroit", "Manhattan", "Kahire", "Oxford", "Lizbon", "Floransa", "Mumbai", "Londra", "Prag", "Londra", "Paris", "Madrid", "Londra", "Var\u015fova", "Liverpool", "Toronto", "Yokohama", "Berlin", "Madrid", "Kahire", "Lille", "Boston", "Konstantiniyye", "Sevilla", "Philadelphia", "Halep", "Prag"], "subjects": ["Donn Lewin", "Hac\u0131 Ali", "Henry Siddons", "Yakir Geron", "George Wallace Melville", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Thomas Slingsby Duncombe", "Robert Desoille", "Kurken Alemshah", "Der Scutt", "Nasr Hamid Ebu Zeyd", "Homer Hasenpflug Dubs", "Murilo Mendes", "Philipp von Stosch", "Sultan Khan", "David Merrick", "V\u00e1clav Havel", "Lucy Faithfull, Baroness Faithfull", "Aleksandr Gu\u00e7kov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", 
"Marian Porwit", "Jefferson Lowndes", "Pauline Mills McGibbon", "Vito Positano", "Christa Wolf", "Enrique Sarasola", "Lotfia El Nadi", "John Shortland", "Robert Hazard", "Abd\u00fclmecid", "Felipe de Le\u00f3n", "William More Gabb", "\u0130brahim Hananu", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ur.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/vi.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Sydney", "Burlington", "Stade", "Lu\u00e2n \u0110\u00f4n", "Sydney", "Heidelberg", "Montr\u00e9al", "Santiago de Chile", "Leipzig", "Wellington", "Napoli", "Lu\u00e2n \u0110\u00f4n", "Napoli", "Melbourne", "Anh", "Montr\u00e9al", "Sunderland", "Nam Kinh", "Vi\u1ec7t Nam", "Coventry", "Sydney", "Anh", "Sydney", "Nam Kinh", "Roma", "Sydney", "Philadelphia", "Nevada", "Waterford", "Stuttgart", "Seattle", "Van", "Vancouver", "Melbourne", "Limerick", "Preston", "Manchester", "Sheffield", "Anh", "Roma", "T\u00f4ky\u00f4", "Westminster", "Split", "Macon", "California", "Cardiff", "Montr\u00e9al", "Palermo", "Vi\u00ean", "Massachusetts", "Surrey", "Lagos", "Hamburg", "Nga", "Victoria", "Stuttgart", "Anh", "Leipzig", "Vancouver", "Lu\u00e2n \u0110\u00f4n", "Tours", "Newport", "Springfield", "\u00dac", "Ph\u00e1p", "Borneo", "Cardiff", "Ontario", "Norwich", "Pittsburgh", "Lu\u00e2n \u0110\u00f4n"], "subjects": ["Julia Wilson", "Paul Daniels", "Peter Ording", "James William Wallack", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Ximena Armas", "Maja Tucholke", "Barry Mitcalfe", "Renato Caccioppoli", "Cliff Jones", "Warington Wilkinson Smyth", "Brett Hayman", "Colin Groves", "David Atkinson", "Arthur Andrews", "Michael Anti", "L\u1ea1i Thanh H\u00e0", "Martin Jacques", "Danielle McGrath", "John Mundy", "Stephen Carr", "Qu\u00e1ch Kim Long", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Pat Nixon", "Michael Carney", "Wilhelm Boger", "Shyril O'Steen", "Aghasi Khanjian", "Peter Dembicki", "Michael Guider", "Sam Lynch", "Helen Longworth", "John Mundy", "Alec Briggs", "Edward Locke", "Gi\u00e1o ho\u00e0ng Anastasi\u00f4 I", "Maeda Takeshi", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Laurence Stallings", "Penny Lernoux", "Herbert Bowden, Baron Aylestone", "William Reed", "Francesco Musotto", "Marion Stein", "Sarah Stiles", "Rob Heanley", "Jumoke Verissimo", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Bernhard R\u00fchling", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "J.B. 
Jackson", "Michael Matus", "Simon Bowman", "Elizabeth Hess", "Warren Carlyle", "David Scott Milton", "Adrian Bowyer"]}, "place_of_death": {"objects": ["Hawaii", "Edinburgh", "Lyon", "B\u1ec9", "Cambridge", "Lu\u00e2n \u0110\u00f4n", "Manhattan", "\u0110\u00e0i B\u1eafc", "Florida", "Manhattan", "Montr\u00e9al", "Santiago de Chile", "Lu\u00e2n \u0110\u00f4n", "Liverpool", "Toronto", "Yokohama", "Berkeley", "Detroit", "Lille", "Constantinopolis", "Sevilla", "Nice", "Th\u01b0\u1ee3ng H\u1ea3i", "Bologna"], "subjects": ["Donn Lewin", "Henry Siddons", "Joseph Jean-Baptiste Xavier Fournet", "Margaret c\u1ee7a Anh, N\u1eef C\u00f4ng t\u01b0\u1edbc x\u1ee9 Brabant", "Grahame Clark", "Joshua Cristall", "Arthur Siegel", "Th\u1ea9m Ki\u1ebfm H\u1ed3ng", "Jim Chapin", "Der Scutt", "\u00c9douard Gagnon", "Nicolasa Vald\u00e9s", "David Merrick", "Jefferson Lowndes", "Pauline Mills McGibbon", "Vito Positano", "Andrew Imbrie", "Orestes Brownson", "John Shortland", "Abd\u00fcl Mecid I", "Felipe de Le\u00f3n", "Georgi Victorovich Adamovich", "L\u01b0\u01a1ng S\u0129 Di", "Thomas Dempster"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/zh.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /dataset/cleanup.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | from utils import get_logger 5 | 6 | LOG = get_logger(__name__) 7 | 8 | 9 | def clean_triple(line): 10 | data = json.loads(line) 11 | relevant_keys = {"obj_label", "sub_label", "obj_uri", "sub_uri"} 12 | result = {k: v for k, v in data.items() if k in relevant_keys and data["from_english"] is False} 13 | return result 14 | 15 | 16 | def clean_relation(line): 17 | data = json.loads(line) 18 | relevant_keys = {"relation", "template"} 19 | result = {k: v for k, v in data.items() if k in relevant_keys} 20 | return result 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--infolder", default=None, type=str, required=True, help="") 26 | parser.add_argument("--outfolder", default=None, type=str, required=True, help="") 27 | args = parser.parse_args() 28 | 29 | langs = [x.replace("relations_", "") for x in os.listdir( 30 | os.path.join(args.infolder, "templates")) if "relations_" in x] 31 | relations = [x.replace(".jsonl", "") for x in os.listdir(os.path.join(args.infolder, "en"))] 32 | 33 | for lang in langs: 34 | os.makedirs(os.path.join(args.outfolder, lang)) 35 | 36 | for lang in langs: 37 | LOG.info(lang) 38 | # transfer triples 39 | for relation in relations: 40 | current_path = os.path.join(args.infolder, lang, relation + ".jsonl") 41 | if os.path.exists(current_path): 42 | with open(current_path) as fin: 43 | with open(os.path.join(args.outfolder, lang, relation + ".jsonl"), "w") as fout: 44 | for i, line in enumerate(fin): 45 | triple = clean_triple(line) 46 | if triple: 47 | triple["lineid"] = i 48 | fout.write("{}\n".format(json.dumps(triple))) 49 | # transfer templates 50 | with open(os.path.join(args.outfolder, lang, "templates.jsonl"), "a") as fout: 51 | if os.path.exists(os.path.join(args.infolder, "templates", "relations_{}.jsonl".format(lang))): 52 | with open(os.path.join(args.infolder, 
"templates", "relations_{}.jsonl".format(lang))) as fin: 53 | for line in fin: 54 | template = clean_relation(line) 55 | fout.write("{}\n".format(json.dumps(template))) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /dataset/download_trexentities.py: -------------------------------------------------------------------------------- 1 | from relations import Relations 2 | import argparse 3 | from typing import Text 4 | import requests 5 | import tqdm 6 | import os 7 | import json 8 | from utils import get_logger 9 | 10 | LOG = get_logger(__name__) 11 | 12 | 13 | def download_entity(url: Text, outfile: Text) -> None: 14 | try: 15 | answer = requests.get(url) 16 | with open(outfile, "w") as fp: 17 | fp.write(json.dumps(json.loads(answer.content))) 18 | except Exception as e: 19 | LOG.warning("Getting {} failed.".format(url)) 20 | LOG.warning("Exception: {}.".format(e)) 21 | 22 | 23 | def download_from_wikidata() -> None: 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--datapath", default=None, type=str, required=True, help="") 26 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 27 | parser.add_argument("--use", action="store_true", help="") 28 | args = parser.parse_args() 29 | t = Relations(args.datapath) 30 | filenames = t.get_available_filenames() 31 | t.load_data(filenames) 32 | entities = t.get_all_entities(["obj_uri", "sub_uri"]) 33 | base_url = "https://www.wikidata.org/wiki/Special:EntityData/{}.json" 34 | for entity in tqdm.tqdm(entities): 35 | download_entity(base_url.format(entity), os.path.join(args.outpath, entity + ".json")) 36 | 37 | 38 | if __name__ == '__main__': 39 | download_from_wikidata() 40 | -------------------------------------------------------------------------------- /dataset/download_wikidata.py: -------------------------------------------------------------------------------- 1 | from .data import Relations 2 | import argparse 3 | from typing import Text 4 | import requests 5 | import tqdm 6 | import os 7 | import json 8 | 9 | 10 | def download_entity(url: Text, outfile: Text) -> None: 11 | try: 12 | answer = requests.get(url) 13 | with open(outfile, "w") as fp: 14 | fp.write(json.dumps(json.loads(answer.content))) 15 | except Exception as e: 16 | print("Getting {} failed.".format(url)) 17 | print("Exception: {}.".format(e)) 18 | 19 | 20 | def download_from_wikidata() -> None: 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--datapath", default=None, type=str, required=True, help="") 23 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 24 | parser.add_argument("--use", action="store_true", help="") 25 | args = parser.parse_args() 26 | t = Relations(args.datapath) 27 | filenames = t.get_available_filenames() 28 | t.load_data(filenames) 29 | entities = t.get_all_entities(["obj_uri", "sub_uri"]) 30 | base_url = "https://www.wikidata.org/wiki/Special:EntityData/{}.json" 31 | for entity in tqdm.tqdm(entities): 32 | download_entity(base_url.format(entity), os.path.join(args.outpath, entity + ".json")) 33 | 34 | 35 | if __name__ == '__main__': 36 | download_from_wikidata() 37 | -------------------------------------------------------------------------------- /dataset/mbertlangs.txt: -------------------------------------------------------------------------------- 1 | wiki name iso googletranslate 2 | af Afrikaans afr af 3 | sq Albanian sqi sq 4 | ar Arabic ara,arb ar 5 | an Aragonese 
arg 6 | hy Armenian hye hy 7 | ast Asturian ast 8 | az Azerbaijani aze az 9 | ba Bashkir bak 10 | eu Basque eus eu 11 | bar Bavarian bar 12 | be Belarusian bel be 13 | bn Bengali ben bn 14 | bpy Bishnupriya Manipuri bpy 15 | bs Bosnian bos bs 16 | br Breton bre 17 | bg Bulgarian bul bg 18 | my Burmese mya 19 | ca Catalan cat ca 20 | ceb Cebuano ceb ceb 21 | ce Chechen che 22 | zh Chinese (Simplified) zho zh-CN 23 | zh-classical Chinese (Traditional) lzh zh-TW 24 | cv Chuvash chv 25 | hr Croatian hrv hr 26 | cs Czech ces cs 27 | da Danish dan da 28 | nl Dutch nld nl 29 | en English eng en 30 | et Estonian est et 31 | fi Finnish fin fi 32 | fr French fra fr 33 | gl Galician glg gl 34 | ka Georgian kat ka 35 | de German deu de 36 | el Greek ell el 37 | gu Gujarati guj gu 38 | ht Haitian hat ht 39 | he Hebrew heb he 40 | hi Hindi hin hi 41 | hu Hungarian hun hu 42 | is Icelandic isl is 43 | io Ido ido 44 | id Indonesian ind id 45 | ga Irish gle ga 46 | it Italian ita it 47 | ja Japanese jpn ja 48 | jv Javanese jav jv 49 | kn Kannada kan kn 50 | kk Kazakh kaz kk 51 | ky Kirghiz kir 52 | ko Korean kor ko 53 | la Latin lat la 54 | lv Latvian lav lv 55 | lt Lithuanian lit lt 56 | lmo Lombard lmo 57 | nds Low Saxon nds 58 | lb Luxembourgish ltz lb 59 | mk Macedonian mkd mk 60 | mg Malagasy mlg,plt mg 61 | ms Malay msa ms 62 | ml Malayalam mal ml 63 | mr Marathi mar mr 64 | min Minangkabau min 65 | ne Nepali nep ne 66 | new Newar new 67 | no Norwegian (Bokmal) nob no 68 | nn Norwegian (Nynorsk) nno 69 | oc Occitan oci 70 | fa Persian (Farsi) fas,pes fa 71 | pms Piedmontese pms 72 | pl Polish pol pl 73 | pt Portuguese por pt 74 | pa Punjabi pan pa 75 | ro Romanian ron ro 76 | ru Russian rus ru 77 | sco Scots sco 78 | sr Serbian srp sr 79 | sh Serbo-Croatian hbs,srp,hrv 80 | scn Sicilian scn 81 | sk Slovak slk sk 82 | sl Slovenian slv sl 83 | azb South Azerbaijani azb 84 | es Spanish spa es 85 | su Sundanese sun su 86 | sw Swahili swh sw 87 | sv Swedish swe sv 88 | tl Tagalog tgl tl 89 | tg Tajik tgk tg 90 | ta Tamil tam ta 91 | tt Tatar tat tt 92 | te Telugu tel te 93 | tr Turkish tur tr 94 | uk Ukrainian ukr uk 95 | ur Urdu urd ur 96 | uz Uzbek uzb uz 97 | vi Vietnamese vie vi 98 | vo Volapük vol 99 | war Waray-Waray war 100 | cy Welsh cym cy 101 | fy West Frisian fry fy 102 | pnb Western Punjabi pnb,pan 103 | yo Yoruba yor yo 104 | th Thai tha th 105 | mn Mongolian mon,khk -------------------------------------------------------------------------------- /dataset/mlama.sh: -------------------------------------------------------------------------------- 1 | WORKDIR="/mounts/work/philipp/tmp/mlama" 2 | 3 | # 1. Download TREx and GoogleRE 4 | wget https://dl.fbaipublicfiles.com/LAMA/data.zip -P ${WORKDIR} 5 | unzip ${WORKDIR}/data.zip -d ${WORKDIR} && rm ${WORKDIR}/data.zip 6 | 7 | # 2. Translate TREx 8 | 9 | # download entity data 10 | mkdir -p ${WORKDIR}/data/wikidata_entities 11 | 12 | python download_trexentities.py \ 13 | --datapath ${WORKDIR}/data/TREx \ 14 | --outpath ${WORKDIR}/data/wikidata_entities 15 | 16 | # create multilingual json files 17 | mkdir -p ${WORKDIR}/data/multilingual 18 | python translate_trex.py \ 19 | --data ${WORKDIR}/data/TREx \ 20 | --entities ${WORKDIR}/data/wikidata_entities \ 21 | --outpath ${WORKDIR}/data/multilingual \ 22 | --languagemapping mbertlangs.txt 23 | 24 | 25 | # 3. 
Translate GoogleRE 26 | # You will need a valid Google Knowledge Graph API key in the environment variable `GOOGLEAPIKEY for this section 27 | mv ${WORKDIR}/data/Google_RE/date_of_birth_test.jsonl ${WORKDIR}/data/Google_RE/date_of_birth.jsonl 28 | mv ${WORKDIR}/data/Google_RE/place_of_birth_test.jsonl ${WORKDIR}/data/Google_RE/place_of_birth.jsonl 29 | mv ${WORKDIR}/data/Google_RE/place_of_death_test.jsonl ${WORKDIR}/data/Google_RE/place_of_death.jsonl 30 | 31 | for relation in date_of_birth place_of_death 32 | do 33 | python translate_googlere.py \ 34 | --inputpath ${WORKDIR}/data/Google_RE \ 35 | --relation ${relation} \ 36 | --outpath ${WORKDIR}/data/multilingual \ 37 | --languagemapping mbertlangs.txt 38 | done 39 | 40 | # 4.1. Translate Templates TREx 41 | mkdir -p ${WORKDIR}/data/multilingual/templates_original 42 | python translate_templates.py translate \ 43 | --templates ${WORKDIR}/data/relations.jsonl \ 44 | --outfile ${WORKDIR}/data/multilingual/templates_original \ 45 | --languagemapping mbertlangs.txt 46 | 47 | 48 | # 4.2. Translate Templates GoogleRE 49 | # manually copy the two googlere relations templates and translate them 50 | python translate_templates.py translate \ 51 | --templates ${WORKDIR}/data/relations_googlere.jsonl \ 52 | --outfile ${WORKDIR}/data/multilingual/templates_original \ 53 | --languagemapping mbertlangs.txt 54 | 55 | # 4.3. Clean Templates in place 56 | cp -r ${WORKDIR}/data/multilingual/templates_original ${WORKDIR}/data/multilingual/templates 57 | python translate_templates.py clean \ 58 | --templates ${WORKDIR}/data/multilingual/templates 59 | 60 | # 5. Copy each template json into the language folder 61 | mkdir -p ${WORKDIR}/data_clean 62 | python cleanup.py \ 63 | --infolder ${WORKDIR}/data/multilingual \ 64 | --outfolder ${WORKDIR}/data_clean 65 | 66 | # 6. Load mLAMA 67 | python reader.py --path ${WORKDIR}/data_clean/ 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /dataset/reader.py: -------------------------------------------------------------------------------- 1 | from typing import Text, List, Set, Any, Text, Dict 2 | import os 3 | import json 4 | 5 | 6 | class MLama(object): 7 | """docstring for MLama""" 8 | 9 | def __init__(self, path: Text) -> None: 10 | super(MLama, self).__init__() 11 | self.path = path 12 | self.data = {} 13 | 14 | def get_all_languages(self) -> List[Text]: 15 | # not for all languages templates are available. 
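        # (clarifying note, not in the original file: get_all_languages() simply lists the
        #  directories found under self.path, so it may include languages without a usable
        #  templates.jsonl; get_official_languages() below returns the 53 languages that
        #  ship with the released mLAMA data.)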
16 | return os.listdir(self.path) 17 | 18 | def get_official_languages(self) -> List[Text]: 19 | return ["ca", "az", "en", "ar", "uk", "fa", "tr", "it", "el", "ru", "hr", "hi", "sv", "sq", "fr", "ga", "eu", "de", "nl", "et", "he", "es", "bn", "ms", "sr", 20 | "hy", "ur", "hu", "la", "sl", "cs", "af", "gl", "fi", "ro", "ko", "cy", "th", "be", "id", "pt", "vi", "ka", "ja", "da", "bg", "zh", "pl", "lv", "sk", "lt", "ta", "ceb"] 21 | 22 | def get_relations(self, language) -> List[Text]: 23 | files = os.listdir(os.path.join(self.path, language)) 24 | return [file.replace(".jsonl", "") for file in files if file != "templates.jsonl"] 25 | 26 | @staticmethod 27 | def _load_templates(path: Text) -> Dict[Text, Text]: 28 | templates = {} 29 | with open(path) as fp: 30 | for line in fp: 31 | line = json.loads(line) 32 | templates[line["relation"]] = line["template"] 33 | return templates 34 | 35 | @staticmethod 36 | def _load_triples(path: Text) -> Dict[Text, Dict[Text, Text]]: 37 | triples = {} 38 | with open(path) as fp: 39 | for line in fp: 40 | line = json.loads(line) 41 | triples[line["lineid"]] = line 42 | return triples 43 | 44 | def load(self, languages: List[Text] = [], relations: List[Text] = []) -> None: 45 | self.data = {} 46 | if not languages: 47 | languages = self.get_official_languages() 48 | for language in languages: 49 | self.data[language] = {} 50 | if not relations: 51 | relations = self.get_relations(language) 52 | templates = self._load_templates(os.path.join(self.path, language, "templates.jsonl")) 53 | for relation in relations: 54 | self.data[language][relation] = {} 55 | if relation not in templates: 56 | print("Template missing for relation {} in language {}.".format(relation, language)) 57 | self.data[language][relation]["template"] = templates.get(relation, "") 58 | self.data[language][relation]["triples"] = self._load_triples( 59 | os.path.join(self.path, language, relation + ".jsonl")) 60 | 61 | @staticmethod 62 | def is_valid_template(template: Text) -> bool: 63 | return ("[X]" in template and "[Y]" in template) 64 | 65 | def _fill_templates(self, template: Text, triples: Dict[Text, Dict[Text, Text]], mode: Text) -> Dict[Text, Text]: 66 | ''' 67 | mode in ["x", "y", "xy"] 68 | ''' 69 | if not self.is_valid_template(template): 70 | print("Invalid template: {}".format(template)) 71 | return {} 72 | else: 73 | filled_templates = {} 74 | for triple_id, triple in triples.items(): 75 | filled_templates[triple_id] = template 76 | if "x" in mode: 77 | filled_templates[triple_id] = filled_templates[triple_id].replace("[X]", triple["sub_label"]) 78 | if "y" in mode: 79 | filled_templates[triple_id] = filled_templates[triple_id].replace("[Y]", triple["obj_label"]) 80 | return filled_templates 81 | 82 | def fill_all_templates(self, mode: Text): 83 | for language in self.data: 84 | for relation in self.data[language]: 85 | self.data[language][relation]["filled_templates"] = self._fill_templates( 86 | self.data[language][relation]["template"], self.data[language][relation]["triples"], mode) 87 | 88 | 89 | def view_sample(): 90 | import random 91 | # prints a part of a latex table 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--path", default=None, type=str, required=True, help="") 94 | args = parser.parse_args() 95 | ml = MLama(args.path) 96 | ml.load() 97 | ml.fill_all_templates("xy") 98 | for lang in ml.data: 99 | all_instances = [] 100 | for relation in ml.data[lang]: 101 | all_instances.extend(ml.data[lang][relation]["filled_templates"].values()) 102 | 
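        # draw three filled templates per language for the LaTeX sample table
        # (random.sample assumes at least three instances are available per language)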
examples = random.sample(all_instances, 3) 103 | print("\\multirow{{3}}{{0.3cm}}{{{}}}".format(lang), end="") 104 | for example in examples: 105 | print(" & {}\\\\".format(example)) 106 | 107 | 108 | if __name__ == '__main__': 109 | view_sample() 110 | -------------------------------------------------------------------------------- /dataset/relations.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Text, List, Set 3 | import collections 4 | import tqdm 5 | import json 6 | 7 | 8 | class Relations(object): 9 | """docstring for Relations""" 10 | 11 | def __init__(self, path: Text, suffix: Text = ".jsonl") -> None: 12 | self.path = path 13 | self.suffix = suffix 14 | self.data = collections.defaultdict(list) 15 | 16 | def get_available_filenames(self) -> List[Text]: 17 | filenames = [] 18 | for file in os.listdir(self.path): 19 | filenames.append(file.replace(self.suffix, "")) 20 | return filenames 21 | 22 | def load_data(self, filenames: List[Text]) -> None: 23 | for filename in tqdm.tqdm(filenames): 24 | with open(os.path.join(self.path, filename + self.suffix)) as fp: 25 | for line in fp: 26 | if line: 27 | self.data[filename].append(json.loads(line)) 28 | 29 | def get_all_entities(self, fields: List[Text]) -> Set[Text]: 30 | entities = set() 31 | for filename, triples in self.data.items(): 32 | for triple in triples: 33 | for field in fields: 34 | if field in triple: 35 | entities.add(triple[field].strip()) 36 | return entities 37 | 38 | -------------------------------------------------------------------------------- /dataset/requirements.txt: -------------------------------------------------------------------------------- 1 | googletrans>=3.1.0a0 -------------------------------------------------------------------------------- /dataset/translate_googlere.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Text, Dict, Set, Any 3 | import json 4 | import requests 5 | import os 6 | from tqdm import tqdm 7 | from utils import get_logger, load_languagemapping 8 | 9 | LOG = get_logger(__name__) 10 | 11 | 12 | 13 | 14 | 15 | def translate(kgids: Set[Text], targetlang: Text, key: Text) -> Dict[Text, Text]: 16 | translations = {} 17 | kgids = list(kgids) 18 | kgids = [x for x in kgids if x.startswith("/m/")] 19 | batch_size = 16 20 | for i in tqdm(range(0, len(kgids), batch_size)): 21 | response = requests.get("https://kgsearch.googleapis.com/v1/entities:search", {"key": key, "languages": targetlang, "ids": kgids[i:i + batch_size]}) 22 | if response.status_code == 200: 23 | result = json.loads(response.content) 24 | for elem in result['itemListElement']: 25 | kgid = elem["result"]["@id"].replace("kg:", "") 26 | name = elem["result"]["name"] 27 | translations[kgid] = name 28 | else: 29 | LOG.warning("Wrong status code: {}".format(response)) 30 | break 31 | return translations 32 | 33 | 34 | def get_translation(current_id: Text, translations: Dict[Text, Text], triple: Dict[Text, Any]) -> Dict[Text, Any]: 35 | if current_id.startswith("/m/"): 36 | if current_id in translations: 37 | sub_translated = translations[current_id] 38 | else: 39 | sub_translated = None 40 | else: 41 | if triple["sub"] != triple["sub_label"]: 42 | sub_translated = None 43 | else: 44 | sub_translated = triple["sub"] 45 | return sub_translated 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--inputpath", default=None, type=str, 
required=True, help="") 51 | parser.add_argument("--relation", default=None, type=str, required=True, help="") 52 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 53 | parser.add_argument("--languagemapping", default=None, type=str, required=True, help="") 54 | args = parser.parse_args() 55 | key = os.environ["GOOGLEAPIKEY"] 56 | lang2translateid = load_languagemapping(args.languagemapping) 57 | triples = [] 58 | with open(os.path.join(args.inputpath, args.relation + ".jsonl")) as fp: 59 | for line in fp: 60 | if line.strip(): 61 | triples.append(json.loads(line)) 62 | 63 | kgids = set() 64 | for triple in triples: 65 | if "sub" in triple: 66 | kgids.add(triple["sub"]) 67 | if "obj" in triple: 68 | kgids.add(triple["obj"]) 69 | 70 | for langid, googleid in lang2translateid.items(): 71 | LOG.info(langid) 72 | translations = translate(kgids, googleid, key) 73 | result = [] 74 | for triple in triples: 75 | if "sub" not in triple or "obj" not in triple or "sub_label" not in triple or "obj_label" not in triple: 76 | triple["from_english"] = True 77 | result.append(triple) 78 | else: 79 | subid = triple["sub"] 80 | objid = triple["obj"] 81 | sub_translated = get_translation(subid, translations, triple) 82 | obj_translated = get_translation(objid, translations, triple) 83 | if sub_translated is None or obj_translated is None: 84 | triple["from_english"] = True 85 | result.append(triple) 86 | else: 87 | triple["from_english"] = False 88 | triple["sub_label"] = sub_translated 89 | triple["obj_label"] = obj_translated 90 | result.append(triple) 91 | os.makedirs(os.path.join(args.outpath, langid), exist_ok=True) 92 | with open(os.path.join(args.outpath, langid, args.relation + ".jsonl"), "w") as fout: 93 | for triple in result: 94 | fout.write(json.dumps(triple) + "\n") 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /dataset/translate_templates.py: -------------------------------------------------------------------------------- 1 | from googletrans import Translator 2 | import json 3 | import os 4 | import argparse 5 | from utils import get_logger 6 | 7 | LOG = get_logger(__name__) 8 | 9 | 10 | def fix_template(template, lang): 11 | # general rules 12 | if "[X]" not in template: 13 | template = template.replace("X", "[X]", 1) 14 | if "[Y]" not in template: 15 | template = template.replace("Y", "[Y]", 1) 16 | template = template.replace("[Y ]", "[Y] ", 1) 17 | 18 | if lang == "tl": 19 | template = template.replace("Naglalaro ang [X] sa posisyon.", "Naglalaro si [X] sa posisyon na [Y]", 1) 20 | template = template.replace("Sumali sa [X] ang [X].", "Sumali ang [X] sa [Y].", 1) 21 | template = template.replace("Naglalaro ang [X] ng musika.", "Naglalaro si [X] ng [Y] musika.", 1) 22 | template = template.replace("Naglalaro ang [X].", "Ginawa ni [X] ang [Y].", 1) 23 | if lang == "el": 24 | template = template.replace("[Χ]", "[X]", 1) 25 | template = template.replace("[Υ]", "[Y]", 1) 26 | if "[Y]" in template and "[X]" not in template: 27 | template = template.replace("[Ο]", "[X]", 1) 28 | if "[X]" in template and "[Y]" not in template: 29 | template = template.replace("[Ο]", "[Y]", 1) 30 | if lang == "ceb": 31 | # to be checked 32 | template = template.replace("Natawo sa [Y].", "Natawo ang [X] sa [Y].", 1) 33 | template = template.replace("Nag-apil sa [X] ang [X].", "Ang [X] miapil sa [Y].", 1) 34 | 35 | if lang == "pa": 36 | template = template.replace("[ਐਕਸ]", "[X]", 1) 37 | 
template = template.replace("[ਵਾਈ]", "[Y]", 1) 38 | if lang == "ta": 39 | template = template.replace("[எக்ஸ்]", "[X]", 1) 40 | template = template.replace("[ஒய்]", "[Y]", 1) 41 | if lang == "mg": 42 | template = template.replace( 43 | "Tamin'ny voalohany, nalefan'i [Y] tany am-boalohany.", "Tamin'ny voalohany, ny X [X] dia nalefa tamin'ny [Y].", 1) 44 | if lang == "gu": 45 | template = template.replace("[એક્સ]", "[X]", 1) 46 | template = template.replace("[વાય]", "[Y]", 1) 47 | if lang == "mr": 48 | template = template.replace("[एक्स]", "[X]", 1) 49 | template = template.replace("[वाई]", "[Y]", 1) 50 | template = template.replace("[वाय]", "[Y]", 1) 51 | if lang == "sr": 52 | template = template.replace("[Кс]", "[X]", 1) 53 | template = template.replace("[И]", "[Y]", 1) 54 | template = template.replace("[X] је рођен у И.", "[X] је рођен у [Y].", 1) 55 | if lang == "kk": 56 | template = template.replace("[Х] университетте білім алған.", "[X] [Y] университетінде білім алған.", 1) 57 | template = template.replace("Ана тілі [Х] болып табылады.", "[Х] -дің ана тілі - [Y].", 1) 58 | template = template.replace("[Х]", "[X]", 1) 59 | template = template.replace("[Y]", "[Y]", 1) 60 | if lang == "kn": 61 | template = template.replace("[ಎಕ್ಸ್]", "[X]", 1) 62 | template = template.replace("[ವೈ]", "[Y]", 1) 63 | if lang == "ne": 64 | template = template.replace("[एक्स]", "[X]", 1) 65 | template = template.replace("[Y]", "[Y]", 1) 66 | if lang == "hy": 67 | template = template.replace("[X]", "[X]", 1) 68 | template = template.replace("[Յ]", "[Y]", 1) 69 | if lang == "uz": 70 | template = template.replace("[X] universitetida tahsil olgan.", "[X] [Y] universitetida tahsil olgan.", 1) 71 | template = template.replace("[X] din bilan bog'liq.", "[X] [Y] diniga mansub.", 1) 72 | if lang == "tg": 73 | template = template.replace("[X] аз рӯи касб аст.", "[X] аз рӯи касб [Y] аст.", 1) 74 | template = template.replace("[Ю]", "[Y]", 1) 75 | template = template.replace("[Х]", "[X]", 1) 76 | template = template.replace("[Y]", "[Y]", 1) 77 | if lang == "lt": 78 | template = template.replace( 79 | "Buvo įgijęs išsilavinimą [Y] universitete.", "[X] įgijo išsilavinimą [Y] universitete.", 1) 80 | if lang == "bn": 81 | template = template.replace("[এক্স]", "[X]", 1) 82 | template = template.replace("[ওয়াই]", "[Y]", 1) 83 | if lang == "la": 84 | template = template.replace("[K]", "[Y]", 1) 85 | template = template.replace("[A]", "[Y]", 1) 86 | template = template.replace("[N]", "[Y]", 1) 87 | template = template.replace("[V]", "[Y]", 1) 88 | template = template.replace("[ego]", "[Y]", 1) 89 | template = template.replace("[Ego]", "[Y]", 1) 90 | if lang == "hi": 91 | if "[X]" not in template: 92 | template = template.replace("[एक्स]", "[X]", 1) 93 | if "[Y]" not in template: 94 | template = template.replace("[वाई]", "[Y]", 1) 95 | return template 96 | 97 | 98 | def clean(args): 99 | to_fix = [] 100 | broken = 0 101 | for file in os.listdir(args.templates): 102 | with open(os.path.join(args.templates, file), "r") as fp: 103 | for line in fp: 104 | if line: 105 | template = json.loads(line) 106 | #lang = file.replace(".jsonl", "").split("_")[-1] 107 | #template["template"] = fix_template(template["template"], lang) 108 | if template["template"].count("[X]") != 1 or template["template"].count("[Y]") != 1: 109 | LOG.warning("Broken Template {} {} {}".format(file, template["relation"], template["template"])) 110 | to_fix.append(file) 111 | broken += 1 112 | to_fix = set(to_fix) 113 | LOG.info("Fixing {} broken templates across 
{} languages.".format(broken, len(to_fix))) 114 | for file in to_fix: 115 | with open(os.path.join(args.templates, file), "r") as fp: 116 | fixed_templates = [] 117 | for line in fp: 118 | if line: 119 | template = json.loads(line) 120 | lang = file.replace(".jsonl", "").split("_")[-1] 121 | if template["template"].count("[X]") != 1 or template["template"].count("[Y]") != 1: 122 | template["template"] = fix_template(template["template"], lang) 123 | fixed_templates.append(template) 124 | with open(os.path.join(args.templates, file), "w") as fp: 125 | for line in fixed_templates: 126 | fp.write(json.dumps(line) + "\n") 127 | 128 | 129 | def translate(args): 130 | lang2translateid = {} 131 | with open(args.languagemapping) as fp: 132 | next(fp) 133 | for line in fp: 134 | if line: 135 | wikiid, _, _, googleid = line.split("\t") 136 | if not googleid: 137 | # try the other id and see what comes out of goole translate 138 | googleid = wikiid 139 | lang2translateid[wikiid.strip()] = googleid.strip() 140 | 141 | templates = [] 142 | with open(args.templates) as fp: 143 | for line in fp: 144 | if line: 145 | templates.append(json.loads(line)) 146 | 147 | # get translations 148 | for wikiid, googleid in lang2translateid.items(): 149 | LOG.info("TRANSLATING {}".format(wikiid)) 150 | translated = [] 151 | for template in templates: 152 | try: 153 | translator = Translator() 154 | result = translator.translate(template["template"], src="en", dest=googleid) 155 | translated_template = template.copy() 156 | translated_template["template"] = result.text 157 | translated.append(translated_template) 158 | except Exception as e: 159 | LOG.info("Exception: {}".format(e)) 160 | if len(translated) != len(templates): 161 | LOG.warning("Not all translations succesful!") 162 | LOG.warning("Skipping language") 163 | else: 164 | # write out 165 | with open(os.path.join(args.outfile, "relations_{}.jsonl".format(wikiid)), "w") as fout: 166 | for template in translated: 167 | fout.write("{}\n".format(json.dumps(template))) 168 | 169 | 170 | if __name__ == '__main__': 171 | parser = argparse.ArgumentParser() 172 | subparsers = parser.add_subparsers() 173 | 174 | parser_translate = subparsers.add_parser('translate') 175 | parser_translate.set_defaults(func=translate) 176 | parser_translate.add_argument("--templates", default=None, type=str, required=True, help="") 177 | parser_translate.add_argument("--languagemapping", default=None, type=str, required=True, help="") 178 | parser_translate.add_argument("--outfile", default=None, type=str, required=True, help="") 179 | 180 | parser_clean = subparsers.add_parser('clean') 181 | parser_clean.set_defaults(func=clean) 182 | parser_clean.add_argument("--templates", default=None, type=str, required=True, help="") 183 | 184 | args = parser.parse_args() 185 | args.func(args) 186 | -------------------------------------------------------------------------------- /dataset/translate_trex.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from relations import Relations 3 | from typing import Text 4 | import tqdm 5 | import os 6 | import json 7 | import collections 8 | from utils import get_logger, load_languagemapping 9 | 10 | LOG = get_logger(__name__) 11 | 12 | 13 | def get_entity_surface(basepath: Text, uri: Text, language: Text) -> Text: 14 | try: 15 | with open(os.path.join(basepath, uri + ".json")) as fp: 16 | data = json.load(fp) 17 | 18 | surfaces = data['entities'][uri]['labels'] 19 | if language in surfaces: 20 | if 
surfaces[language]["language"] != language: 21 | raise Warning("Language mismatch in data: {}".format(surfaces)) 22 | return surfaces[language]["value"] 23 | else: 24 | return "" 25 | except Exception as e: 26 | print("Exception: {} (probably entity file does not exist).".format(e)) 27 | return "" 28 | 29 | 30 | def main(): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--data", default=None, type=str, required=True, help="") 33 | parser.add_argument("--entities", default=None, type=str, required=True, help="") 34 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 35 | parser.add_argument("--languagemapping", default=None, type=str, required=True, help="") 36 | args = parser.parse_args() 37 | lang2translateid = load_languagemapping(args.languagemapping) 38 | 39 | for lang in lang2translateid: 40 | t = Relations(args.data) 41 | filenames = t.get_available_filenames() 42 | t.load_data(filenames) 43 | count = collections.Counter() 44 | logfile = open(os.path.join(args.outpath, lang + ".log"), "w") 45 | for filename, relations in t.data.items(): 46 | LOG.info("Processing relation: {}".format(filename)) 47 | outdirectory = os.path.join(args.outpath, lang) 48 | os.makedirs(outdirectory, exist_ok=True) 49 | with open(os.path.join(outdirectory, filename + ".jsonl"), "w") as fout: 50 | for relation in relations: 51 | count["in_file"] += 1 52 | if ("sub_uri" in relation and "obj_uri" in relation and "sub_label" in relation and "obj_label" in relation): 53 | count["available"] += 1 54 | obj_uri = relation["obj_uri"] 55 | sub_uri = relation["sub_uri"] 56 | # load entitiy information 57 | obj_surface = get_entity_surface(args.entities, obj_uri, lang) 58 | sub_surface = get_entity_surface(args.entities, sub_uri, lang) 59 | # write out 60 | if obj_surface and sub_surface: 61 | count["converted"] += 1 62 | to_write = {"sub_uri": sub_uri, "obj_uri": obj_uri, 63 | "obj_label": obj_surface, "sub_label": sub_surface, "from_english": False} 64 | else: 65 | # use english surface forms 66 | to_write = {"sub_uri": sub_uri, "obj_uri": obj_uri, 67 | "obj_label": relation["obj_label"], "sub_label": relation["sub_label"], "from_english": True} 68 | fout.write(json.dumps(to_write) + "\n") 69 | summary = "{}|{}|{}|(converted/available/in_file)".format(count["converted"], count["available"], count["in_file"]) 70 | LOG.info(summary) 71 | logfile.write("{}|{}\n".format(filename, summary)) 72 | logfile.close() 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /dataset/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import logging 3 | 4 | 5 | def rec_dd(): 6 | return collections.defaultdict(rec_dd) 7 | 8 | 9 | def load_languagemapping(path): 10 | lang2translateid = {} 11 | with open(path) as fp: 12 | next(fp) 13 | for line in fp: 14 | if line: 15 | wikiid, _, _, googleid = line.split("\t") 16 | if not googleid: 17 | # try the other id and see what comes out of google translate 18 | googleid = wikiid 19 | lang2translateid[wikiid.strip()] = googleid.strip() 20 | return lang2translateid 21 | 22 | 23 | def get_logger(name, filename=None, level=logging.DEBUG): 24 | logger = logging.getLogger(name) 25 | logger.setLevel(level) 26 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 27 | 28 | ch = logging.StreamHandler() 29 | ch.setLevel(level) 30 | ch.setFormatter(formatter) 31 | 
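    # attach the console handler; a file handler is added below only if a filename is given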
logger.addHandler(ch) 32 | 33 | if filename is not None: 34 | fh = logging.FileHandler(filename) 35 | fh.setLevel(level) 36 | fh.setFormatter(formatter) 37 | logger.addHandler(fh) 38 | return logger -------------------------------------------------------------------------------- /mlama/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mlama/build_encoded_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import pickle as pkl 8 | from tqdm import tqdm 9 | try: 10 | import ujson as json 11 | except ImportError: 12 | import json 13 | import collections 14 | import torch 15 | from mlama.modules import build_model_by_name 16 | 17 | # A tuple containing a single example from the input dataset with sentences 18 | # mapped into a sequence of vectors: 19 | # embeddings: tensor with shape (some_length, embedding_dim). 20 | # Note that some_length differs from example to example, while 21 | # embedding_dim is the same for all examples for the encoded dataset. 22 | EncodedSentence = collections.namedtuple('EncodedSentence', 23 | 'embedding, length, tokens') 24 | 25 | 26 | class EncodedDataset(torch.utils.data.Dataset): 27 | 28 | def __init__(self, encoded_sentences=None): 29 | if encoded_sentences: 30 | # make sure encoded_sentences is a list of (embedding, length, tokens) tuples 31 | assert isinstance(encoded_sentences, list) 32 | sample = encoded_sentences[0] 33 | assert len(sample) == 3 34 | assert isinstance(sample[0], torch.Tensor) 35 | self._encodings = encoded_sentences 36 | else: 37 | self._encodings = [] 38 | 39 | def __len__(self): 40 | return len(self._encodings) 41 | 42 | def __getitem__(self, idx): 43 | encoding = self._encodings[idx] 44 | embedding, sent_length, tokens = encoding 45 | 46 | return EncodedSentence(embedding=embedding, length=sent_length, tokens=tokens) 47 | 48 | def save(self, path): 49 | with open(path, 'wb') as f: 50 | pkl.dump(self._encodings, f) 51 | 52 | def load(self, path): 53 | """ Read precomputed contextual embeddings from file 54 | 55 | :param path: path to the embedding file (pickle format) 56 | """ 57 | with open(path, 'rb') as f: 58 | self._encodings = pkl.load(f) 59 | 60 | 61 | def load_encoded_dataset(path): 62 | dataset = EncodedDataset() 63 | dataset.load(path) 64 | return dataset 65 | 66 | 67 | def _batchify(sentences, batch_size): 68 | start = 0 69 | while start < len(sentences): 70 | yield sentences[start:start + batch_size] 71 | start += batch_size 72 | 73 | 74 | def _aggregate_layers(embeddings): 75 | """ Average over all layers """ 76 | new_embed = torch.stack(embeddings, 0) # [#layers, #batchsize, #max_sent_len, #dim] 77 | agg_embed = torch.mean(new_embed, 0) # [#batchsize, #max_sent_len, #dim] 78 | return agg_embed 79 | 80 | 81 | def encode(args, sentences, sort_input=False): 82 | """Create an EncodedDataset from a list of sentences 83 | 84 | Parameters: 85 | sentences (list[list[string]]): list of elements.
Each element is a list 86 | that contains either a single sentence 87 | or two sentences 88 | sort_input (bool): if true, sort sentences by number of tokens in them 89 | 90 | Returns: 91 | dataset (EncodedDataset): an object that contains the contextual 92 | representations of the input sentences 93 | """ 94 | print("Language Models: {}".format(args.lm)) 95 | model = build_model_by_name(args.lm, args) 96 | 97 | # sort sentences by number of tokens so that all batches contain 98 | # sentences with a similar number of tokens 99 | if sort_input: 100 | sentences = sorted(sentences, key=lambda k: len(" ".join(k).split())) 101 | 102 | encoded_sents = [] 103 | for current_batch in tqdm(_batchify(sentences, args.batch_size)): 104 | embeddings, sent_lens, tokenized_sents = model.get_contextual_embeddings(current_batch) 105 | 106 | agg_embeddings = _aggregate_layers(embeddings) # [#batchsize, #max_sent_len, #dim] 107 | sent_embeddings = [agg_embeddings[i, :l] for i, l in enumerate(sent_lens)] 108 | encoded_sents.extend(list(zip(sent_embeddings, sent_lens, tokenized_sents))) 109 | 110 | dataset = EncodedDataset(encoded_sents) 111 | return dataset -------------------------------------------------------------------------------- /mlama/eval_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | from lama.utils import print_sentence_predictions, load_vocab 9 | import lama.options as options 10 | import lama.evaluation_metrics as evaluation_metrics 11 | 12 | 13 | def main(args): 14 | 15 | if not args.text and not args.interactive: 16 | msg = "ERROR: either run LAMA eval_generation with the " \ 17 | "interactive option (--i) or pass a piece of text as input (--t)" 18 | raise ValueError(msg) 19 | 20 | stopping_condition = True 21 | 22 | print("Language Models: {}".format(args.models_names)) 23 | 24 | models = {} 25 | for lm in args.models_names: 26 | models[lm] = build_model_by_name(lm, args) 27 | 28 | vocab_subset = None 29 | if args.common_vocab_filename is not None: 30 | common_vocab = load_vocab(args.common_vocab_filename) 31 | print("common vocabulary size: {}".format(len(common_vocab))) 32 | vocab_subset = [x for x in common_vocab] 33 | 34 | while stopping_condition: 35 | if args.text: 36 | text = args.text 37 | stopping_condition = False 38 | else: 39 | text = input("insert text:") 40 | 41 | if args.split_sentence: 42 | import spacy 43 | # use spacy to tokenize input sentence 44 | nlp = spacy.load(args.spacy_model) 45 | tokens = nlp(text) 46 | print(tokens) 47 | sentences = [] 48 | for s in tokens.sents: 49 | print(" - {}".format(s)) 50 | sentences.append(s.text) 51 | else: 52 | sentences = [text] 53 | 54 | if len(sentences) > 2: 55 | print("WARNING: only the first two sentences in the text will be considered!") 56 | sentences = sentences[:2] 57 | 58 | for model_name, model in models.items(): 59 | print("\n{}:".format(model_name)) 60 | original_log_probs_list, [token_ids], [masked_indices] = model.get_batch_generation([sentences], try_cuda=False) 61 | 62 | index_list = None 63 | if vocab_subset is not None: 64 | # filter log_probs 65 | filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs(vocab_subset) 66 | filtered_log_probs_list =
model.filter_logprobs(original_log_probs_list, filter_logprob_indices) 67 | else: 68 | filtered_log_probs_list = original_log_probs_list 69 | 70 | # rank over the subset of the vocab (if defined) for the SINGLE masked tokens 71 | if masked_indices and len(masked_indices) > 0: 72 | evaluation_metrics.get_ranking(filtered_log_probs_list[0], masked_indices, model.vocab, index_list=index_list) 73 | 74 | # prediction and perplexity for the whole softmax 75 | print_sentence_predictions(original_log_probs_list[0], token_ids, model.vocab, masked_indices=masked_indices) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = options.get_eval_generation_parser() 80 | args = options.parse_args(parser) 81 | main(args) 82 | -------------------------------------------------------------------------------- /mlama/evaluation_metrics_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import torch 8 | import numpy as np 9 | import scipy 10 | 11 | 12 | def __max_probs_values_indices(masked_indices, log_probs, topk=1000): 13 | 14 | masked_index = masked_indices 15 | 16 | objects = candidates[len(masked_index)] 17 | 18 | log_probs = log_probs[masked_index] 19 | 20 | value_max_probs, index_max_probs = torch.topk(input=log_probs,k=topk,dim=1) 21 | index_max_probs = index_max_probs.numpy().astype(int) 22 | value_max_probs = value_max_probs.detach().numpy() 23 | 24 | return log_probs, index_max_probs, value_max_probs 25 | 26 | 27 | def __print_top_k(value_max_probs, index_max_probs, vocab, mask_topk, index_list, candidates_obj, max_printouts = 10): 28 | result = [] 29 | msg = "\n| Top{} predictions\n".format(max_printouts) 30 | for i in range(mask_topk): 31 | idx_joined = [] 32 | word_form_joined = [] 33 | 34 | for n_mask in range(len(value_max_probs)): 35 | filtered_idx = index_max_probs[n_mask][i].item() 36 | 37 | if index_list is not None: 38 | # the softmax layer has been filtered using the vocab_subset 39 | # the original idx should be retrieved 40 | idx = index_list[filtered_idx] 41 | else: 42 | idx = filtered_idx 43 | 44 | log_prob = value_max_probs[n_mask][i].item() 45 | word_form = vocab[idx] 46 | 47 | word_form_joined.append(word_form) 48 | idx_joined.append(idx) 49 | if i < max_printouts: 50 | msg += "{:<8d}{:<20s}{:<12.3f}\n".format( 51 | i, 52 | word_form, 53 | log_prob 54 | ) 55 | element = {'i' : i, 'token_idx': idx_joined, 'log_prob': log_prob, 'token_word_form': word_form_joined} 56 | result.append(element) 57 | return result, msg 58 | 59 | def get_prediction(log_probs, masked_indices, vocab, label_index = None, index_list = None, topk = 1000, P_AT = 10, print_generation=True): 60 | 61 | experiment_result = {} 62 | 63 | # score only first mask 64 | masked_indices = masked_indices[:1] 65 | 66 | masked_index = masked_indices[0] 67 | log_probs = log_probs[masked_index] 68 | 69 | value_max_probs, index_max_probs = torch.topk(input=log_probs,k=topk,dim=0) 70 | index_max_probs = index_max_probs.numpy().astype(int) 71 | value_max_probs = value_max_probs.detach().numpy() 72 | 73 | result_masked_topk, return_msg = __print_top_k(value_max_probs, index_max_probs, vocab, topk, index_list) 74 | 75 | return result_masked_topk, return_msg 76 | 77 | 78 | def get_ranking(log_probs, sample, masked_indices, vocab, candidates, label_index = None, 
index_list = None, topk = 10, P_AT = 10, print_generation=True): 79 | experiment_result = {} 80 | dict_probs = {} 81 | return_msg = "" 82 | objects_true = sample["obj_label"] 83 | 84 | for i, num_masks in enumerate(candidates): 85 | if len(masked_indices) >1: 86 | masked_idx = masked_indices[i] 87 | else: 88 | masked_idx = [masked_indices[i]] 89 | predictions = log_probs[i][masked_idx] 90 | 91 | for object in candidates[num_masks]: 92 | probs = [] 93 | for id, prediction in zip(candidates[num_masks][object], predictions): 94 | #print(id) 95 | #print("pred", prediction) 96 | probs.append(prediction[id]) 97 | dict_probs[object] = np.mean(probs) 98 | object_keys = np.array(list(dict_probs.keys())) 99 | object_values = np.array(list(dict_probs.values())) 100 | 101 | idx_true = np.argwhere(objects_true == object_keys)[0][0] 102 | idcs = np.argsort(object_values) 103 | rank = len(object_values) - np.argwhere(idcs==idx_true)[0][0] 104 | 105 | experiment_result["rank"] = rank - 1 106 | experiment_result["prob_true"] = dict_probs[objects_true] 107 | experiment_result["predicted"] = object_keys[idcs] 108 | experiment_result["probs"] = object_values[idcs] 109 | 110 | return experiment_result, return_msg 111 | -------------------------------------------------------------------------------- /mlama/get_contextual_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | import lama.options as options 9 | 10 | def main(args): 11 | sentences = [ 12 | ["the cat is on the table ."], # single-sentence instance 13 | ["the dog is sleeping on the sofa .", "he makes happy noises ."], # two-sentence 14 | ] 15 | 16 | print("Language Models: {}".format(args.models_names)) 17 | 18 | models = {} 19 | for lm in args.models_names: 20 | models[lm] = build_model_by_name(lm, args) 21 | 22 | for model_name, model in models.items(): 23 | print("\n{}:".format(model_name)) 24 | if args.cuda: 25 | model.try_cuda() 26 | contextual_embeddings, sentence_lengths, tokenized_text_list = model.get_contextual_embeddings( 27 | sentences) 28 | 29 | # contextual_embeddings is a list of tensors, one tensor for each layer. 30 | # Each element contains one layer of the representations with shape 31 | # (x, y, z). 32 | # x - the batch size 33 | # y - the sequence length of the batch 34 | # z - the length of each layer vector 35 | 36 | print(f'Number of layers: {len(contextual_embeddings)}') 37 | for layer_id, layer in enumerate(contextual_embeddings): 38 | print(f'Layer {layer_id} has shape: {layer.shape}') 39 | 40 | print("sentence_lengths: {}".format(sentence_lengths)) 41 | print("tokenized_text_list: {}".format(tokenized_text_list)) 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = options.get_general_parser() 46 | parser.add_argument('--cuda', action='store_true', help='Try to run on GPU') 47 | args = options.parse_args(parser) 48 | main(args) 49 | -------------------------------------------------------------------------------- /mlama/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from .bert_connector import Bert 8 | 9 | def build_model_by_name(lm, args, verbose=True): 10 | """Load a model by name and args. 11 | 12 | Note, args.lm is not used for model selection. args are only passed to the 13 | model's initializer. 14 | """ 15 | MODEL_NAME_TO_CLASS = dict( 16 | bert=Bert 17 | ) 18 | if lm not in MODEL_NAME_TO_CLASS: 19 | raise ValueError("Unrecognized Language Model: %s." % lm) 20 | if verbose: 21 | print("Loading %s model..." % lm) 22 | return MODEL_NAME_TO_CLASS[lm](args) 23 | -------------------------------------------------------------------------------- /mlama/modules/base_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import re 8 | import torch 9 | 10 | MASK = "[MASK]" 11 | BERT_UNK = "[UNK]" 12 | BERT_CLS = "[CLS]" 13 | BERT_SEP = "[SEP]" 14 | BERT_PAD = "[PAD]" 15 | ELMO_UNK = "<UNK>" 16 | ELMO_START_SENTENCE = "<S>" 17 | ELMO_END_SENTENCE = "</S>" 18 | OPENAI_UNK = "<unk>" 19 | OPENAI_EOS = "<eos>" 20 | ROBERTA_MASK = "<mask>" 21 | ROBERTA_START_SENTENCE = "<s>" 22 | ROBERTA_END_SENTENCE = "</s>" 23 | ROBERTA_VOCAB_SIZE = 50266 24 | 25 | SPECIAL_SYMBOLS = [ 26 | MASK, 27 | BERT_UNK, 28 | BERT_CLS, 29 | BERT_SEP, 30 | BERT_PAD, 31 | ELMO_UNK, 32 | ELMO_START_SENTENCE, 33 | ELMO_END_SENTENCE, 34 | OPENAI_UNK, 35 | OPENAI_EOS 36 | ] 37 | 38 | SPACE_NORMALIZER = re.compile(r"\s+") 39 | 40 | 41 | def default_tokenizer(line): 42 | """Default tokenizer for models that don't have one 43 | 44 | Args: 45 | line: a string representing a sentence 46 | 47 | Returns: 48 | A list of tokens 49 | """ 50 | 51 | line = SPACE_NORMALIZER.sub(" ", line) 52 | line = line.strip() 53 | line = line.replace(MASK, " "+str(MASK)+" ") # make sure MASK is correctly split 54 | 55 | # fix tokenization for parentheses 56 | line = line.replace('(', " ( ") 57 | line = line.replace(')', " ) ") 58 | 59 | # fix tokenization for comma 60 | line = line.replace(',', " , ") 61 | 62 | # fix tokenization for -- (e.g., 1954--1988) 63 | line = line.replace('--', " -- ") 64 | 65 | result = line.split() 66 | return result 67 | 68 | 69 | class Base_Connector(): 70 | 71 | def __init__(self): 72 | 73 | # these variables should be initialized 74 | self.vocab = None 75 | 76 | # This defines the device the model is on. Changed by try_cuda.
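        # (The connector starts on CPU; try_cuda() below moves the model if a GPU is available.)
        # Subclasses are also expected to populate the vocabulary; the Bert connector in this
        # package, for example, does roughly:
        #   self.vocab = list(self.tokenizer.ids_to_tokens.values())
        #   self._init_inverse_vocab()
        # which is what init_indices_for_filter_logprobs() and filter_logprobs() rely on.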
77 | self._model_device = 'cpu' 78 | 79 | def optimize_top_layer(self, vocab_subset): 80 | """ 81 | optimization for some LM 82 | """ 83 | pass 84 | 85 | def _init_inverse_vocab(self): 86 | self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)} 87 | 88 | def try_cuda(self): 89 | """Move model to GPU if one is available.""" 90 | if torch.cuda.is_available(): 91 | if self._model_device != 'cuda': 92 | print('Moving model to CUDA') 93 | self._cuda() 94 | self._model_device = 'cuda' 95 | else: 96 | print('No CUDA found') 97 | 98 | def _cuda(self): 99 | """Move model to GPU.""" 100 | raise NotImplementedError 101 | 102 | def init_indices_for_filter_logprobs(self, vocab_subset, logger=None): 103 | index_list = [] 104 | new_vocab_subset = [] 105 | for word in vocab_subset: 106 | if word in self.inverse_vocab: 107 | inverse_id = self.inverse_vocab[word] 108 | index_list.append(inverse_id) 109 | new_vocab_subset.append(word) 110 | else: 111 | msg = "word {} from vocab_subset not in model vocabulary!".format(word) 112 | if logger is not None: 113 | logger.warning(msg) 114 | else: 115 | print("WARNING: {}".format(msg)) 116 | 117 | # 1. gather correct indices 118 | indices = torch.as_tensor(index_list) 119 | return indices, index_list 120 | 121 | def filter_logprobs(self, log_probs, indices): 122 | new_log_probs = log_probs.index_select(dim=2 , index=indices) 123 | return new_log_probs 124 | 125 | def get_id(self, string): 126 | raise NotImplementedError() 127 | 128 | def get_generation(self, sentences, logger=None): 129 | [log_probs], [token_ids], [masked_indices] = self.get_batch_generation( 130 | [sentences], logger=logger, try_cuda=False) 131 | return log_probs, token_ids, masked_indices 132 | 133 | def get_batch_generation(self, sentences_list, logger= None, try_cuda=True): 134 | raise NotImplementedError() 135 | 136 | def get_contextual_embeddings(self, sentences): 137 | """Compute the contextual embeddings of a list of sentences 138 | 139 | Parameters: 140 | sentences (list[list[string]]): list of elements. Each element is a list 141 | that contains either a single sentence 142 | or two sentences 143 | 144 | Returns: 145 | encoder_layers (list(Tensor)): a list of the full sequences of encoded-hidden-states 146 | at the end of each attention block (e.g., 12 full 147 | sequences for BERT-base,), each encoded-hidden-state 148 | is a torch.FloatTensor of size [batch_size, 149 | sequence_length, hidden_size] 150 | sentence_lengths (list[int]): list of lenghts for the sentences in the 151 | batch 152 | tokenized_text_list: (list[list[string]]): tokenized text for the sentences 153 | in the batch 154 | """ 155 | raise NotImplementedError() 156 | -------------------------------------------------------------------------------- /mlama/modules/bert_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import torch 8 | import pytorch_pretrained_bert.tokenization as btok 9 | from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM, BasicTokenizer, BertModel 10 | import numpy as np 11 | from mlama.modules.base_connector import * 12 | import torch.nn.functional as F 13 | 14 | 15 | class CustomBaseTokenizer(BasicTokenizer): 16 | 17 | def tokenize(self, text): 18 | """Tokenizes a piece of text.""" 19 | text = self._clean_text(text) 20 | # This was added on November 1st, 2018 for the multilingual and Chinese 21 | # models. This is also applied to the English models now, but it doesn't 22 | # matter since the English models were not trained on any Chinese data 23 | # and generally don't have any Chinese data in them (there are Chinese 24 | # characters in the vocabulary because Wikipedia does have some Chinese 25 | # words in the English Wikipedia.). 26 | text = self._tokenize_chinese_chars(text) 27 | orig_tokens = btok.whitespace_tokenize(text) 28 | split_tokens = [] 29 | for token in orig_tokens: 30 | 31 | # pass MASK forward 32 | if MASK in token: 33 | split_tokens.append(MASK) 34 | if token != MASK: 35 | remaining_chars = token.replace(MASK,"").strip() 36 | if remaining_chars: 37 | split_tokens.append(remaining_chars) 38 | continue 39 | 40 | if self.do_lower_case: 41 | token = token.lower() 42 | token = self._run_strip_accents(token) 43 | split_tokens.extend(self._run_split_on_punc(token)) 44 | 45 | output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) 46 | return output_tokens 47 | 48 | 49 | class Bert(Base_Connector): 50 | 51 | def __init__(self, args, vocab_subset = None): 52 | super().__init__() 53 | 54 | bert_model_name = args.bert_model_name 55 | dict_file = bert_model_name 56 | 57 | if args.bert_model_dir is not None: 58 | # load bert model from file 59 | bert_model_name = str(args.bert_model_dir) + "/" 60 | dict_file = bert_model_name+args.bert_vocab_name 61 | self.dict_file = dict_file 62 | print("loading BERT model from {}".format(bert_model_name)) 63 | else: 64 | # load bert model from huggingface cache 65 | pass 66 | 67 | # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer 68 | do_lower_case = False 69 | if 'uncased' in bert_model_name: 70 | do_lower_case=True 71 | #print(do_lower_case) 72 | # Load pre-trained model tokenizer (vocabulary) 73 | self.tokenizer = BertTokenizer.from_pretrained(dict_file) 74 | 75 | # original vocab 76 | self.map_indices = None 77 | self.vocab = list(self.tokenizer.ids_to_tokens.values()) 78 | self._init_inverse_vocab() 79 | 80 | # Add custom tokenizer to avoid splitting the ['MASK'] token 81 | custom_basic_tokenizer = CustomBaseTokenizer(do_lower_case = do_lower_case) 82 | self.tokenizer.basic_tokenizer = custom_basic_tokenizer 83 | 84 | # Load pre-trained model (weights) 85 | # ... to get prediction/generation 86 | self.masked_bert_model = BertForMaskedLM.from_pretrained(bert_model_name) 87 | 88 | self.masked_bert_model.eval() 89 | 90 | # ... 
to get hidden states 91 | self.bert_model = self.masked_bert_model.bert 92 | 93 | self.pad_id = self.inverse_vocab[BERT_PAD] 94 | 95 | self.unk_index = self.inverse_vocab[BERT_UNK] 96 | 97 | def get_id(self, string): 98 | tokenized_text = self.tokenizer.tokenize(string) 99 | indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) 100 | if self.map_indices is not None: 101 | # map indices to subset of the vocabulary 102 | indexed_string = self.convert_ids(indexed_string) 103 | 104 | return indexed_string 105 | 106 | def __get_input_tensors_batch(self, sentences_list): 107 | tokens_tensors_list = [] 108 | segments_tensors_list = [] 109 | masked_indices_list = [] 110 | tokenized_text_list = [] 111 | max_tokens = 0 112 | for sentences in sentences_list: 113 | tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) 114 | tokens_tensors_list.append(tokens_tensor) 115 | segments_tensors_list.append(segments_tensor) 116 | masked_indices_list.append(masked_indices) 117 | tokenized_text_list.append(tokenized_text) 118 | # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) 119 | if (tokens_tensor.shape[1] > max_tokens): 120 | max_tokens = tokens_tensor.shape[1] 121 | # print("MAX_TOKENS: {}".format(max_tokens)) 122 | # apply padding and concatenate tensors 123 | # use [PAD] for tokens and 0 for segments 124 | final_tokens_tensor = None 125 | final_segments_tensor = None 126 | final_attention_mask = None 127 | for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): 128 | dim_tensor = tokens_tensor.shape[1] 129 | pad_lenght = max_tokens - dim_tensor 130 | attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) 131 | if pad_lenght>0: 132 | pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) 133 | pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) 134 | attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) 135 | tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) 136 | segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) 137 | attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) 138 | if final_tokens_tensor is None: 139 | final_tokens_tensor = tokens_tensor 140 | final_segments_tensor = segments_tensor 141 | final_attention_mask = attention_tensor 142 | else: 143 | final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) 144 | final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0) 145 | final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) 146 | # print(final_tokens_tensor) 147 | # print(final_segments_tensor) 148 | # print(final_attention_mask) 149 | # print(final_tokens_tensor.shape) 150 | # print(final_segments_tensor.shape) 151 | # print(final_attention_mask.shape) 152 | return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list 153 | 154 | def __get_input_tensors(self, sentences): 155 | 156 | if len(sentences) > 2: 157 | print(sentences) 158 | raise ValueError("BERT accepts maximum two sentences in input for each data point") 159 | 160 | first_tokenized_sentence = self.tokenizer.tokenize(sentences[0]) 161 | first_segment_id = np.zeros(len(first_tokenized_sentence), dtype=int).tolist() 162 | 163 | # add [SEP] token at the end 164 | first_tokenized_sentence.append(BERT_SEP) 165 | first_segment_id.append(0) 166 | 167 | if len(sentences)>1 : 168 | second_tokenized_sentece = self.tokenizer.tokenize(sentences[1]) 
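            # Illustrative example (word pieces shortened): for the pair
            # ["the cat sat", "it [MASK] ."] this branch builds
            #   tokenized_text = [CLS] the cat sat [SEP] it [MASK] . [SEP]
            #   segments_ids   =   0    0   0   0    0   1    1    1   1
            # and masked_indices (collected below) holds the position of [MASK].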
169 | second_segment_id = np.full(len(second_tokenized_sentece),1, dtype=int).tolist() 170 | 171 | # add [SEP] token at the end 172 | second_tokenized_sentece.append(BERT_SEP) 173 | second_segment_id.append(1) 174 | 175 | tokenized_text = first_tokenized_sentence + second_tokenized_sentece 176 | segments_ids = first_segment_id + second_segment_id 177 | else: 178 | tokenized_text = first_tokenized_sentence 179 | segments_ids = first_segment_id 180 | 181 | # add [CLS] token at the beginning 182 | tokenized_text.insert(0,BERT_CLS) 183 | segments_ids.insert(0,0) 184 | 185 | # look for masked indices 186 | masked_indices = [] 187 | for i in range(len(tokenized_text)): 188 | token = tokenized_text[i] 189 | if token == MASK: 190 | masked_indices.append(i) 191 | 192 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 193 | 194 | # Convert inputs to PyTorch tensors 195 | tokens_tensor = torch.tensor([indexed_tokens]) 196 | segments_tensors = torch.tensor([segments_ids]) 197 | 198 | return tokens_tensor, segments_tensors, masked_indices, tokenized_text 199 | 200 | def __get_token_ids_from_tensor(self, indexed_string): 201 | token_ids = [] 202 | if self.map_indices is not None: 203 | # map indices to subset of the vocabulary 204 | indexed_string = self.convert_ids(indexed_string) 205 | token_ids = np.asarray(indexed_string) 206 | else: 207 | token_ids = indexed_string 208 | return token_ids 209 | 210 | def _cuda(self): 211 | self.masked_bert_model.cuda() 212 | 213 | def get_batch_generation(self, sentences_list, logger= None, 214 | try_cuda=True): 215 | #print("see") 216 | if not sentences_list: 217 | return None 218 | if try_cuda: 219 | self.try_cuda() 220 | 221 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 222 | 223 | if logger is not None: 224 | logger.debug("\n{}\n".format(tokenized_text_list)) 225 | 226 | with torch.no_grad(): 227 | logits = self.masked_bert_model( 228 | input_ids=tokens_tensor.to(self._model_device), 229 | token_type_ids=segments_tensor.to(self._model_device), 230 | attention_mask=attention_mask_tensor.to(self._model_device), 231 | ) 232 | 233 | log_probs = F.log_softmax(logits, dim=-1).cpu() 234 | #print(logits.shape) 235 | token_ids_list = [] 236 | for indexed_string in tokens_tensor.numpy(): 237 | token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) 238 | 239 | return log_probs, token_ids_list, masked_indices_list 240 | 241 | def get_contextual_embeddings(self, sentences_list, try_cuda=True): 242 | 243 | # assume in input 1 or 2 sentences - in general, it considers only the first 2 sentences 244 | if not sentences_list: 245 | return None 246 | if try_cuda: 247 | self.try_cuda() 248 | 249 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 250 | 251 | with torch.no_grad(): 252 | all_encoder_layers, _ = self.bert_model( 253 | tokens_tensor.to(self._model_device), 254 | segments_tensor.to(self._model_device)) 255 | 256 | all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] 257 | 258 | sentence_lengths = [len(x) for x in tokenized_text_list] 259 | 260 | # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end 261 | # of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each 262 | # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] 263 | return all_encoder_layers, sentence_lengths, tokenized_text_list 264 | -------------------------------------------------------------------------------- /mlama/modules/bert_connector_.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import torch 8 | import pytorch_pretrained_bert.tokenization as btok 9 | from pytorch_pretrained_bert.tokenization import BertTokenizer, BertForMaskedLM, BasicTokenizer, BertModel 10 | import numpy as np 11 | from lama.modules.base_connector import * 12 | import torch.nn.functional as F 13 | from transformers import AutoTokenizer, AutoModelWithLMHead 14 | 15 | class CustomBaseTokenizer(BasicTokenizer): 16 | 17 | def tokenize(self, text): 18 | """Tokenizes a piece of text.""" 19 | text = self._clean_text(text) 20 | # This was added on November 1st, 2018 for the multilingual and Chinese 21 | # models. This is also applied to the English models now, but it doesn't 22 | # matter since the English models were not trained on any Chinese data 23 | # and generally don't have any Chinese data in them (there are Chinese 24 | # characters in the vocabulary because Wikipedia does have some Chinese 25 | # words in the English Wikipedia.). 26 | text = self._tokenize_chinese_chars(text) 27 | orig_tokens = btok.whitespace_tokenize(text) 28 | split_tokens = [] 29 | for token in orig_tokens: 30 | 31 | # pass MASK forward 32 | if MASK in token: 33 | split_tokens.append(MASK) 34 | if token != MASK: 35 | remaining_chars = token.replace(MASK,"").strip() 36 | if remaining_chars: 37 | split_tokens.append(remaining_chars) 38 | continue 39 | 40 | if self.do_lower_case: 41 | token = token.lower() 42 | token = self._run_strip_accents(token) 43 | split_tokens.extend(self._run_split_on_punc(token)) 44 | 45 | output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) 46 | return output_tokens 47 | 48 | 49 | class Bert(Base_Connector): 50 | 51 | def __init__(self, args, vocab_subset = None): 52 | super().__init__() 53 | 54 | bert_model_name = args.bert_model_name 55 | dict_file = bert_model_name 56 | 57 | if args.bert_model_dir is not None: 58 | # load bert model from file 59 | bert_model_name = str(args.bert_model_dir) + "/" 60 | dict_file = bert_model_name+args.bert_vocab_name 61 | self.dict_file = dict_file 62 | print("loading BERT model from {}".format(bert_model_name)) 63 | else: 64 | # load bert model from huggingface cache 65 | dict_file = args.bert_model_name 66 | self.dict_file = dict_file 67 | # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer 68 | do_lower_case = False 69 | if 'uncased' in bert_model_name: 70 | do_lower_case=True 71 | print(do_lower_case) 72 | # Load pre-trained model tokenizer (vocabulary) 73 | self.tokenizer = BertTokenizer.from_pretrained(dict_file) 74 | #self.tokenizer = AutoTokenizer.from_pretrained(dict_file) 75 | # original vocab 76 | self.map_indices = None 77 | self.vocab = list(self.tokenizer.ids_to_tokens.values()) 78 | self._init_inverse_vocab() 79 | 80 | # Add custom tokenizer to avoid splitting the ['MASK'] token 81 | #custom_basic_tokenizer = CustomBaseTokenizer(do_lower_case = 
do_lower_case) 82 | #self.tokenizer.basic_tokenizer = custom_basic_tokenizer 83 | 84 | # Load pre-trained model (weights) 85 | # ... to get prediction/generation 86 | self.masked_bert_model = BertForMaskedLM.from_pretrained(bert_model_name) 87 | #self.masked_bert_model = AutoModelWithLMHead.from_pretrained(bert_model_name) 88 | self.masked_bert_model.eval() 89 | 90 | # ... to get hidden states 91 | self.bert_model = self.masked_bert_model.bert 92 | 93 | self.pad_id = self.inverse_vocab[BERT_PAD] 94 | 95 | self.unk_index = self.inverse_vocab[BERT_UNK] 96 | 97 | def get_id(self, string): 98 | tokenized_text = self.tokenizer.tokenize(string) 99 | indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) 100 | if self.map_indices is not None: 101 | # map indices to subset of the vocabulary 102 | indexed_string = self.convert_ids(indexed_string) 103 | 104 | return indexed_string 105 | 106 | def __get_input_tensors_batch(self, sentences_list): 107 | tokens_tensors_list = [] 108 | segments_tensors_list = [] 109 | masked_indices_list = [] 110 | tokenized_text_list = [] 111 | max_tokens = 0 112 | for sentences in sentences_list: 113 | tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) 114 | tokens_tensors_list.append(tokens_tensor) 115 | segments_tensors_list.append(segments_tensor) 116 | masked_indices_list.append(masked_indices) 117 | tokenized_text_list.append(tokenized_text) 118 | # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) 119 | if (tokens_tensor.shape[1] > max_tokens): 120 | max_tokens = tokens_tensor.shape[1] 121 | # print("MAX_TOKENS: {}".format(max_tokens)) 122 | # apply padding and concatenate tensors 123 | # use [PAD] for tokens and 0 for segments 124 | final_tokens_tensor = None 125 | final_segments_tensor = None 126 | final_attention_mask = None 127 | for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): 128 | dim_tensor = tokens_tensor.shape[1] 129 | pad_lenght = max_tokens - dim_tensor 130 | attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) 131 | if pad_lenght>0: 132 | pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) 133 | pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) 134 | attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) 135 | tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) 136 | segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) 137 | attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) 138 | if final_tokens_tensor is None: 139 | final_tokens_tensor = tokens_tensor 140 | final_segments_tensor = segments_tensor 141 | final_attention_mask = attention_tensor 142 | else: 143 | final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) 144 | final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0) 145 | final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) 146 | # print(final_tokens_tensor) 147 | # print(final_segments_tensor) 148 | # print(final_attention_mask) 149 | # print(final_tokens_tensor.shape) 150 | # print(final_segments_tensor.shape) 151 | # print(final_attention_mask.shape) 152 | return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list 153 | 154 | def __get_input_tensors(self, sentences): 155 | 156 | if len(sentences) > 2: 157 | print(sentences) 158 | raise ValueError("BERT accepts maximum two sentences in input for each data point") 159 | 160 | 
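        # As in bert_connector.py: the sentences are WordPiece-tokenized, wrapped in
        # [CLS] ... [SEP] (with a second segment for sentence pairs), and the method
        # returns (tokens_tensor, segments_tensors, masked_indices, tokenized_text),
        # where masked_indices lists the positions of [MASK] in tokenized_text.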
first_tokenized_sentence = self.tokenizer.tokenize(sentences[0]) 161 | first_segment_id = np.zeros(len(first_tokenized_sentence), dtype=int).tolist() 162 | 163 | # add [SEP] token at the end 164 | first_tokenized_sentence.append(BERT_SEP) 165 | first_segment_id.append(0) 166 | 167 | if len(sentences) > 1: 168 | second_tokenized_sentence = self.tokenizer.tokenize(sentences[1]) 169 | second_segment_id = np.full(len(second_tokenized_sentence),1, dtype=int).tolist() 170 | 171 | # add [SEP] token at the end 172 | second_tokenized_sentence.append(BERT_SEP) 173 | second_segment_id.append(1) 174 | 175 | tokenized_text = first_tokenized_sentence + second_tokenized_sentence 176 | segments_ids = first_segment_id + second_segment_id 177 | else: 178 | tokenized_text = first_tokenized_sentence 179 | segments_ids = first_segment_id 180 | 181 | # add [CLS] token at the beginning 182 | tokenized_text.insert(0,BERT_CLS) 183 | segments_ids.insert(0,0) 184 | 185 | # look for masked indices 186 | masked_indices = [] 187 | for i in range(len(tokenized_text)): 188 | token = tokenized_text[i] 189 | if token == MASK: 190 | masked_indices.append(i) 191 | 192 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 193 | 194 | # Convert inputs to PyTorch tensors 195 | tokens_tensor = torch.tensor([indexed_tokens]) 196 | segments_tensors = torch.tensor([segments_ids]) 197 | 198 | return tokens_tensor, segments_tensors, masked_indices, tokenized_text 199 | 200 | def __get_token_ids_from_tensor(self, indexed_string): 201 | token_ids = [] 202 | if self.map_indices is not None: 203 | # map indices to subset of the vocabulary 204 | indexed_string = self.convert_ids(indexed_string) 205 | token_ids = np.asarray(indexed_string) 206 | else: 207 | token_ids = indexed_string 208 | return token_ids 209 | 210 | def _cuda(self): 211 | self.masked_bert_model.cuda() 212 | 213 | def get_batch_generation(self, sentences_list, logger= None, 214 | try_cuda=True): 215 | if not sentences_list: 216 | return None 217 | if try_cuda: 218 | self.try_cuda() 219 | 220 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 221 | 222 | if logger is not None: 223 | logger.debug("\n{}\n".format(tokenized_text_list)) 224 | 225 | with torch.no_grad(): 226 | logits = self.masked_bert_model( 227 | input_ids=tokens_tensor.to(self._model_device), 228 | token_type_ids=segments_tensor.to(self._model_device), 229 | attention_mask=attention_mask_tensor.to(self._model_device), 230 | ) 231 | 232 | log_probs = F.log_softmax(logits, dim=-1).cpu() 233 | token_ids_list = [] 234 | for indexed_string in tokens_tensor.numpy(): 235 | token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) 236 | 237 | return log_probs, token_ids_list, masked_indices_list 238 | 239 | def get_contextual_embeddings(self, sentences_list, try_cuda=True): 240 | 241 | # assumes 1 or 2 sentences as input - in general, only the first 2 sentences are considered 242 | if not sentences_list: 243 | return None 244 | if try_cuda: 245 | self.try_cuda() 246 | 247 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 248 | 249 | with torch.no_grad(): 250 | all_encoder_layers, _ = self.bert_model( 251 | tokens_tensor.to(self._model_device), 252 | segments_tensor.to(self._model_device)) 253 | 254 | all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] 255 | 256 | sentence_lengths = 
[len(x) for x in tokenized_text_list] 257 | 258 | # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end 259 | # of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each 260 | # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] 261 | return all_encoder_layers, sentence_lengths, tokenized_text_list 262 | -------------------------------------------------------------------------------- /mlama/options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import argparse 8 | 9 | 10 | def get_general_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument( 13 | "--language-models", 14 | "--lm", 15 | dest="models", 16 | help="comma separated list of language models", 17 | required=True, 18 | ) 19 | parser.add_argument( 20 | "--spacy_model", 21 | "--sm", 22 | dest="spacy_model", 23 | default="en_core_web_sm", 24 | help="spacy model file path", 25 | ) 26 | parser.add_argument( 27 | "--common-vocab-filename", 28 | "--cvf", 29 | dest="common_vocab_filename", 30 | help="common vocabulary filename", 31 | ) 32 | parser.add_argument( 33 | "--interactive", 34 | "--i", 35 | dest="interactive", 36 | action="store_true", 37 | help="perform the evaluation interactively", 38 | ) 39 | parser.add_argument( 40 | "--max-sentence-length", 41 | dest="max_sentence_length", 42 | type=int, 43 | default=100, 44 | help="max sentence length", 45 | ) 46 | __add_bert_args(parser) 47 | __add_elmo_args(parser) 48 | __add_gpt_args(parser) 49 | __add_transformerxl_args(parser) 50 | __add_roberta_args(parser) 51 | return parser 52 | 53 | 54 | def get_eval_generation_parser(): 55 | parser = get_general_parser() 56 | parser.add_argument( 57 | "--text", "--t", dest="text", help="text to compute the generation for" 58 | ) 59 | parser.add_argument( 60 | "--split_sentence", 61 | dest="split_sentence", 62 | action="store_true", 63 | help="split the input text into sentences", 64 | ) 65 | return parser 66 | 67 | 68 | def get_eval_KB_completion_parser(): 69 | parser = get_general_parser() 70 | parser.add_argument( 71 | "--dataset-filename", 72 | "--df", 73 | dest="dataset_filename", 74 | help="filename containing dataset", 75 | ) 76 | parser.add_argument( 77 | "--logdir", 78 | dest="logdir", 79 | default="../experiments_logs/", 80 | help="logging directory", 81 | ) 82 | parser.add_argument( 83 | "--full-logdir", 84 | help="Full path to the logging folder. 
If set, will override --logdir.", 85 | ) 86 | parser.add_argument( 87 | "--template", dest="template", default="", help="template for surface relation" 88 | ) 89 | parser.add_argument( 90 | "--batch-size", dest="batch_size", type=int, default=32, help="batch size" 91 | ) 92 | parser.add_argument( 93 | "--lowercase", 94 | "--lower", 95 | dest="lowercase", 96 | action="store_true", 97 | help="perform the evaluation using lowercase text", 98 | ) 99 | parser.add_argument( 100 | "--threads", 101 | dest="threads", 102 | type=int, 103 | default=-1, 104 | help="number of threads for evaluation metrics computation (default: all available)", 105 | ) 106 | return parser 107 | 108 | 109 | def __add_bert_args(parser): 110 | group = parser.add_argument_group("BERT") 111 | group.add_argument( 112 | "--bert-model-dir", 113 | "--bmd", 114 | dest="bert_model_dir", 115 | help="directory that contains the BERT pre-trained model and the vocabulary", 116 | ) 117 | group.add_argument( 118 | "--bert-model-name", 119 | "--bmn", 120 | dest="bert_model_name", 121 | default="bert-base-cased", 122 | help="name of the BERT pre-trained model (default = 'bert-base-cased')", 123 | ) 124 | group.add_argument( 125 | "--bert-vocab-name", 126 | "--bvn", 127 | dest="bert_vocab_name", 128 | default="vocab.txt", 129 | help="name of vocabulary used to pre-train the BERT model (default = 'vocab.txt')", 130 | ) 131 | return group 132 | 133 | 134 | def __add_roberta_args(parser): 135 | group = parser.add_argument_group("RoBERTa") 136 | group.add_argument( 137 | "--roberta-model-dir", 138 | "--rmd", 139 | dest="roberta_model_dir", 140 | help="directory that contains the ROBERTA pre-trained model and the vocabulary", 141 | ) 142 | group.add_argument( 143 | "--roberta-model-name", 144 | "--rmn", 145 | dest="roberta_model_name", 146 | default="model.pt", 147 | help="name of the ROBERTA pre-trained model (default = 'model.pt')", 148 | ) 149 | group.add_argument( 150 | "--roberta-vocab-name", 151 | "--rvn", 152 | dest="roberta_vocab_name", 153 | default="dict.txt", 154 | help="name of vocabulary used to pre-train the ROBERTA model (default = 'dict.txt')", 155 | ) 156 | return group 157 | 158 | 159 | def __add_gpt_args(parser): 160 | group = parser.add_argument_group("GPT") 161 | group.add_argument( 162 | "--gpt-model-dir", 163 | "--gmd", 164 | dest="gpt_model_dir", 165 | help="directory that contains the gpt pre-trained model and the vocabulary", 166 | ) 167 | group.add_argument( 168 | "--gpt-model-name", 169 | "--gmn", 170 | dest="gpt_model_name", 171 | default="openai-gpt", 172 | help="name of the gpt pre-trained model (default = 'openai-gpt')", 173 | ) 174 | return group 175 | 176 | 177 | def __add_transformerxl_args(parser): 178 | group = parser.add_argument_group("Transformer-XL") 179 | group.add_argument( 180 | "--transformerxl-model-dir", 181 | "--tmd", 182 | help="directory that contains the pre-trained model and the vocabulary", 183 | ) 184 | group.add_argument( 185 | "--transformerxl-model-name", 186 | "--tmn", 187 | default="transfo-xl-wt103", 188 | help="name of the pre-trained model (default = 'transfo-xl-wt103')", 189 | ) 190 | return group 191 | 192 | 193 | def __add_elmo_args(parser): 194 | group = parser.add_argument_group("ELMo") 195 | group.add_argument( 196 | "--elmo-model-dir", 197 | "--emd", 198 | dest="elmo_model_dir", 199 | help="directory that contains the ELMo pre-trained model and the vocabulary", 200 | ) 201 | group.add_argument( 202 | "--elmo-model-name", 203 | "--emn", 204 | dest="elmo_model_name", 205 | 
default="elmo_2x4096_512_2048cnn_2xhighway", 206 | help="name of the ELMo pre-trained model (default = 'elmo_2x4096_512_2048cnn_2xhighway')", 207 | ) 208 | group.add_argument( 209 | "--elmo-vocab-name", 210 | "--evn", 211 | dest="elmo_vocab_name", 212 | default="vocab-2016-09-10.txt", 213 | help="name of vocabulary used to pre-train the ELMo model (default = 'vocab-2016-09-10.txt')", 214 | ) 215 | group.add_argument( 216 | "--elmo-warm-up-cycles", 217 | dest="elmo_warm_up_cycles", 218 | type=int, 219 | default=5, 220 | help="ELMo warm up cycles", 221 | ) 222 | return group 223 | 224 | 225 | def parse_args(parser): 226 | args = parser.parse_args() 227 | args.models_names = [x.strip().lower() for x in args.models.split(",")] 228 | if "fconv" in args.models_names: 229 | if args.data is None: 230 | raise ValueError( 231 | "to use fconv you should specify the directory that contains " 232 | "the pre-trained model and the vocabulary with the option --fconv-model-dir/--fmd\n" 233 | "you can also specify the fconv model name with the option --fconv-model-name/--fmn (default = 'wiki103.pt')\n" 234 | "the vocabulary should be in the provided fconv-model-dir and be named dict.txt" 235 | ) 236 | if "bert" in args.models_names: 237 | # use the default shortcut name of a Google AI's pre-trained model (default = 'bert-base-cased') 238 | pass 239 | if "elmo" in args.models_names: 240 | if args.elmo_model_dir is None: 241 | raise ValueError( 242 | "to use elmo you should specify the directory that contains " 243 | "the pre-trained model and the vocabulary with the option --elmo-model-dir/--emd\n" 244 | "you can also specify the elmo model name with the option --elmo-model-name/--emn (default = 'elmo_2x4096_512_2048cnn_2xhighway')\n" 245 | "and the elmo vocabulary name with the option --elmo-vocab-name/--evn (default = 'vocab-2016-09-10.txt')" 246 | ) 247 | 248 | return args 249 | -------------------------------------------------------------------------------- /mlama/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import torch 8 | from colorama import init 9 | from termcolor import colored 10 | import numpy as np 11 | import mlama.modules.base_connector as base 12 | 13 | 14 | def __exclude_tokens(token_ids, vocab): 15 | indices_to_exclude = [] 16 | for i, tok in enumerate(token_ids): 17 | word_form = vocab[tok] 18 | if (word_form in base.SPECIAL_SYMBOLS): 19 | indices_to_exclude.append(i) 20 | return indices_to_exclude 21 | 22 | 23 | def __print_generation(positional_scores, token_ids, vocab, rank_dict, 24 | index_max_probs, value_max_probs, topk, 25 | indices_to_exclude, masked_indices, print_on_console): 26 | init() # colorful output 27 | msg = "" 28 | dash = '-' * 82 29 | msg += dash + "\n" 30 | msg += '{:<8s}{:<20s}{:<12s}{:<20}{:<12s}{:<12s}'.format( 31 | "index", "token", "log_prob", "prediction", 32 | "log_prob", "rank@{}".format(topk)) 33 | msg += "\n" + dash 34 | if print_on_console: 35 | print(msg) 36 | msg += '\n' 37 | 38 | for idx, tok in enumerate(token_ids): 39 | 40 | word_form = vocab[tok] 41 | 42 | rank = -1 43 | if idx in rank_dict: 44 | rank = rank_dict[idx] 45 | index_max_prob = index_max_probs[idx] 46 | 47 | predicted_token_id = index_max_prob[0] 48 | 49 | value_max_prob = value_max_probs[idx] 50 | string_to_print = '{:<8d}{:<20s}{:<12.3f}{:<20s}{:<12.3f}{:<12d}'.format( 51 | idx, 52 | str(word_form), 53 | positional_scores[idx], 54 | str(vocab[predicted_token_id]), 55 | value_max_prob[0], 56 | rank 57 | ) 58 | 59 | if print_on_console: 60 | if masked_indices is not None and idx in masked_indices: 61 | print(colored(string_to_print, 'grey', 'on_yellow')) 62 | elif indices_to_exclude is not None and idx in indices_to_exclude: 63 | print(colored(string_to_print, 'grey', 'on_grey')) 64 | else: 65 | print(string_to_print) 66 | msg += string_to_print + "\n" 67 | 68 | return msg 69 | 70 | 71 | def __get_topk(log_probs, topk): 72 | value_max_probs, index_max_probs = torch.topk(input=log_probs, k=topk, dim=1) 73 | index_max_probs = index_max_probs.numpy() 74 | value_max_probs = value_max_probs.detach().numpy() 75 | return value_max_probs, index_max_probs 76 | 77 | 78 | def print_sentence_predictions(log_probs, token_ids, vocab, 79 | masked_indices=None, print_generation=True, 80 | topk=1000): 81 | 82 | msg = "\n" 83 | log_probs = log_probs[:len(token_ids)] 84 | value_max_probs, index_max_probs = __get_topk(log_probs, topk) 85 | 86 | # remove special symbols from token_ids 87 | excluded_indices = __exclude_tokens([t for t in token_ids], vocab) 88 | 89 | # score only first mask 90 | #masked_indices = masked_indices[:1] 91 | 92 | tokens = torch.from_numpy(np.asarray(token_ids)) 93 | 94 | # get ranking position in topk 95 | query = tokens.squeeze().data.unsqueeze(-1) 96 | query = query.repeat(1, topk) 97 | 98 | ranking_position = (index_max_probs == query.numpy()).nonzero() 99 | 100 | rank_dict = dict(zip(*ranking_position)) 101 | 102 | # get positional score of the correct token 103 | token_probs = log_probs.gather( 104 | dim=1, 105 | index=tokens.view(-1, 1), 106 | ) 107 | positional_scores = token_probs.squeeze(-1).detach().numpy() 108 | 109 | score_sum = 0. 
110 | count = 0 111 | for idx, score in enumerate(positional_scores): 112 | if idx not in excluded_indices: 113 | score_sum += score 114 | count += 1 115 | 116 | if count > 0: 117 | avg_nll_loss = - (score_sum / count) 118 | else: 119 | avg_nll_loss = 0.0 120 | perplexity = np.exp(avg_nll_loss) 121 | 122 | # print("positional_scores: {}".format(positional_scores)) 123 | # print("avg_nll_loss: {}".format(avg_nll_loss)) 124 | 125 | __print_generation(positional_scores, token_ids, vocab, rank_dict, 126 | index_max_probs, value_max_probs, topk, 127 | excluded_indices, masked_indices, print_generation) 128 | 129 | # msg += return_msg 130 | msg += '| Perplexity: {:.3f}\n'.format(perplexity) 131 | 132 | if print_generation: 133 | print("\n"+msg+"\n") 134 | 135 | return perplexity, msg 136 | 137 | 138 | def load_vocab(vocab_filename): 139 | with open(vocab_filename, "r") as f: 140 | lines = f.readlines() 141 | vocab = [x.strip() for x in lines] 142 | return vocab 143 | -------------------------------------------------------------------------------- /mlama/vocab_intersection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | from tqdm import tqdm 9 | import argparse 10 | import spacy 11 | import lama.modules.base_connector as base 12 | 13 | 14 | CASED_MODELS = [ 15 | # { 16 | # # "FAIRSEQ WIKI103" 17 | # "lm": "fairseq", 18 | # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", 19 | # "fairseq_model_name": "wiki103.pt", 20 | # "task": "language_modeling", 21 | # "cpu": True, 22 | # "output_dictionary_size": -1 23 | # }, 24 | { 25 | # "TransformerXL" 26 | "lm": "transformerxl", 27 | "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", 28 | }, 29 | { 30 | # "ELMO ORIGINAL" 31 | "lm": "elmo", 32 | "elmo_model_dir": "pre-trained_language_models/elmo/original", 33 | "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", 34 | "elmo_vocab_name": "vocab-2016-09-10.txt", 35 | "elmo_warm_up_cycles": 5 36 | }, 37 | { 38 | # "ELMO ORIGINAL 5.5B" 39 | "lm": "elmo", 40 | "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", 41 | "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", 42 | "elmo_vocab_name": "vocab-enwiki-news-500000.txt", 43 | "elmo_warm_up_cycles": 5 44 | }, 45 | { 46 | # "BERT BASE CASED" 47 | "lm": "bert", 48 | "bert_model_name": "bert-base-cased", 49 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12/", 50 | "bert_vocab_name": "vocab.txt" 51 | }, 52 | { 53 | # "BERT LARGE CASED" 54 | "lm" : "bert", 55 | "bert_model_name": "bert-large-cased", 56 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/", 57 | "bert_vocab_name": "vocab.txt" 58 | } 59 | ] 60 | 61 | CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt" 62 | 63 | LOWERCASED_MODELS = [ 64 | { 65 | # "BERT BASE UNCASED" 66 | "lm": "bert", 67 | "bert_model_name": "bert-base-uncased", 68 | "bert_model_dir": None, 69 | "bert_vocab_name": "vocab.txt" 70 | }, 71 | { 72 | # "BERT LARGE UNCASED" 73 | "lm": "bert", 74 | "bert_model_name": "bert-large-uncased", 75 | "bert_model_dir": None, 76 | "bert_vocab_name": "vocab.txt" 77 | }, 78 | { 79 | # "OpenAI GPT" 80 | "lm": "gpt", 81 | 
"gpt_model_dir": None, 82 | "gpt_model_name": "openai-gpt" 83 | } 84 | ] 85 | 86 | LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt" 87 | 88 | 89 | def __vocab_intersection(models, filename): 90 | 91 | vocabularies = [] 92 | 93 | for arg_dict in models: 94 | 95 | args = argparse.Namespace(**arg_dict) 96 | print(args) 97 | model = build_model_by_name(args.lm, args) 98 | 99 | vocabularies.append(model.vocab) 100 | print(type(model.vocab)) 101 | 102 | if len(vocabularies) > 0: 103 | common_vocab = set(vocabularies[0]) 104 | for vocab in vocabularies: 105 | common_vocab = common_vocab.intersection(set(vocab)) 106 | 107 | # no special symbols in common_vocab 108 | for symbol in base.SPECIAL_SYMBOLS: 109 | if symbol in common_vocab: 110 | common_vocab.remove(symbol) 111 | 112 | # remove stop words 113 | from spacy.lang.en.stop_words import STOP_WORDS 114 | for stop_word in STOP_WORDS: 115 | if stop_word in common_vocab: 116 | print(stop_word) 117 | common_vocab.remove(stop_word) 118 | 119 | common_vocab = list(common_vocab) 120 | 121 | # remove punctuation and symbols 122 | nlp = spacy.load('en') 123 | manual_punctuation = ['(', ')', '.', ','] 124 | new_common_vocab = [] 125 | for i in tqdm(range(len(common_vocab))): 126 | word = common_vocab[i] 127 | doc = nlp(word) 128 | token = doc[0] 129 | if(len(doc) != 1): 130 | print(word) 131 | for idx, tok in enumerate(doc): 132 | print("{} - {}".format(idx, tok)) 133 | elif word in manual_punctuation: 134 | pass 135 | elif token.pos_ == "PUNCT": 136 | print("PUNCT: {}".format(word)) 137 | elif token.pos_ == "SYM": 138 | print("SYM: {}".format(word)) 139 | else: 140 | new_common_vocab.append(word) 141 | # print("{} - {}".format(word, token.pos_)) 142 | common_vocab = new_common_vocab 143 | 144 | # store common_vocab on file 145 | with open(filename, 'w') as f: 146 | for item in sorted(common_vocab): 147 | f.write("{}\n".format(item)) 148 | 149 | 150 | def main(): 151 | # cased version 152 | __vocab_intersection(CASED_MODELS, CASED_COMMON_VOCAB_FILENAME) 153 | # lowercased version 154 | __vocab_intersection(LOWERCASED_MODELS, LOWERCASED_COMMON_VOCAB_FILENAME) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.2 2 | numpy==1.15.1 3 | torch==1.0.1 4 | pytorch-pretrained-bert==0.6.1 5 | allennlp==0.8.5 6 | spacy==2.1.8 7 | tqdm==4.26.0 8 | termcolor==1.1.0 9 | pandas==0.23.4 10 | fairseq==0.8.0 11 | colorama==0.4.1 12 | scipy==1.3.2 13 | -------------------------------------------------------------------------------- /scripts/batch_eval_KB_completion_mBERT_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | from mlama.modules import build_model_by_name 8 | import mlama.utils as utils 9 | from mlama.utils import print_sentence_predictions, load_vocab 10 | import mlama.options as options 11 | from tqdm import tqdm 12 | from random import shuffle 13 | import os 14 | import json 15 | import spacy 16 | import mlama.modules.base_connector as base 17 | from pprint import pprint 18 | import logging.config 19 | import logging 20 | import pickle 21 | from multiprocessing.pool import ThreadPool 22 | import multiprocessing 23 | import mlama.evaluation_metrics_ranked as metrics 24 | import time, sys 25 | import torch 26 | import numpy as np 27 | 28 | def load_file(filename): 29 | data = [] 30 | with open(filename, "r") as f: 31 | for line in f.readlines(): 32 | data.append(json.loads(line)) 33 | return data 34 | 35 | 36 | def create_logdir_with_timestamp(base_logdir, modelname): 37 | timestr = time.strftime("%Y%m%d_%H%M%S") 38 | 39 | # create new directory 40 | log_directory = "{}/{}_{}/".format(base_logdir, modelname, timestr) 41 | os.makedirs(log_directory) 42 | 43 | path = "{}/last".format(base_logdir) 44 | try: 45 | os.unlink(path) 46 | except Exception: 47 | pass 48 | os.symlink(log_directory, path) 49 | return log_directory 50 | 51 | 52 | def parse_template(template, subject_label, object_label): 53 | SUBJ_SYMBOL = "[X]" 54 | OBJ_SYMBOL = "[Y]" 55 | template = template.replace(SUBJ_SYMBOL, subject_label) 56 | template = template.replace(OBJ_SYMBOL, object_label) 57 | return [template] 58 | 59 | 60 | def init_logging(log_directory): 61 | logger = logging.getLogger("LAMA") 62 | logger.setLevel(logging.DEBUG) 63 | 64 | os.makedirs(log_directory, exist_ok=True) 65 | 66 | # logging format 67 | # "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 68 | formatter = logging.Formatter( 69 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 70 | ) 71 | 72 | # file handler 73 | fh = logging.FileHandler(str(log_directory) + "/info.log") 74 | fh.setLevel(logging.DEBUG) 75 | fh.setFormatter(formatter) 76 | 77 | # console handler 78 | ch = logging.StreamHandler(sys.stdout) 79 | ch.setLevel(logging.WARNING) 80 | ch.setFormatter(formatter) 81 | 82 | logger.addHandler(fh) 83 | logger.addHandler(ch) 84 | 85 | logger.propagate = False 86 | 87 | return logger 88 | 89 | 90 | def batchify(data, batch_size): 91 | msg = "" 92 | list_samples_batches = [] 93 | list_sentences_batches = [] 94 | current_samples_batch = [] 95 | current_sentences_batches = [] 96 | c = 0 97 | 98 | # sort to group together sentences with similar length 99 | for sample in sorted( 100 | data, key=lambda k: len(" ".join(k["masked_sentences"]).split()) 101 | ): 102 | masked_sentences = sample["masked_sentences"] 103 | current_samples_batch.append(sample) 104 | current_sentences_batches.append(masked_sentences) 105 | c += 1 106 | if c >= batch_size: 107 | list_samples_batches.append(current_samples_batch) 108 | list_sentences_batches.append(current_sentences_batches) 109 | current_samples_batch = [] 110 | current_sentences_batches = [] 111 | c = 0 112 | 113 | # last batch 114 | if current_samples_batch and len(current_samples_batch) > 0: 115 | list_samples_batches.append(current_samples_batch) 116 | list_sentences_batches.append(current_sentences_batches) 117 | 118 | return list_samples_batches, list_sentences_batches, msg 119 | 120 | 121 | def run_thread(arguments): 122 | 123 | msg = "" 124 | 125 | # 1. 
compute the ranking metrics on the filtered log_probs tensor 126 | experiment_result, return_msg = metrics.get_ranking( 127 | arguments["filtered_log_probs"], 128 | arguments["sample"], 129 | arguments["masked_indices"], 130 | arguments["vocab"], 131 | arguments["candidates"], 132 | label_index=arguments["label_index"], 133 | index_list=arguments["index_list"], 134 | print_generation=arguments["interactive"], 135 | topk=10, 136 | ) 137 | msg += "\n" + return_msg 138 | 139 | return experiment_result, msg 140 | 141 | 142 | def lowercase_samples(samples, use_negated_probes=False): 143 | new_samples = [] 144 | for sample in samples: 145 | sample["obj_label"] = sample["obj_label"].lower() 146 | sample["sub_label"] = sample["sub_label"].lower() 147 | lower_masked_sentences = [] 148 | for sentence in sample["masked_sentences"]: 149 | sentence = sentence.lower() 150 | sentence = sentence.replace(base.MASK.lower(), base.MASK) 151 | lower_masked_sentences.append(sentence) 152 | sample["masked_sentences"] = lower_masked_sentences 153 | 154 | new_samples.append(sample) 155 | return new_samples 156 | 157 | 158 | def filter_samples(model, samples, vocab_subset, max_sentence_length, template): 159 | msg = "" 160 | new_samples = [] 161 | samples_excluded = 0 162 | for sample in samples: 163 | excluded = False 164 | if "obj_label" in sample and "sub_label" in sample: 165 | 166 | obj_label_ids = model.get_id(sample["obj_label"]) 167 | 168 | if obj_label_ids: 169 | reconstructed_word = " ".join( 170 | [model.vocab[x] for x in obj_label_ids] 171 | ).strip() 172 | else: 173 | reconstructed_word = None 174 | 175 | excluded = False 176 | if not template or len(template) == 0: 177 | masked_sentences = sample["masked_sentences"] 178 | text = " ".join(masked_sentences) 179 | if len(text.split()) > max_sentence_length: 180 | msg += "\tEXCLUDED for exceeding max sentence length: {}\n".format( 181 | masked_sentences 182 | ) 183 | samples_excluded += 1 184 | excluded = True 185 | """if sample['from_english']: 186 | msg += "\tEXCLUDED not in language \n" 187 | excluded = True 188 | samples_excluded += 1""" 189 | # MAKE SURE THAT obj_label IS IN VOCABULARIES 190 | if vocab_subset: 191 | for x in sample["obj_label"].split(" "): 192 | if x not in vocab_subset: 193 | excluded = True 194 | msg += "\tEXCLUDED object label {} not in vocab subset\n".format( 195 | sample["obj_label"] 196 | ) 197 | samples_excluded += 1 198 | break 199 | if excluded: 200 | pass 201 | elif obj_label_ids is None: 202 | msg += "\tEXCLUDED object label {} is None\n".format( 203 | sample["obj_label"] 204 | ) 205 | samples_excluded += 1 206 | 207 | # samples_excluded+=1 208 | elif "judgments" in sample: 209 | # only for Google-RE 210 | num_no = 0 211 | num_yes = 0 212 | for x in sample["judgments"]: 213 | if x["judgment"] == "yes": 214 | num_yes += 1 215 | else: 216 | num_no += 1 217 | if num_no > num_yes: 218 | # SKIP NEGATIVE EVIDENCE 219 | pass 220 | else: 221 | new_samples.append(sample) 222 | else: 223 | new_samples.append(sample) 224 | else: 225 | msg += "\tEXCLUDED since 'obj_label' or 'sub_label' not in sample: {}\n".format( 226 | sample 227 | ) 228 | samples_excluded += 1 229 | msg += "samples excluded: {}\n".format(samples_excluded) 230 | return new_samples, msg 231 | 232 | 233 | def main(args, NUM_MASK, candidates, shuffle_data=True, model=None): 234 | 235 | if len(args.models_names) > 1: 236 | raise ValueError('Please specify a single language model (e.g., --lm "bert").') 237 | 238 | msg = "" 239 | 240 | [model_type_name] = 
args.models_names 241 | 242 | if model is None: 243 | model = build_model_by_name(model_type_name, args) 244 | 245 | if model_type_name == "fairseq": 246 | model_name = "fairseq_{}".format(args.fairseq_model_name) 247 | elif model_type_name == "bert": 248 | model_name = "BERT_{}".format(args.bert_model_name) 249 | elif model_type_name == "elmo": 250 | model_name = "ELMo_{}".format(args.elmo_model_name) 251 | else: 252 | model_name = model_type_name.title() 253 | 254 | # initialize logging 255 | if args.full_logdir: 256 | log_directory = args.full_logdir 257 | else: 258 | log_directory = create_logdir_with_timestamp(args.logdir, model_name) 259 | logger = init_logging(log_directory) 260 | msg += "model name: {}\n".format(model_name) 261 | 262 | # deal with vocab subset 263 | vocab_subset = None 264 | index_list = None 265 | msg += "args: {}\n".format(args) 266 | if args.common_vocab_filename is not None: 267 | vocab_subset = load_vocab(args.common_vocab_filename) 268 | msg += "common vocabulary size: {}\n".format(len(vocab_subset)) 269 | 270 | # optimization for some LM (such as ELMo) 271 | model.optimize_top_layer(vocab_subset) 272 | 273 | filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs( 274 | vocab_subset, logger 275 | ) 276 | 277 | logger.info("\n" + msg + "\n") 278 | 279 | # dump arguments on file for log 280 | with open("{}/args.json".format(log_directory), "w") as outfile: 281 | json.dump(vars(args), outfile) 282 | 283 | data = load_file(args.dataset_filename) 284 | 285 | if args.lowercase: 286 | # lowercase all samples 287 | logger.info("lowercasing all samples...") 288 | all_samples = lowercase_samples( 289 | data, use_negated_probes=args.use_negated_probes 290 | ) 291 | else: 292 | # keep samples as they are 293 | all_samples = data 294 | 295 | 296 | # create uuid if not present 297 | i = 0 298 | for sample in all_samples: 299 | sample["uuid"] = i 300 | i += 1 301 | 302 | 303 | 304 | 305 | all_samples, ret_msg = filter_samples( 306 | model, data, vocab_subset, args.max_sentence_length, args.template 307 | ) 308 | 309 | # OUT_FILENAME = "{}.jsonl".format(args.dataset_filename) 310 | # with open(OUT_FILENAME, 'w') as outfile: 311 | # for entry in all_samples: 312 | # json.dump(entry, outfile) 313 | # outfile.write('\n') 314 | 315 | logger.info("\n" + ret_msg + "\n") 316 | 317 | 318 | # if template is active (1) use a single example for (sub,obj) and (2) ... 
319 | if args.template and args.template != "": 320 | facts = [] 321 | for sample in all_samples: 322 | sub = sample["sub_label"] 323 | obj = sample["obj_label"] 324 | uuid = sample["uuid"] 325 | if (sub, obj, uuid) not in facts: 326 | facts.append((sub, obj, uuid)) 327 | local_msg = "distinct template facts: {}".format(len(facts)) 328 | logger.info("\n" + local_msg + "\n") 329 | print(local_msg) 330 | all_samples = [] 331 | for fact in facts: 332 | (sub, obj, uuid) = fact 333 | sample = {"sub_label": sub, "obj_label": obj, "uuid": uuid} 334 | # substitute all sentences with a standard template 335 | sample["masked_sentences"] = parse_template( 336 | args.template.strip(), sample["sub_label"].strip(), base.MASK 337 | ) 338 | 339 | all_samples.append(sample) 340 | 341 | # shuffle data 342 | if shuffle_data: 343 | shuffle(all_samples) 344 | 345 | samples_batches, sentences_batches, ret_msg = batchify(all_samples, args.batch_size) 346 | logger.info("\n" + ret_msg + "\n") 347 | 348 | # ThreadPool 349 | num_threads = args.threads 350 | if num_threads <= 0: 351 | # use all available threads 352 | num_threads = multiprocessing.cpu_count() 353 | pool = ThreadPool(num_threads) 354 | list_of_results = [] 355 | 356 | for i in tqdm(range(len(samples_batches))): 357 | 358 | samples_b = samples_batches[i] 359 | sentences_b = [] 360 | current_batch_size = len(samples_b) 361 | for i, sample in enumerate(samples_b): 362 | masked_sentences = [] 363 | for num_mask in range(1, NUM_MASK+1): 364 | sentence = sample["masked_sentences"][0] 365 | sentence = sentence.replace(base.MASK, base.MASK * num_mask) 366 | sentence = sentence.replace("][", "] [") 367 | masked_sentences.append(sentence) 368 | sentences_b.append([sentence]) 369 | samples_b[i]["masked_sentences"] = masked_sentences 370 | ( 371 | original_log_probs_list, 372 | token_ids_list, 373 | masked_indices_list, 374 | ) = model.get_batch_generation(sentences_b, logger=logger) 375 | 376 | if vocab_subset is not None: 377 | # filter log_probs 378 | filtered_log_probs_list = model.filter_logprobs( 379 | original_log_probs_list, filter_logprob_indices 380 | ) 381 | else: 382 | filtered_log_probs_list = original_log_probs_list 383 | 384 | label_index_list = [] 385 | for sample in samples_b: 386 | obj_label_id = model.get_id(sample["obj_label"]) 387 | 388 | # MAKE SURE THAT obj_label IS IN VOCABULARIES 389 | if obj_label_id is None: 390 | raise ValueError( 391 | "object label id {} is None".format( 392 | sample["obj_label"] 393 | ) 394 | ) 395 | 396 | label_index_list.append(obj_label_id) 397 | 398 | dim_reshape = (current_batch_size, int(original_log_probs_list.shape[0]/current_batch_size), original_log_probs_list.shape[1], original_log_probs_list.shape[2]) 399 | original_log_probs_list = torch.reshape(original_log_probs_list, dim_reshape) 400 | filtered_log_probs_list = torch.reshape(filtered_log_probs_list, dim_reshape) 401 | 402 | masked_indices_list = np.reshape(np.array(masked_indices_list), (current_batch_size, int(len(masked_indices_list)/current_batch_size))) 403 | arguments = [ 404 | { 405 | "original_log_probs": original_log_probs, 406 | "filtered_log_probs": filtered_log_probs, 407 | "token_ids": token_ids, 408 | "vocab": model.vocab, 409 | "label_index": label_index, 410 | "masked_indices": masked_indices, 411 | "interactive": args.interactive, 412 | "index_list": index_list, 413 | "sample": sample, 414 | "candidates": candidates, 415 | } 416 | for sample, original_log_probs, filtered_log_probs, token_ids, label_index, masked_indices in zip( 417 | 
samples_b, original_log_probs_list, filtered_log_probs_list, token_ids_list, label_index_list, masked_indices_list, 418 | ) 419 | ] 420 | 421 | # multithread 422 | res = pool.map(run_thread, arguments) 423 | 424 | for idx, result in enumerate(res): 425 | 426 | result_masked_topk, msg = result 427 | 428 | logger.info("\n" + msg + "\n") 429 | 430 | sample = samples_b[idx] 431 | 432 | element = {"sample": sample, "uuid": sample["uuid"], "token_ids": token_ids_list[0], 433 | "masked_indices": masked_indices_list[0], "label_index": label_index_list[0], 434 | "masked_topk": result_masked_topk} 435 | 436 | list_of_results.append(element) 437 | 438 | pool.close() 439 | pool.join() 440 | 441 | # dump pickle with the result of the experiment 442 | all_results = dict( 443 | list_of_results=list_of_results 444 | ) 445 | with open("{}/result.pkl".format(log_directory), "wb") as f: 446 | pickle.dump(all_results, f) 447 | 448 | 449 | if __name__ == "__main__": 450 | parser = options.get_eval_KB_completion_parser() 451 | args = options.parse_args(parser) 452 | main(args) 453 | -------------------------------------------------------------------------------- /scripts/eval.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | 5 | problem = [] 6 | f_out = open("./output/mbert_ranked.csv", "w") 7 | output_path = "./output/results/mbert_base/" 8 | path_compare = "./output/results/bert_base/en/" 9 | languages = list(os.walk(output_path))[0][1:-1][0] 10 | dict_languages_total = {} 11 | dict_languages_P = {} 12 | 13 | for lang in languages: 14 | print(lang) 15 | P_all = [] 16 | P_all_eng = [] 17 | total_all = [] 18 | relations = list(os.walk(output_path + lang + "/"))[0][1:-1][0] 19 | for relation in relations: 20 | if "date" in relation: 21 | continue 22 | P = 0.0 23 | P_eng = 0.0 24 | total = 0.0 25 | 26 | with open(output_path + lang + "/" + relation + "/" + 'result.pkl', 'rb') as f: 27 | data = pickle.load(f) 28 | 29 | with open(path_compare + relation + "/" + 'result.pkl', 'rb') as f: 30 | data_eng = pickle.load(f) 31 | 32 | if len(data["list_of_results"]) >0: 33 | eng_dict = {} 34 | for d in data_eng["list_of_results"]: 35 | rank = 0.0 36 | if d['masked_topk']["rank"]==0: 37 | rank = 1.0 38 | eng_dict[d["sample"]["uuid"]] = [rank, d["sample"]] 39 | for d in data["list_of_results"]: 40 | rank = 0.0 41 | if d['masked_topk']["rank"]==0: 42 | rank = 1.0 43 | P += rank 44 | total += 1.0 45 | idx = int(d["sample"]["uuid"]) 46 | if idx in eng_dict: 47 | P_eng += eng_dict[idx][0] 48 | 49 | P_all.append(P/total) 50 | P_all_eng.append(P_eng/total) 51 | total_all.append(total) 52 | 53 | f_out.write(lang) 54 | f_out.write(",") 55 | f_out.write(str(np.sum(total_all))) 56 | f_out.write(",") 57 | f_out.write(str(np.mean(P_all))) 58 | f_out.write(",") 59 | f_out.write(str(np.mean(P_all_eng))) 60 | f_out.write("\n") 61 | f_out.close() 62 | -------------------------------------------------------------------------------- /scripts/run_experiments_mBERT_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import argparse 8 | from batch_eval_KB_completion_mBERT_ranked import main as run_evaluation 9 | from batch_eval_KB_completion_mBERT_ranked import load_file 10 | from mlama.modules import build_model_by_name 11 | import pprint 12 | import statistics 13 | from os import listdir 14 | import os 15 | from os.path import isfile, join 16 | from shutil import copyfile 17 | from collections import defaultdict 18 | import json 19 | 20 | LMs = [ 21 | { 22 | "lm": "bert", 23 | "label": "mbert_base", 24 | "models_names": ["bert"], 25 | "bert_model_name": "bert-base-multilingual-cased", 26 | "bert_model_dir": None 27 | }, 28 | ] 29 | 30 | 31 | def run_experiments( 32 | relations, 33 | data_path_pre, 34 | data_path_post, 35 | language, 36 | input_param={ 37 | "lm": "bert", 38 | "label": "bert_large", 39 | "models_names": ["bert"], 40 | "bert_model_name": "bert-large-cased", 41 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", 42 | }, 43 | ): 44 | model = None 45 | pp = pprint.PrettyPrinter(width=41, compact=True) 46 | if "P" in relations[0]["relation"]: 47 | object_path = "./data/TREx_multilingual_objects/" + language + ".json" 48 | else: 49 | object_path = "./data/GoogleRE_objects/" + language + ".json" 50 | 51 | with open(object_path) as f: 52 | candidates = json.load(f) 53 | 54 | for relation in relations: 55 | pp.pprint(relation) 56 | PARAMETERS = { 57 | "dataset_filename": "{}{}{}".format( 58 | data_path_pre, relation["relation"], data_path_post 59 | ), 60 | "common_vocab_filename": None, 61 | "template": "", 62 | "bert_vocab_name": "vocab.txt", 63 | "batch_size": 4, 64 | "logdir": "output", 65 | "full_logdir": "output/results/{}/{}/{}".format( 66 | input_param["label"], language, relation["relation"] 67 | ), 68 | "lowercase": False, 69 | "max_sentence_length": 100, 70 | "threads": -1, 71 | "interactive": False, 72 | } 73 | 74 | if "template" in relation: 75 | PARAMETERS["template"] = relation["template"] 76 | 77 | PARAMETERS.update(input_param) 78 | print(PARAMETERS) 79 | 80 | args = argparse.Namespace(**PARAMETERS) 81 | 82 | # see if file exists 83 | try: 84 | data = load_file(args.dataset_filename) 85 | except Exception as e: 86 | print("Relation {} excluded.".format(relation["relation"])) 87 | print("Exception: {}".format(e)) 88 | continue 89 | 90 | if model is None: 91 | [model_type_name] = args.models_names 92 | model = build_model_by_name(model_type_name, args) 93 | 94 | max_length = 0 95 | dict_num_mask = {} 96 | for obj in candidates[relation["relation"]]["objects"]: 97 | if len(model.tokenizer.tokenize(obj)) > max_length: 98 | max_length = len(model.tokenizer.tokenize(obj)) 99 | for l in range(1, max_length+1): 100 | dict_num_mask[l] = {} 101 | for obj in candidates[relation["relation"]]["objects"]: 102 | dict_num_mask[len(model.tokenizer.tokenize(obj))][obj] = model.get_id(obj) 103 | 104 | run_evaluation(args, max_length, dict_num_mask, shuffle_data=False, model=model) 105 | 106 | 107 | def get_TREx_parameters(data_path_pre="data/"): 108 | relations = load_file("{}relations.jsonl".format(data_path_pre)) 109 | data_path_pre += "TREx/" 110 | data_path_post = ".jsonl" 111 | return relations, data_path_pre, data_path_post 112 | 113 | 114 | def get_GoogleRE_parameters(): 115 | relations = [ 116 | { 117 | "relation": "place_of_birth", 118 | "template": "[X] was born in [Y] .", 119 | "template_negated": "[X] was not born in [Y] .", 120 | }, 121 | { 122 | "relation": "date_of_birth", 123 | "template": "[X] (born [Y]).", 124 | "template_negated": "[X] (not 
born [Y]).", 125 | }, 126 | { 127 | "relation": "place_of_death", 128 | "template": "[X] died in [Y] .", 129 | "template_negated": "[X] did not die in [Y] .", 130 | }, 131 | ] 132 | data_path_pre = "data/Google_RE/" 133 | data_path_post = "_test.jsonl" 134 | return relations, data_path_pre, data_path_post 135 | 136 | 137 | def get_MultiLingual_parameters(data_path_pre="./data/mlama1.1/", language=""): 138 | relations = load_file("{}/{}/templates.jsonl".format(data_path_pre, language)) 139 | data_path_pre += language + "/" 140 | data_path_post = ".jsonl" 141 | return relations, data_path_pre, data_path_post, language 142 | 143 | 144 | """def get_MultiLingual_parameters_GoogleRe(data_path_pre="./data/", language=""): 145 | relations = load_file("{}/templates.jsonl".format(data_path_pre, language)) 146 | data_path_pre += language + "/" 147 | data_path_post = "_test.jsonl" 148 | return relations, data_path_pre, data_path_post, language""" 149 | 150 | 151 | def run_all_LMs(parameters): 152 | for ip in LMs: 153 | print(ip["label"]) 154 | run_experiments(*parameters, input_param=ip) 155 | 156 | 157 | def main(): 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument('--lang', '-l', type=str, default="fr", help='language') 160 | 161 | args = parser.parse_args() 162 | 163 | l = args.lang 164 | print(l) 165 | parameters = get_MultiLingual_parameters(language=l) 166 | run_all_LMs(parameters) 167 | 168 | if __name__ == "__main__": 169 | main() 170 | --------------------------------------------------------------------------------