├── .gitignore ├── LICENSE ├── README.md ├── data ├── GoogleRE_objects │ ├── af.json │ ├── ar.json │ ├── az.json │ ├── be.json │ ├── bg.json │ ├── bn.json │ ├── ca.json │ ├── ceb.json │ ├── cs.json │ ├── cy.json │ ├── da.json │ ├── de.json │ ├── el.json │ ├── en.json │ ├── es.json │ ├── et.json │ ├── eu.json │ ├── fa.json │ ├── fi.json │ ├── fr.json │ ├── ga.json │ ├── gl.json │ ├── he.json │ ├── hi.json │ ├── hr.json │ ├── hu.json │ ├── hy.json │ ├── id.json │ ├── it.json │ ├── ja.json │ ├── ka.json │ ├── ko.json │ ├── la.json │ ├── lt.json │ ├── lv.json │ ├── ms.json │ ├── nl.json │ ├── pl.json │ ├── pt.json │ ├── ro.json │ ├── ru.json │ ├── sk.json │ ├── sl.json │ ├── sq.json │ ├── sr.json │ ├── sv.json │ ├── ta.json │ ├── th.json │ ├── tr.json │ ├── uk.json │ ├── ur.json │ ├── vi.json │ └── zh.json └── TREx_multilingual_objects │ ├── af.json │ ├── an.json │ ├── ar.json │ ├── ast.json │ ├── az.json │ ├── azb.json │ ├── ba.json │ ├── bar.json │ ├── be.json │ ├── bg.json │ ├── bn.json │ ├── br.json │ ├── bs.json │ ├── ca.json │ ├── ce.json │ ├── ceb.json │ ├── cs.json │ ├── cv.json │ ├── cy.json │ ├── da.json │ ├── de.json │ ├── el.json │ ├── en.json │ ├── es.json │ ├── et.json │ ├── eu.json │ ├── fa.json │ ├── fi.json │ ├── fr.json │ ├── ga.json │ ├── gl.json │ ├── gu.json │ ├── he.json │ ├── hi.json │ ├── hr.json │ ├── ht.json │ ├── hu.json │ ├── hy.json │ ├── id.json │ ├── io.json │ ├── is.json │ ├── it.json │ ├── ja.json │ ├── jv.json │ ├── ka.json │ ├── kk.json │ ├── kn.json │ ├── ko.json │ ├── ky.json │ ├── la.json │ ├── lb.json │ ├── lmo.json │ ├── lt.json │ ├── lv.json │ ├── mg.json │ ├── min.json │ ├── mk.json │ ├── ml.json │ ├── mn.json │ ├── mr.json │ ├── ms.json │ ├── my.json │ ├── nds.json │ ├── ne.json │ ├── new.json │ ├── nl.json │ ├── nn.json │ ├── no.json │ ├── oc.json │ ├── pa.json │ ├── pl.json │ ├── pms.json │ ├── pnb.json │ ├── pt.json │ ├── ro.json │ ├── ru.json │ ├── scn.json │ ├── sco.json │ ├── sh.json │ ├── sk.json │ ├── sl.json │ ├── sq.json │ ├── sr.json │ ├── su.json │ ├── sv.json │ ├── sw.json │ ├── ta.json │ ├── te.json │ ├── tg.json │ ├── th.json │ ├── tl.json │ ├── tr.json │ ├── tt.json │ ├── uk.json │ ├── ur.json │ ├── uz.json │ ├── vi.json │ ├── vo.json │ ├── war.json │ ├── yo.json │ └── zh.json ├── dataset ├── cleanup.py ├── download_trexentities.py ├── download_wikidata.py ├── mbertlangs.txt ├── mlama.sh ├── reader.py ├── relations.py ├── requirements.txt ├── translate_googlere.py ├── translate_templates.py ├── translate_trex.py └── utils.py ├── mlama ├── __init__.py ├── build_encoded_dataset.py ├── eval_generation.py ├── evaluation_metrics_ranked.py ├── get_contextual_embeddings.py ├── modules │ ├── __init__.py │ ├── base_connector.py │ ├── bert_connector.py │ └── bert_connector_.py ├── options.py ├── utils.py └── vocab_intersection.py ├── requirements.txt └── scripts ├── batch_eval_KB_completion_mBERT_ranked.py ├── eval.py └── run_experiments_mBERT_ranked.py /.gitignore: -------------------------------------------------------------------------------- 1 | RelationExtraction/emnlp2017-relation-extraction-master/resources/glove/glove.6B.50d.txt 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are 
written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # IPython checkpoints 62 | .ipynb_checkpoints 63 | 64 | # Mac os x stuff 65 | .DS_Store 66 | 67 | pre-trained_language_models/ 68 | src/ 69 | .idea 70 | */.mypy_cache 71 | LAMA-Internal/ 72 | data/ 73 | last_results.csv 74 | output/ 75 | 76 | last_* 77 | .nfs* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mLAMA: multilingual LAnguage Model Analysis 2 | 3 | This repository contains code for the EACL 2021 paper ["Multilingual LAMA: Investigating Knowledge in Multilingual Pretrained Language Models"](https://arxiv.org/abs/2102.00894). 4 | It extends the original LAMA probe to the multilingual setting, i.e., it probes knowledge in pre-trained language models across multiple languages. 5 | 6 | The repository is forked from https://github.com/facebookresearch/LAMA and adapted accordingly. 7 | 8 | ## The mLAMA probe 9 | 10 | To reproduce our results: 11 | 12 | ### 1. Create conda environment and install requirements 13 | 14 | (optional) It might be a good idea to use a separate conda environment. It can be created by running: 15 | ``` 16 | conda create -n mlama -y python=3.7 && conda activate mlama 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | Add the project to the `PYTHONPATH`: 21 | 22 | export PYTHONPATH=${PYTHONPATH}:/path-to-project 23 | 24 | ### 2. Download the data 25 | 26 | 27 | ```bash 28 | wget http://cistern.cis.lmu.de/mlama/mlama1.1.zip 29 | unzip mlama1.1.zip 30 | rm mlama1.1.zip 31 | mv mlama1.1 data/mlama1.1/ 32 | ``` 33 | 34 | ### 3. Run the experiments 35 | 36 | ```bash 37 | python scripts/run_experiments_mBERT_ranked.py --lang "fr" 38 | python scripts/eval.py 39 | ``` 40 | 41 | ## The dataset 42 | 43 | Code to recreate the dataset can be found in the folder `dataset`. 44 | 45 | We provide a class to read in the dataset in `dataset/reader.py`. Example for reading the data: 46 | ```python 47 | ml = MLama("data/mlama/") 48 | ml.load() 49 | ``` 50 | 51 | ## Reference: 52 | 53 | ```bibtex 54 | @inproceedings{kassner2021multilingual, 55 | title = "Multilingual {LAMA}: Investigating Knowledge in Multilingual Pretrained Language Models", 56 | author = {Kassner, Nora and 57 | Dufter, Philipp and 58 | Sch{\"u}tze, Hinrich}, 59 | booktitle = "to appear in Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics", 60 | year = "2021", 61 | address = "Online", 62 | publisher = "Association for Computational Linguistics", 63 | } 64 | 65 | @inproceedings{petroni2019language, 66 | title={Language Models as Knowledge Bases?}, 67 | author={F. Petroni, T. Rockt{\"{a}}schel, A. H. Miller, P. Lewis, A. Bakhtin, Y. Wu and S. 
Riedel}, 68 | booktitle={In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2019}, 69 | year={2019} 70 | } 71 | ``` 72 | 73 | ## Acknowledgements 74 | 75 | * [https://github.com/huggingface/pytorch-pretrained-BERT](https://github.com/huggingface/pytorch-pretrained-BERT) 76 | * [https://github.com/allenai/allennlp](https://github.com/allenai/allennlp) 77 | * [https://github.com/pytorch/fairseq](https://github.com/pytorch/fairseq) 78 | * https://github.com/facebookresearch/LAMA 79 | 80 | ## Licence 81 | 82 | mLAMA is licensed under the CC-BY-NC 4.0 license. The text of the license can be found [here](LICENSE). 83 | -------------------------------------------------------------------------------- /data/GoogleRE_objects/af.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/az.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/be.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ceb.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/cs.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["\u0158\u00edm", "Koda\u0148", "Praha", "Mnichov", "Sydney", "Var\u0161ava", "Burlington", "Antverpy", "Riga", "Pa\u0159\u00ed\u017e", "Stade", "Havana", "Jamajka", "Iowa", "Lond\u00fdn", "Madrid", "Sydney", "Mil\u00e1n", "Heidelberg", "Montr\u00e9al", "Var\u0161ava", "N\u011bmecko", "Pa\u0159\u00ed\u017e", "Bow", "Lipsko", "Dayton", "Sevilla", "Wellington", "Neapol", "Anglie", "\u0158\u00edm", "Lond\u00fdn", "Neapol", "Bradford", "Havana", "Stockholm", "Melbourne", "Var\u0161ava", "Anglie", "Lond\u00fdn", "Praha", "Montr\u00e9al", "Gloucester", "Budape\u0161\u0165", "Praha", "Jamajka", "Pa\u0159\u00ed\u017e", "Florencie", "Moskva", "Koda\u0148", "Anglie", "Praha", "Budape\u0161\u0165", "Tur\u00edn", "Sydney", "Anglie", "Praha", "Sydney", "Mil\u00e1n", "Aston", "Praha", "Ben\u00e1tky", "Toronto", "Nanking", "Tbilisi", "Francie", "\u0158\u00edm", "Sydney", "Filadelfie", "Cleveland", "Siena", "Stockholm", "Al\u017e\u00edr", "Fairfield", "Neapol", "Detroit", "Watford", "Liverpool", "Nevada", "Waterford", "Berl\u00edn", "Stuttgart", "Barcelona", "Polsko", "Pa\u0159\u00ed\u017e", "Pa\u0159\u00ed\u017e", "\u0158\u00edm", "Lond\u00fdn", "Wilmington", "Austin", "Seattle", "Vancouver", 
"Anglie", "Pa\u0159\u00ed\u017e", "Buckinghamshire", "Praha", "Melbourne", "It\u00e1lie", "Seattle", "Manchester", "Limerick", "Pa\u0159\u00ed\u017e", "Z\u00e1h\u0159eb", "Preston", "Oslo", "Al\u017e\u00edr", "Manchester", "Pa\u0159\u00ed\u017e", "Anglie", "Florencie", "Nottingham", "Pa\u0159\u00ed\u017e", "Z\u00e1h\u0159eb", "Alb\u00e1nie", "Praha", "Praha", "Brandon", "Boston", "\u0158\u00edm", "Ipswich", "Prefektura Tokio", "B\u011blehrad", "Neapol", "Riga", "Lipsko", "Barcelona", "\u0158\u00edm", "Praha", "Westminster", "Split", "Lisabon", "Split", "Moskva", "Edinburgh", "\u0160v\u00e9dsko", "Macon", "Bukure\u0161\u0165", "Kalifornie", "Madrid", "Dublin", "Columbus", "Lyon", "Pa\u0159\u00ed\u017e", "Francie", "Praha", "Var\u0161ava", "Brooklyn", "Como", "Lond\u00fdn", "Montr\u00e9al", "Lond\u00fdn", "Kalifornie", "Tur\u00edn", "Palermo", "V\u00edde\u0148", "Oslo", "Praha", "Polsko", "\u0160pan\u011blsko", "J\u00e1va", "V\u00edde\u0148", "Rochester", "Tur\u00edn", "Lvov", "Ben\u00e1tky", "Massachusetts", "Devon", "Pa\u0159\u00ed\u017e", "Berl\u00edn", "Chicago", "Toledo", "Surrey", "Anglie", "B\u011blehrad", "Praha", "Hamburk", "Slovensko", "Lipsko", "Rusko", "Finsko", "Victoria", "Hol\u0161t\u00fdnsko", "Berl\u00edn", "Weston", "Neapol", "Lyon", "Mil\u00e1n", "Amsterdam", "Pa\u0159\u00ed\u017e", "Mexiko", "It\u00e1lie", "Stuttgart", "Lond\u00fdn", "Praha", "Berl\u00edn", "Casablanca", "Tampa", "Belgie", "Jeruzal\u00e9m", "Lond\u00fdn", "Janov", "Anglie", "Lipsko", "Mil\u00e1n", "Vancouver", "Lond\u00fdn", "Tours", "Newport", "Springfield", "Austr\u00e1lie", "Mil\u00e1n", "Berl\u00edn", "Tottenham", "Brooklyn", "Borneo", "Berl\u00edn", "Ontario", "Filadelfie", "Mil\u00e1n", "Pa\u0159\u00ed\u017e", "\u0158\u00edm", "Nassau", "Bukure\u0161\u0165", "Lipsko", "V\u00edde\u0148", "Norwich", "Amsterdam", "Bratislava", "Moskva", "Neapol", "Pittsburgh", "Como", "Detroit", "Stuttgart", "Tbilisi", "Praha"], "subjects": ["Eduard Ender", "Eyolf Kleven", "Alois Wachsman", "Marcus Junkelmann", "Julia Wilson", "Zenon Nowosz", "Paul Daniels", "Peeter van Bredael", "Igors Vihrovs", "Renaud Gagneux", "Peter Ording", "Yanitzia Canetti", "Bernard Wright", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Clive Brooks", "Maja Tucholke", "Kelley Deal", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Leo Abrahams", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Aliuska L\u00f3pezov\u00e1", "Staffan de Mistura", "Brett Hayman", "Romuald Giegiel", "Colin Groves", "Benjamin Brecknell Turner", "Rudolf K\u0159es\u0165an", "David Atkinson", "Edward Gardner", "Andrea M\u00e1tayov\u00e1", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Herv\u00e9 Alphand", "Filippo Soffici", "Jelena Beljakovov\u00e1", "Victor Borge", "Stephen Fox", "Pavel \u017d\u00e1\u010dek", "Katalin Kar\u00e1dy", "Felice Giordano", "Danielle McGrath", "John Mundy", "Milan Orlowski", "Stephen Carr", "Guiniforte Solari", "Trevor Burton", "Jan Anton\u00edn Duchoslav", "Giovanni Francesco Commendone", "Jack Blum", "Kuo \u0164in-lung", "\u017dores Medved\u011bv", "Gratien Ga\u00ebl Suares", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Mark Buchanan", "Alessandro Frosini", "Ellen Gulbransonov\u00e1", "Paul Belmondo", "Richard Bayley", "Carlo Silipo", "Lawrence Kushner", "George Randolph Pearkes", "Dylan Taite", "Pat Nixonov\u00e1", 
"Michael Carney", "Guy De Saint Cyr", "Wilhelm Friedrich Boger", "Felipe Alfau", "Li'on Dici'an", "Marc Sangnier", "Corn\u00e9lie Falcon", "Stefano Nolfi", "Alaric Alexander Watts", "Collins J. Seitz", "Sahara Smith", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molyn", "Gabriel Bertrand", "John Borlase", "Petr Kroutil", "Michael Guider", "Tancr\u00e8de Dumas", "Jeff Simmons", "Matt O'Connor", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Snorre Valen", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Edward Locke", "Enrico Toselli", "Barry Howard", "Alain Ehrenberg", "Aleksandra Romani\u0107", "Thomas Nassi", "Martin Kratochv\u00edl", "Joseph Wilhelm Swoboda", "Tim Long", "Frederick Lewis Allen", "Anastasius I.", "Jamie Moses", "Take\u0161i Maeda", "Andrea Leki\u0107ov\u00e1", "Andrea Giani", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Miguel Garc\u00eda", "Augusto De Marsanich", "Franti\u0161ek Neuwirth", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Jennifer Smith", "Petar \u010culi\u0107", "Lev Le\u0161\u010denko", "Georgina Kennard", "Staffan de Mistura", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Catherine Pakenham", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "Lud\u011bk Fr\u00fdbort", "Jan Szyszko", "Joe Ascione", "Luca Princiotta", "Malcolm Cecil", "William Reed", "Nigel Preston", "Jimmy Greenspoon", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "Regina Mar\u0161\u00edkov\u00e1", "Gosia Piotrowska", "Manola Saavedra", "Ien Angov\u00e1", "Ernst Florian Winter", "Diane Greene", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Giulio Carpioni", "Sarah Stiles", "Neil Doncaster", "Charles Nicholas Aub\u00e9", "Meike Evers", "James Burnham", "Francisco Cervantes de Salazar", "Rob Heanley", "Barry Palmer", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Pavol Polakovi\u010d", "Johann Friedrich Schleusner", "Rosabelle Sinclair", "Sami Hinkka", "Murray Hocking", "Louis Gurlitt", "Heinz Oestergaard", "Jared Cohen", "Giuseppe de Majo", "Ernest Lafont", "Matteo Salvini", "Theodor Holman", "Ren\u00e9 Mayer", "Leopoldo Gout", "Giancarlo Primo", "Bernhard R\u00fchling", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Michel Lafosse", "Josef Chari\u0161", "Charles Forbes Ren\u00e9 de Montalembert", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Abbondio Sangiorgio", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Richard Hudson", "Borah Bergman", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Leon Bass", "Marisa Masullo", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Moritz Wilhelm Drobisch", "Adolf Patera", "Warren Carlyle", "Gijs Vermeulen", "Andrej \u0160eban", "Andrej Maratovi\u010d Babickij", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli", "Dave Marsh", "Kim Bauermeister", "Giorgi Ketojev", "Jan \u010cul\u00edk"]}, "place_of_death": {"objects": ["Stockholm", "Cambridge", "Konstantinopol", "Havaj", "Edinburgh", "Neapol", "Lyon", "V\u00edde\u0148", "\u0158\u00edm", "Var\u0161ava", "Belgie", "Cambridge", "Lond\u00fdn", "Amsterdam", "Lubla\u0148", "Exeter", "Florencie", "Jeruzal\u00e9m", "Litva", "Buffalo", "Vilnius", "Manhattan", "Cincinnati", "\u0158\u00edm", "Florida", "Tunisko", "Gent", 
"Kalifornie", "Sussex", "Manhattan", "Jerevan", "Helsinky", "Oxford", "Montr\u00e9al", "Florencie", "Lond\u00fdn", "Praha", "Pa\u0159\u00ed\u017e", "Madrid", "Lond\u00fdn", "Var\u0161ava", "Liverpool", "Z\u00e1h\u0159eb", "Perth", "Var\u0161ava", "Lipsko", "Praha", "Toronto", "Jokohama", "Siena", "Atlanta", "Berl\u00edn", "Detroit", "Birmingham", "Madrid", "Var\u0161ava", "Dover", "Lille", "Scarborough", "Konstantinopol", "Berkeley", "Sevilla", "Moskva", "Amsterdam", "Janov", "Filadelfie", "Praha"], "subjects": ["Johann Gustaf Sandberg", "Simon Greenleaf", "Jan I. Dukas", "Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Johan Stephan Decker", "Gioseppe Agnelli", "Adolf Dygasi\u0144ski", "Mark\u00e9ta Anglick\u00e1", "Grahame Clark", "Joshua Cristall", "Norbert van Bloemen", "Janez Bleiweis", "John Flavel", "Domenico Passignano", "Wolf Gold", "Simonas Daukantas", "Rose Clark", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Gotthard Deutsch", "Giovanni Battista Caccini", "Jim Chapin", "Georges Madon", "Robert van Audenaerd", "Ethel Catherwoodov\u00e1", "Thomas Slingsby Duncombe", "Der Scutt", "Toros Toramanian", "Olavi Paavolainen", "Homer Hasenpflug Dubs", "Edouard Gagnon", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Alexandr Gu\u010dkov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Frane Buli\u0107", "Vivian Bullwinkelov\u00e1", "Ludwika J\u0119drzejewicz", "Paul Luther", "Wilhelm Elsner", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Christa Wolfov\u00e1", "Orestes Brownson", "Martin Laroche", "Enrique Sarasola", "Wincenty Krasi\u0144ski", "Nathaniel William Wraxall", "John Shortland", "Stephen Joseph", "Abd\u00fclmecid I.", "Franti\u0161ek Wolf", "Felipe de Le\u00f3n", "Nikolaj Strunnikov", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/cy.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/et.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/eu.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhage", "Munich", "Sydney", "Milwaukee", "Dallas", "Stade", "Habana", "Livingston", "Varsovia", "Tacoma", "Londres", "Madril", "Amsterdam", "Sydney", "Milan", "Heidelberg", "Montreal", "Oklahoma", "Varsovia", "Alemania", "Leipzig", "Wellington", "Ingalaterra", "Lublin", "Hartford", "Erroma", "Londres", "Nebraska", "Bradford", "Melbourne", "Grezia", "Filadelfia", "Ingalaterra", "Shelby", "Montreal", "Praga", "Chicago", "Florentzia", "Kopenhage", "Praga", "Sydney", "Londres", "Ingalaterra", "Paris", "Sydney", "Boston", "Cornish", "Tbilisi", "Erroma", "Sydney", "Filadelfia", "Siena", "Aljer", "Chicago", 
"Nevada", "Waterford", "Stuttgart", "Bartzelona", "Sevilla", "Aljer", "Paris", "Polonia", "Pittsburgh", "Seattle", "Vancouver", "Utah", "Paris", "Pomerania", "Buckinghamshire", "Paris", "Paris", "Melbourne", "Manhattan", "Bergen", "Hollywood", "Limerick", "Paris", "Zagreb", "Preston", "Chicago", "Manchester", "Ingalaterra", "Nottingham", "Kalifornia", "Zagreb", "Albania", "Praga", "Erroma", "Orlando", "Riga", "Leipzig", "Erroma", "Westminster", "Split", "Chicago", "Split", "Macon", "Bukarest", "Kalifornia", "Espainia", "Columbus", "Lyon", "Paris", "Tallinn", "Frantzia", "Como", "Cardiff", "Paris", "Montreal", "Wilmington", "Turin", "Palermo", "Paris", "Viena", "Oslo", "Turin", "Lviv", "Paris", "Kensington", "Massachusetts", "Devon", "Berlin", "Surrey", "Hanburgo", "Oslo", "Budapest", "Errusia", "Victoria", "Holstein", "Paris", "Pennsylvania", "Milan", "Italia", "Lima", "Stuttgart", "Macon", "Praga", "Berlin", "Casablanca", "Tampa", "Genova", "Ingalaterra", "Leipzig", "Vancouver", "Londres", "Tours", "Newport", "Springfield", "Australia", "Milan", "Berlin", "Hartford", "Malta", "Borneo", "Berlin", "Cardiff", "Ontario", "Mississippi", "Tallinn", "Milan", "Erroma", "Bukarest", "Norwich", "Tallinn", "Amsterdam", "Baltimore", "Filadelfia", "Pittsburgh", "Chicago"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Robert Daniel Murphy", "Shannon Emerick", "Peter Ording", "Yanitzia Canetti", "Ken Niles", "Gaba Kulka", "Michael Manuel", "James William Wallack", "Fina de Calder\u00f3n", "Johan de Graeff", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Charles Kemper", "Stanis\u0142aw Urban", "Aurel Codoban", "Maja Tucholke", "Barry Mitcalfe", "Leo Abrahams", "J\u00f3zef Wieniawski", "Ann Corio", "Enrico Montesano", "Cliff Jones", "Edwin Sutherland", "Christfried Burmeister", "Brett Hayman", "Nektaria Karantzi", "Susan Denin", "Colin Groves", "Nina Repeta", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Byron Morrow", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Danielle McGrath", "David Parry", "John Mundy", "No\u00ebl Gallon", "Stephen Carr", "John Snyder", "Julie Duncan", "Zhores Medvedev", "Luzio Vero", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Paul Belmondo", "No I.D.", "Patricia Ryan Nixon", "Michael Carney", "Wilhelm Boger", "Felipe Alfau", "Cipriano de Valera", "Alain Dorval", "Marc Sangnier", "Eva Maria Zuk", "Buzzy Linhart", "Shyril O'Steen", "Peter Dembicki", "Leonard Strong", "Gabriel Bertrand", "Martin Kosleck", "John Borlase", "Jean Gallon", "Pierre Joxe", "Michael Guider", "Dennis Davis", "Kjersti Elvik", "Amy Chance", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Kip King", "John Mundy", "Edward Locke", "Barry Howard", "John Friedrich", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Anastasio I.a", "Davis Gaines", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Dino Wells", "Petar \u010culi\u0107", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "\u00c1ngel Garma", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Michael Armstrong", "Luca Princiotta", "Herbert Bowden, Baron Aylestone", "\u00c9mile L\u00e9vy", "William Reed", "Margaret Gwenver", "Alessio Secco", "Francesco Musotto", "Pierre Cartellier", "Marion Stein", "Erik Dammann", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Armand Toussaint", "Nigel Tangye", "Sarah Stiles", "Neil 
Doncaster", "Meike Evers", "Rob Heanley", "Frederick Franklin Schrader", "Jon Elster", "\u00c1kos Cs\u00e1sz\u00e1r", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Bill Dillard", "Matteo Salvini", "Giancarlo Primo", "Antonio Ruiz de Montoya", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Nick Karner", "Gabriel Caruana", "Michael Matus", "Erich Werdermann", "Simon Bowman", "Elizabeth Hess", "Judy Dunaway", "Aarne Ruben", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Alwina Valleria", "Bernie Lowe", "David Scott Milton", "Paul Willis"]}, "place_of_death": {"objects": ["Hawaii", "Edinburgh", "Richmond", "Lyon", "Varsovia", "Colchester", "Hollywood", "Filadelfia", "Cambridge", "Kingston", "Londres", "Carlisle", "Vilnius", "Manhattan", "Paris", "Florida", "Sussex", "Manhattan", "Londres", "Jerusalem", "Kalifornia", "Helsinki", "Oxford", "Houston", "Florentzia", "Londres", "Praga", "Paris", "Madril", "Londres", "Varsovia", "Liverpool", "Sevilla", "Leipzig", "Cheyenne", "Newark", "Toronto", "Yokohama", "Siena", "Atlanta", "Berlin", "Hollywood", "Detroit", "Berkeley", "Lille", "Scarborough", "Hollywood", "Boston", "Konstantinopla", "Paris", "Sevilla", "Bolonia", "Amsterdam", "Genova", "Filadelfia", "Praga"], "subjects": ["Donn Lewin", "Henry Siddons", "Alexander William Doniphan", "Joseph Jean-Baptiste Xavier Fournet", "Adolf Dygasi\u0144ski", "Hugh Iorys Hughes", "Romaine Fielding", "Coral Lansbury", "Grahame Clark", "Mariana Grajales", "Joshua Cristall", "Molly Pitcher", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Andr\u00e9 Chamson", "Jim Chapin", "Thomas Slingsby Duncombe", "Der Scutt", "Pauline Joran", "Elisha Netanyahu", "Anthony George", "Olavi Paavolainen", "Homer Hasenpflug Dubs", "Tommy Leonetti", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Alexander Gutxkov", "Juan P\u00e9rez de Montalv\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Floridablancako kondea", "Paul Luther", "William Pleater Davidge", "Jerry Damon", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Christa Wolf", "Gloria Grey", "Orestes Brownson", "Egon Petri", "John Shortland", "Stephen Joseph", "Anita King", "Robert Hazard", "Abdulmezid I.a", "Sibyl Sanderson", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/gl.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["M\u00fanic", "Sydney", "Dallas", "Stade", "Casablanca", "A Habana", "Suecia", "Iowa", "Londres", "Madrid", "Sydney", "Mil\u00e1n", "Heidelberg", "Arxentina", "Montreal", "Toronto", "Varsovia", "Alema\u00f1a", "Bow", "Leipzig", "Wellington", "Roma", "Londres", "Melbourne", "Inglaterra", "Montreal", "Praga", "Xamaica", "Montreal", "Florencia", "Colonia", "Praga", "Sydney", "Londres", "Manchester", "Inglaterra", "Sydney", "Belgrado", "Alema\u00f1a", "Sydney", "Filadelfia", "Melbourne", 
"Siena", "Escocia", "Chicago", "Waterford", "Berl\u00edn", "Stuttgart", "Par\u00eds", "Madrid", "Seattle", "Vancouver", "Inglaterra", "Par\u00eds", "Buckinghamshire", "Melbourne", "Manhattan", "Limerick", "Par\u00eds", "Honduras", "Zagreb", "Alxer", "Manchester", "Par\u00eds", "Miami", "Inglaterra", "Zagreb", "Albania", "Praga", "Roma", "Riga", "Glasgow", "Bristol", "Roma", "Split", "Edimburgo", "Split", "Londres", "Macon", "Londres", "Bucarest", "California", "Madrid", "Li\u00f3n", "Par\u00eds", "Tal\u00edn", "Francia", "Como", "Cardiff", "Montreal", "Tur\u00edn", "Palermo", "Viena", "Oslo", "Manchester", "Viena", "Sheffield", "Rochester", "Tur\u00edn", "Lviv", "Massachusetts", "Devon", "Berl\u00edn", "Surrey", "Hamburgo", "Rusia", "Victoria", "Berl\u00edn", "Par\u00eds", "Mil\u00e1n", "Italia", "Stuttgart", "Macon", "Praga", "Berl\u00edn", "Casablanca", "Tampa", "X\u00e9nova", "Inglaterra", "Leipzig", "Vancouver", "Londres", "Tours", "Newport", "Australia", "Mil\u00e1n", "Berl\u00edn", "Borneo", "Berl\u00edn", "Caracas", "Cardiff", "Ontario", "Tal\u00edn", "Mil\u00e1n", "Roma", "Bucarest", "Par\u00eds", "Tal\u00edn", "\u00c1msterdam", "Filadelfia", "Alxer", "Roma", "Lisboa"], "subjects": ["Marcus Junkelmann", "Julia Wilson", "Tom Jones", "Peter Ording", "Jos\u00e9 B\u00e9naz\u00e9raf", "Yanitzia Canetti", "Leonard Gyllenhaal", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Horatio Luro", "Wayne Eagling", "Tony Mitchell", "Stanis\u0142aw Urban", "Aurel Codoban", "Clive Brooks", "Maja Tucholke", "Barry Mitcalfe", "Enrico Montesano", "Cliff Jones", "Brett Hayman", "Colin Groves", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Lionel Tiger", "Filippo Soffici", "Josef Metternich", "Pavel \u017d\u00e1\u010dek", "Danielle McGrath", "David Parry", "Alfred Ollivant", "John Mundy", "Stephen Carr", "Aco Petrovi\u0107", "Antonio Ciacca", "Elizabeth Kell", "Francis Davis", "Justine Smethurst", "Alessandro Frosini", "John McHale", "No_ID", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Marc Sangnier", "Eduardo Lago", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molijn", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Dennis Davis", "Sam Lynch", "Claude Piel", "Vicente G\u00f3mez", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Alejandro Gonz\u00e1lez Trujillo", "Edward Locke", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Anastasio I, papa", "Mordehajs Dubins", "Ian Steel", "William Child", "Augusto De Marsanich", "Tomislav Smoljanovi\u0107", "Walter Elliot", "Petar \u010culi\u0107", "Andrew Bell", "Laurence Stallings", "Charles Dixon", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Michael Armstrong", "Luca Princiotta", "Herbert Bowden, Baron Aylestone", "William Reed", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "John Owens", "Ernst Florian Winter", "Trevor Taylor", "Diane Greene", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "Rob Heanley", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Heinz Oestergaard", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Matteo Salvini", "Giancarlo Primo", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, 
Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Michael Matus", "Erich Werdermann", "Rosario Marciano", "Simon Bowman", "Elizabeth Hess", "Aarne Ruben", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Elsa Lunghini", "Martin Zobel", "Gijs Vermeulen", "Bernie Lowe", "Hakim Toumi", "Luis Simarro", "Rui Tavares"]}, "place_of_death": {"objects": ["Hawai", "Edimburgo", "Francia", "Li\u00f3n", "Colombia", "Cambridge", "Londres", "Vilnius", "Manhattan", "Florida", "Madrid", "Manhattan", "Par\u00eds", "Oxford", "Florencia", "Londres", "Praga", "Madrid", "Londres", "Varsovia", "Leiden", "Liverpool", "Dunedin", "Toronto", "Iocoama", "Siena", "Berkeley", "Atlanta", "Berl\u00edn", "Detroit", "Madrid", "Lille", "Boston", "Sevilla", "Bolo\u00f1a", "\u00c1msterdam", "X\u00e9nova", "Filadelfia", "Praga"], "subjects": ["Donn Lewin", "Henry Siddons", "Walt Hansgen", "Joseph Jean-Baptiste Xavier Fournet", "Rafael Pombo", "Grahame Clark", "Joshua Cristall", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Jim Chapin", "Esperanza P\u00e9rez Labrador", "Der Scutt", "Jules Quicherat", "Homer Hasenpflug Dubs", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalv\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Tiberius Hemsterhuis", "Jefferson Lowndes", "Kurt Baier", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Andrew Imbrie", "Cesare Siepi", "Christa Wolf", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Robert Hazard", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/he.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/hr.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhagen", "M\u00fcnchen", "Sydney", "Stade", "Havana", "Iowa", "London", "Madrid", "Zagreb", "Sydney", "Heidelberg", "Montr\u00e9al", "London", "Var\u0161ava", "Njema\u010dka", "Leipzig", "Turska", "Sarajevo", "Sevilla", "Wellington", "Napulj", "Bologna", "London", "Napulj", "Bradford", "Melbourne", "Engleska", "Montr\u00e9al", "Zagreb", "Prag", "Turska", "Kopenhagen", "Prag", "Torino", "Sydney", "Engleska", "Sydney", "Toronto", "Sydney", "Philadelphia", "Siena", "Split", "Nevada", "Waterford", "Berlin", "Barcelona", "Pariz", "Turska", "Seattle", "Vancouver", "Pariz", "Buckinghamshire", "Melbourne", "Limerick", "Pariz", "Zagreb", "Al\u017eir", "Manchester", "Engleska", "Nottingham", "London", "Zagreb", "Albanija", "Prag", "Riga", "Leipzig", "Westminster", "Split", "Sarajevo", "Split", "Macon", "Bukure\u0161t", "Kalifornija", "Madrid", "Lyon", "Pariz", "Francuska", "Montr\u00e9al", "London", "Palermo", "Sarajevo", "Be\u010d", "Oslo", "\u0160panjolska", "Sheffield", "Lavov", "Massachusetts", "Devon", "Arkansas", "Berlin", "Toledo", "Beograd", "Hamburg", "Rusija", "Finska", "Victoria", "Holstein", "Gvatemala", "Milano", "London", "Prag", "Berlin", "Engleska", 
"Pasadena", "Leipzig", "Vancouver", "London", "Tours", "Newport", "Frederick", "Australija", "Berlin", "London", "Zagreb", "Borneo", "Berlin", "Ontario", "Pariz", "Amsterdam", "Bratislava", "Pittsburgh", "Como", "Dublin"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Peter Ording", "Yanitzia Canetti", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Gordan Ko\u017eulj", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "John Barry", "Stanis\u0142aw Urban", "Aurel Codoban", "Maja Tucholke", "Serdar Apayd\u0131n", "Viktor Ivan\u010di\u0107", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Ottaviano Mascherino", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Colin Groves", "David Atkinson", "Marija Lugari\u0107", "Ivo Luka\u010dovi\u010d", "Murat Evliyao\u011flu", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "Jack Blum", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Tino Vegar", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Felipe Alfau", "Marc Sangnier", "Tolga Tekinalp", "Shyril O'Steen", "Peter Dembicki", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Sam Lynch", "Claude Piel", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Edward Locke", "Barry Howard", "Olivia Poulet", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Davor Su\u010di\u0107", "Petar \u010culi\u0107", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "William Reed", "Nigel Preston", "Francesco Musotto", "Kemal Alispahi\u0107", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Trevor Taylor", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Keena Rothhammer", "Meike Evers", "Francisco Cervantes de Salazar", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Sami Hinkka", "Murray Hocking", "Louis Gurlitt", "Franz Galich Mazariegos", "Matteo Salvini", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "John Joseph Braham, Sr.", "Gordon Copley", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Scott Ambush", "Claire Baxter", "Peter Lachmann", "Reginald Baliot Brett", "Lovro Artukovi\u0107", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Fran\u00e7ois Maspero", "Gijs Vermeulen", "Andrej \u0160eban", "David Scott Milton", "Giambattista Nolli", "John O'Conor"]}, "place_of_death": {"objects": ["Carigrad", "Havaji", "Edinburgh", "Francuska", "Bologna", "Napulj", "Lyon", "Cambridge", "London", "Litva", "Vilnius", "Manhattan", "Pariz", "Florida", "Beograd", "Sussex", "Manhattan", "Oxford", "London", "Lisabon", "Prag", "Madrid", "London", "Var\u0161ava", "Liverpool", "Zagreb", "Pariz", "Toronto", "Yokohama", "Hollywood", "Detroit", "Madrid", "Lille", "Carigrad", "Sevilla", "Bologna", "Amsterdam", "Philadelphia", "Prag"], "subjects": ["Ivan I. 
Duka", "Donn Lewin", "Henry Siddons", "Walt Hansgen", "Odofredus Denari", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Grahame Clark", "Joshua Cristall", "Simonas Daukantas", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Andr\u00e9 Chamson", "Jim Chapin", "Florijan Matekalo", "Thomas Slingsby Duncombe", "Der Scutt", "Homer Hasenpflug Dubs", "David Merrick", "Jos\u00e9 Manuel Soares", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Frane Buli\u0107", "Marcel Oopa", "Pauline Mills McGibbon", "Vito Positano", "Gloria Grey", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Abdul Med\u017eid I.", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/hy.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["\u0539\u0580\u0565\u0576\u0569\u0578\u0576", "\u054d\u057f\u0578\u056f\u0570\u0578\u056c\u0574", "\u054c\u056b\u0563\u0561", "\u0555\u0576\u057f\u0561\u0580\u056b\u0578", "\u054f\u0561\u056f\u0578\u0574\u0561", "\u0539\u0578\u0582\u056c\u0578\u0582\u0566", "\u054f\u0578\u0580\u0578\u0576\u057f\u0578", "\u0555\u0564\u0565\u057d\u0561", "\u0532\u0578\u0582\u0564\u0561\u057a\u0565\u0577\u057f", "\u0535\u0563\u056b\u057a\u057f\u0578\u057d", "\u0544\u0578\u057d\u056f\u057e\u0561", "\u0532\u0578\u0582\u0564\u0561\u057a\u0565\u0577\u057f", "\u053f\u0561\u057d\u0561\u0562\u056c\u0561\u0576\u056f\u0561", "\u054a\u0580\u0561\u0570\u0561", "\u0540\u057c\u0578\u0574", "\u0531\u056c\u056a\u056b\u0580", "\u0546\u0587\u0561\u0564\u0561", "\u054e\u0561\u0576", "\u0553\u0561\u0580\u056b\u0566", "\u0540\u0561\u0580\u0569\u0586\u0578\u0580\u0564", "\u0540\u057c\u0578\u0574", "\u054e\u056b\u0580\u057b\u056b\u0576\u056b\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0544\u0565\u0575\u056f\u0578\u0576", "\u0555\u0584\u057d\u0586\u0578\u0580\u0564", "\u054e\u0561\u0580\u0577\u0561\u057e\u0561", "\u0555\u057d\u056c\u0578", "\u0535\u0580\u0587\u0561\u0576", "\u0555\u057d\u056c\u0578", "\u054f\u056b\u0580\u0561\u0576\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0535\u0580\u0587\u0561\u0576", "\u0540\u0578\u0582\u0576\u0563\u0561\u0580\u056b\u0561", "\u0539\u0562\u056b\u056c\u056b\u057d\u056b", "\u0539\u0562\u056b\u056c\u056b\u057d\u056b", "\u0537\u0564\u056b\u0576\u0562\u0578\u0582\u0580\u0563"], "subjects": ["\u0544\u0561\u0580\u057f\u056b\u0576 \u053f\u0578\u0576\u0578\u0580", "\u0533\u0580\u0565\u057f\u0561 \u053f\u0576\u0578\u0582\u057f\u057d\u0578\u0576", "\u053b\u0563\u0578\u0580 \u054e\u056b\u0570\u0580\u0578\u057e", "\u0548\u0582\u0578\u056c\u0569\u0565\u0580 \u0533\u0580\u0565\u0581\u056f\u056b", "\u0544\u0561\u0575\u0584\u056c \u0544\u0561\u0576\u0578\u0582\u0565\u056c", "\u054c\u0578\u056a\u0565 \u0532\u0580\u0575\u0578\u0582\u0576\u0565", "\u0539\u0578\u0576\u056b \u0544\u056b\u057f\u0579\u0565\u056c", "\u054d\u0561\u0574\u057e\u0565\u056c \u0533\u0575\u0578\u0566\u0561\u056c\u0575\u0561\u0576", "\u053c\u0561\u057d\u056c\u0578 \u0532\u056b\u057f\u0578", "\u0540\u0561\u056f\u0578\u0562 \u0540\u0561\u056f\u0578\u0562\u0575\u0561\u0576", "\u0531\u0576\u0561\u057f\u0578\u056c\u056b \u0531\u056c\u0565\u0584\u057d\u056b\u0576", "\u053f\u0561\u057f\u0561\u056c\u056b\u0576 \u053f\u0561\u0580\u0561\u0564\u056b", 
"\u053c\u0561\u0570\u057d\u0565\u0576 \u0531\u0562\u0580\u0561\u0574\u056b", "\u0545\u0561\u0576 \u0531\u0576\u057f\u0578\u0576\u056b\u0576 \u0534\u0578\u0582\u056d\u0578\u057d\u056c\u0561\u057e", "\u053c\u0578\u0582\u0581\u056b\u0578\u057d \u054e\u0565\u0580\u0578\u057d", "\u054a\u0578\u056c \u0532\u0565\u056c\u0574\u0578\u0576\u0564\u0578", "\u0553\u0565\u0569 \u0546\u056b\u0584\u057d\u0578\u0576", "\u0531\u0572\u0561\u057d\u056b \u053d\u0561\u0576\u057b\u0575\u0561\u0576", "\u053a\u0561\u056f \u0564'\u0531\u0563\u0561\u0580", "\u053c\u0578\u0582\u057d\u056b\u0576 \u0531\u0574\u0561\u0580\u0561", "\u0531\u0576\u0561\u057d\u057f\u0561\u057d I", "\u054b\u0578\u0576 \u0537\u057e\u0561\u0576\u057d", "\u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580 \u0534\u0565\u0563\u0578\u0586", "\u053c\u0578\u0578\u0582\u0580\u0565\u0576\u057d \u054d\u0569\u0561\u056c\u056b\u0576\u0563\u057d", "\u0540\u0565\u056c\u0565\u0576 \u0534\u0580\u0561\u0576\u0563\u0561", "\u0537\u0564\u0574\u0578\u0582\u0576\u0564 \u0556\u0565\u057f\u057f\u056b\u0576\u0563", "\u0537\u0580\u056b\u056f \u0534\u0561\u0574\u0561\u0576", "\u0546\u0565\u0580\u057d\u0565\u057d \u0535\u0580\u056b\u0581\u0575\u0561\u0576", "\u0545\u0578\u0582\u0576 \u0537\u056c\u057d\u0569\u0565\u0580", "\u0545\u0578\u0582\u0574\u0565\u0580 \u054a\u0561\u0574\u057a\u0578\u0582\u0580\u056b", "\u0537\u056c\u0565\u0576 \u053f\u0561\u0580\u0580\u0565\u0580 \u0534\u0561\u0576\u056f\u0578\u057d", "\u0531\u0580\u0574\u0565\u0576 \u0531\u0575\u057e\u0561\u0566\u0575\u0561\u0576", "\u054c\u0578\u0562\u0565\u0580\u057f \u0540\u0565\u0581\u0580\u0578\u0576", "\u053c\u0561\u057e\u0580\u0565\u0576\u057f\u056b \u0531\u0580\u0564\u0561\u0566\u056b\u0561\u0576\u056b", "\u0533\u0565\u0578\u0580\u0563\u056b \u053f\u0565\u057f\u0578\u0587", "\u054d\u0584\u0578\u0569 \u053f\u056c\u0587\u0565\u0580\u0564\u0578\u0576"]}, "place_of_death": {"objects": ["\u0544\u0578\u057d\u056f\u057e\u0561", "\u0544\u0561\u0576\u0570\u0565\u0569\u0565\u0576", "\u0553\u0561\u0580\u056b\u0566", "\u0554\u056b\u0576\u0563\u057d\u057f\u0578\u0576", "\u053c\u056b\u057f\u057e\u0561", "\u0556\u056b\u056c\u0561\u0564\u0565\u056c\u0586\u056b\u0561", "\u0553\u0561\u057d\u0561\u0564\u0565\u0576\u0561", "\u0532\u0578\u0582\u056d\u0561\u0580\u0565\u057d\u057f", "\u0534\u0565\u057f\u0580\u0578\u0575\u0569", "\u0553\u0561\u0580\u056b\u0566", "\u054a\u0565\u056f\u056b\u0576", "\u0535\u0580\u0587\u0561\u0576", "\u0544\u0578\u057d\u056f\u057e\u0561", "\u0553\u0561\u0580\u056b\u0566", "\u0556\u056c\u0578\u0580\u0565\u0576\u0581\u056b\u0561", "\u0545\u0578\u056f\u0578\u0570\u0561\u0574\u0561", "\u053f\u056b\u0578\u057f\u0578", "\u053f\u0578\u057d\u057f\u0561\u0576\u0564\u0576\u0578\u0582\u057a\u0578\u056c\u056b\u057d", "\u0546\u056b\u057d", "\u0544\u0561\u0564\u0580\u056b\u0564"], "subjects": ["\u054e\u0561\u0580\u057e\u0561\u057c\u0561 \u0544\u0561\u057d\u0561\u056c\u056b\u057f\u056b\u0576\u0578\u057e\u0561", "\u0531\u0564\u0565\u056c \u0544\u0561\u0580\u056f\u0578\u0582\u057d", "\u0531\u056c\u0586\u0580\u0565\u0564 \u054c\u0561\u0574\u0562\u0578", "\u0544\u0561\u0580\u056b\u0561\u0576\u0561 \u0533\u0580\u0561\u056d\u0561\u056c\u0565\u057d \u0544\u0561\u057d\u0565\u0578", "\u054d\u056b\u0574\u0578\u0576\u0561\u057d \u0534\u0561\u0578\u0582\u056f\u0561\u0576\u057f\u0561\u057d", "\u0533\u0578\u0580\u057b \u054e\u0578\u056c\u0565\u057d \u0544\u0565\u056c\u057e\u056b\u056c", "\u054b\u0578\u0576 \u054f\u0578\u0564\u0564", "\u054b\u0578\u0580\u057b\u0565 \u054b\u0578\u0580\u057b\u0565\u057d\u056f\u0578\u0582", 
"\u0533\u0578\u0582\u0580\u0563\u0565\u0576 \u0531\u056c\u0565\u0574\u0577\u0561\u0570", "Andr\u00e9 Jean", "\u0549\u0565\u0576 \u0545\u0561\u0576\u0581\u0575\u0578\u0582", "\u0539\u0578\u0580\u0578\u057d \u0539\u0578\u0580\u0561\u0574\u0561\u0576\u0575\u0561\u0576", "\u054d\u057f\u0565\u057a\u0561\u0576 \u0537\u0580\u0566\u056b\u0561", "\u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580 \u0533\u0578\u0582\u0579\u056f\u0578\u057e", "\u0556\u0580\u0565\u0576\u057d\u056b\u057d \u0531\u056c\u0565\u0584\u057d\u0561\u0576\u0564\u0580", "\u054a\u0578\u0566\u056b\u057f\u0561\u0576\u0578 \u054e\u056b\u057f\u0578", "\u0548\u0582\u0565\u0564\u0561 \u0531\u056f\u056b\u0576\u0561\u0580\u056b", "\u0531\u0562\u0564\u0578\u0582\u056c \u0544\u0565\u057b\u056b\u0564", "\u0533\u0565\u0578\u0580\u0563\u056b \u0531\u0564\u0561\u0574\u0578\u057e\u056b\u0579", "\u0532\u0565\u0580\u0576\u0561\u0580\u0564\u0578 \u053c\u0578\u057a\u0565\u0566 \u054a\u056b\u056f\u0565\u0580"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/id.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhagen", "Sydney", "Australia", "Burlington", "Amsterdam", "Stade", "Havana", "Oxford", "Iowa", "London", "Damaskus", "Wina", "Sydney", "Milan", "Heidelberg", "Montreal", "Toulouse", "Warsawa", "Leipzig", "Wellington", "Portland", "Napoli", "Roma", "London", "Napoli", "Chennai", "Melbourne", "Inggris", "Montreal", "Praha", "Firenze", "Kopenhagen", "Mobile", "Sydney", "London", "Yorkshire", "Inggris", "Praha", "Sydney", "Tbilisi", "Roma", "Sydney", "Siena", "Napoli", "Nevada", "Waterford", "Berlin", "Stuttgart", "Bangkok", "Paris", "Reading", "Paris", "Seattle", "Istanbul", "Vancouver", "Paris", "Canberra", "Melbourne", "Leiden", "Connecticut", "Limerick", "Paris", "Zagreb", "Preston", "Blackburn", "Guangzhou", "Manchester", "Inggris", "Firenze", "Nottingham", "Roma", "Tokyo", "Napoli", "Roma", "Westminster", "Split", "Cincinnati", "Macon", "Bukares", "California", "Paris", "Adelaide", "Amsterdam", "Como", "Brunei Darussalam", "Cardiff", "Istanbul", "Montreal", "Paris", "London", "Torino", "Palermo", "Wina", "Chicago", "Oslo", "Melbourne", "Adelaide", "Beijing", "Torino", "Melbourne", "Massachusetts", "Bordeaux", "Berlin", "Mumbai", "Garland", "Oxford", "Surrey", "Beograd", "Hamburg", "Oslo", "Rusia", "Victoria", "Holstein", "Baltimore", "Napoli", "Chicago", "Milan", "Italia", "Stuttgart", "India", "Casablanca", "Genova", "Inggris", "Leipzig", "Vancouver", "London", "Gent", "Tours", "Newport", "Hamilton, Selandia Baru", "Springfield", "Australia", "Milan", "Berlin", "Jakarta", "Berlin", "Prancis", "Ankara", "Kalimantan", "Cardiff", "Ontario", "Tallinn", "Milan", "Colorado", "Roma", "Queens", "Bukares", "Norwich", "Bretagne", "Amsterdam", "Manila", "Napoli", "Pittsburgh", "Kentucky", "Detroit"], "subjects": ["Eyolf Kleven", "Julia Wilson", "John Seru", "Paul Daniels", "Bernard de Wolff", "Peter Ording", "Yanitzia Canetti", "Thomas Godfrey Faussett", "Eric Ziebold", "James William Wallack", "Bachar Kouatly", "Norbert Balatsch", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Roger Brunet", "Stanis\u0142aw Urban", "Maja Tucholke", "Barry Mitcalfe", "Steve Sundholm", "Renato Caccioppoli", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "K. 
Bhaskaran", "Brett Hayman", "Colin Groves", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Filippo Soffici", "Victor Borge", "Darnell Kennedy", "Danielle McGrath", "David Parry", "Timothy Drever", "John Mundy", "Milan Orlowski", "Stephen Carr", "Zhores Medvedev", "Lucius Verus", "Elizabeth Kell", "Alessandro Frosini", "Carlo Silipo", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Sandrina Malakiano", "Marc Sangnier", "Denys Page", "Hugues Krafft", "Shyril O'Steen", "Do\u011fa Bekleriz", "Peter Dembicki", "Gabriel Bertrand", "Queenie van de Zandt", "Michael Guider", "Simon Binnendijk", "Charles H. Kraft", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Jimmy Brown", "George Kitching", "John Mundy", "Edward Locke", "Enrico Toselli", "Barry Howard", "Paus Anastasius I", "Takeshi Maeda", "Andrea Giani", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Kay Lahusen", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "Henri de Contenson", "Ben Nicholas", "Albertus Jonas Brandt", "Luca Princiotta", "Paula Malai Ali", "Herbert Bowden, Baron Aylestone", "Fuat G\u00fcner", "William Reed", "Eug\u00e8ne Brieux", "Thomas Taylor", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Tim McGill", "Erik Dammann", "Maggie Fitzgibbon", "Janet Ramsey Johnson", "Zhang Xueling", "Nicola Campogrande", "Nicholas Colla", "Sarah Stiles", "Jean Baptiste Rives", "Meike Evers", "Siddhant Karnick", "Nick Richmond", "Edward Stransham", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Jon Elster", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Myra Sklarew", "Giuseppe de Majo", "Raymond R. Schumacher", "Matteo Salvini", "Giancarlo Primo", "Bernhard R\u00fchling", "Brihaspati Dev Triguna", "Migidio Bourifa", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Marc Van Montagu", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Matthew Walker", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Pierre Rolland", "Otto Eugen Schulz", "J. B. Jackson", "Serhat", "Michael Matus", "Simon Bowman", "Elizabeth Hess", "Aarne Ruben", "Marisa Masullo", "Tom Maniatis", "Tommaso Marconi", "Stephen K. Benjamin", "Loredana Errore", "Warren Carlyle", "Lo\u00efc Jouannigot", "Gijs Vermeulen", "Elizabeth Cooper", "Diego Nargiso", "David Scott Milton", "George Jewett", "Dave Marsh"]}, "place_of_death": {"objects": ["Santiago de Chile", "Hawaii", "Edinburgh", "Lyon", "Belgia", "Colchester", "Cambridge", "Tbilisi", "London", "Yerusalem", "Manhattan", "Florida", "Inggris", "Australia", "Manhattan", "Amsterdam", "Roma", "Oxford", "Montreal", "Edinburgh", "Mumbai", "London", "Mumbai", "Leiden", "Lansing", "Liverpool", "Leipzig", "Toronto", "Karachi", "Karachi", "Yokohama", "Siena", "Atlanta", "Detroit", "Hollywood", "Kairo", "Dover", "Lille", "Scarborough, Yorkshire Utara", "Vancouver", "Konstantinopel", "Melbourne", "Sevilla", "Bologna", "Amsterdam", "Genova"], "subjects": ["Marta Canales", "Donn Lewin", "Henry Siddons", "Joseph Jean-Baptiste Xavier Fournet", "Margaret, Istri Adipati Brabant", "Hugh Iorys Hughes", "Grahame Clark", "Guram Sharadze", "Joshua Cristall", "Wolf Gold", "Arthur Siegel", "Jim Chapin", "Margo McLennan", "Bettina Welch", "Der Scutt", "Albertus Jonas Brandt", "Sebastiano Baggio", "Homer H. 
Dubs", "\u00c9douard Gagnon", "William Roxburgh", "Sultan Khan", "David Merrick", "Ali Sardar Jafri", "Tiberius Hemsterhuis", "Geraldine Doyle", "Jefferson Lowndes", "Paul Luther", "Pauline Mills McGibbon", "Abdur Rab Nishtar", "Khursheed Bano", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Andreas Dippel", "Lotfia El Nady", "Nathaniel Wraxall", "John Shortland", "Stephen Joseph", "Jan Hulsker", "Abd-ul-Mejid I", "Alwyn Kurts", "Felipe de Le\u00f3n", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ka.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/la.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/lt.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Kopenhaga", "Miunchenas", "Sidn\u0117jus", "Berlingtonas", "Stad\u0117", "Ajova", "Londonas", "Madridas", "Roma", "Sidn\u0117jus", "Heidelbergas", "Monrealis", "Var\u0161uva", "Vokietija", "Pary\u017eius", "Vestfalija", "Leipcigas", "Sevilija", "Velingtonas", "Neapolis", "Bolonija", "At\u0117nai", "Londonas", "Neapolis", "Viena", "Bradfordas", "Melburnas", "Monrealis", "Praha", "Florencija", "Kopenhaga", "Praha", "Turinas", "Sidn\u0117jus", "Anglija", "Sidn\u0117jus", "Tbilisis", "Viena", "Roma", "Sidn\u0117jus", "Filadelfija", "Berlynas", "\u0160tutgartas", "Barselona", "Sevilija", "Pary\u017eius", "Roma", "Vankuveris", "Anglija", "Pary\u017eius", "Bakingam\u0161yras", "Pary\u017eius", "Melburnas", "Pary\u017eius", "Zagrebas", "Prestonas", "Pary\u017eius", "Man\u010desteris", "Melburnas", "Anglija", "Notingamas", "Zagrebas", "Albanija", "Praha", "Ryga", "Pary\u017eius", "Vestminsteris", "Splitas", "Splitas", "Maskva", "Bagdadas", "Bukare\u0161tas", "Madridas", "Lionas", "Pary\u017eius", "Talinas", "Vilnius", "Pranc\u016bzija", "Monrealis", "Londonas", "Viena", "Aleksandrija", "Oslas", "Ispanija", "Masa\u010dusetsas", "Devonas", "Berlynas", "\u010cikaga", "Toledas", "Sur\u0117jus", "Belgradas", "Hamburgas", "Rusija", "Viktorija", "Hol\u0161teinas", "Lionas", "Pary\u017eius", "Kopenhaga", "\u0160tutgartas", "Londonas", "Praha", "Berlynas", "Tampa", "Londonas", "Pary\u017eius", "Anglija", "Leipcigas", "Vankuveris", "Londonas", "Niuportas", "Australija", "Pary\u017eius", "Ankara", "Borneo", "Berlynas", "Ontarijas", "Talinas", "Norid\u017eas", "Talinas", "Amsterdamas", "Maskva", "Pitsbergas", "Komas", "\u0160tutgartas"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Julia Wilson", "Paul Daniels", "Peter Ording", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Anna Maria Villani Scicolone", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Werner M\u00fcnch", "Maja Tucholke", "Francisco de Osuna", 
"Barry Mitcalfe", "Renato Caccioppoli", "Ottaviano Mascherino", "Argiris Pedoulakis", "Cliff Jones", "Warington Wilkinson Smyth", "Ludwig von Wohlgemuth", "Christfried Burmeister", "Brett Hayman", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "\u017doresas Medvedevas", "Julius Goldzier", "Lucijus Verus", "Elizabeth Kell", "Francis Davis", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Cipriano de Valera", "Marc Sangnier", "Stefano Nolfi", "Peter Dembicki", "Pieter de Molijn", "Gabriel Bertrand", "John Borlase", "Pierre Joxe", "Michael Guider", "Claude Piel", "Luka Grubor", "Helen Longworth", "Martin Malvy", "John Mundy", "Diana Trask", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Mordehajs Dubins", "Robert Lecou", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Levas Le\u0161\u010denko", "Jamal Jum\u00e1", "Drago\u0219 Neagu", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Martin Jervan", "Petras Geniu\u0161as", "Michael Armstrong", "William Reed", "Nigel Preston", "Marion Stein", "Maurice Maunoury", "Erik Dammann", "Manola Saavedra", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "James Burnham", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Ernest Lafont", "Ren\u00e9 Mayer", "Ulla Pia", "Bernhard R\u00fchling", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Dave Steele", "Charles de Montalembert", "Ren\u00e9 Renoult", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Bernard Lloyd", "Claire Baxter", "\u017diulis Gotje", "Serhat", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Aarne Ruben", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Andrejus Babickis", "David Scott Milton", "Giambattista Nolli", "Kim Bauermeister"]}, "place_of_death": {"objects": ["Havajai", "Edinburgas", "Neapolis", "Kembrid\u017eas", "Milanas", "Florencija", "Lietuva", "Vilnius", "Manhatanas", "Roma", "Florida", "Manhatanas", "Oksfordas", "Maskva", "Florencija", "Londonas", "Praha", "Hamburgas", "Madridas", "Londonas", "Var\u0161uva", "Liverpulis", "Var\u0161uva", "Pary\u017eius", "Pary\u017eius", "Londonas", "Torontas", "Jokohama", "Madridas", "Lilis", "Skarboras", "Sevilija", "Bolonija", "Filadelfija", "Praha"], "subjects": ["Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Grehemas Klarkas", "Gino Penno", "Domenico Cresti", "Simonas Daukantas", "Karolis Pod\u010da\u0161inskis", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Der Scutt", "Homer Hasenpflug Dubs", "Stepanas Erzia", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Algirdas Klimaitis", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Ludwika J\u0119drzejewicz", "Marcel Pouvanaa Oopa", "Robert Lindet", "Tzvi Hirsch Ferber", "Pauline Mills McGibbon", "Vito Positano", "Enrique Sarasola", "John Shortland", "Stephen Joseph", "Felipe de Le\u00f3n", "Thomas Dempster", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/lv.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, 
"place_of_birth": {"objects": ["Kopenh\u0101gena", "Minhene", "R\u012bga", "Aiova", "Londona", "Heidelberga", "Zviedrija", "Neapole", "Londona", "Pr\u0101ga", "Par\u012bze", "Braz\u012blija", "Kopenh\u0101gena", "Anglija", "Filadelfija", "R\u012bga", "Nevada", "Berl\u012bne", "Par\u012bze", "Prestona", "Man\u010destra", "Anglija", "Notingema", "Roma", "R\u012bga", "Vestminstera", "R\u012bga", "Maskava", "Par\u012bze", "Makona", "Bukareste", "Kolumbusa", "Par\u012bze", "Sidneja", "V\u012bne", "Boldera", "Oslo", "R\u012bga", "Masa\u010d\u016bsetsa", "Belgrada", "Hamburga", "Krievija", "Viktorija", "Hol\u0161teina", "Tir\u0101na", "Tampa", "Anglija", "Londona", "\u0145\u016bporta", "Springf\u012blda", "Austr\u0101lija", "Frezno", "Ankara", "Kalimant\u0101na", "Ont\u0101rio", "Tallina", "Maskava", "Norid\u017ea", "Maskava", "Pitsburga", "\u0160tutgarte"], "subjects": ["Eyolf Kleven", "Marcus Junkelmann", "Igors Vihrovs", "Eric Ziebold", "James William Wallack", "Eug\u00e9nie S\u00f6derberg", "Leo \u0112rnr\u016bts", "Renato Caccioppoli", "Cliff Jones", "Ivo Luka\u010dovi\u010d", "Luijs God\u0113ns", "Ruta Kardoso", "Victor Borge", "John Mundy", "Francis Davis", "Aigars V\u012btols", "Peta Niksone", "Guy De Saint Cyr", "Gabriel Bertrand", "Helen Longworth", "John Mundy", "Edward Locke", "Barry Howard", "Anastasijs I", "Mordehajs Dubins", "Henry Bentley", "Juris Sokolovskis", "\u013bevs \u013be\u0161\u010denko", "Aleksandrs Degofs", "Laurence Stallings", "Drago\u0219 Neagu", "Sumalee Montano", "Henri de Contenson", "Tom Kazas", "Marion Stein", "P\u012bters Stetina", "Erik Dammann", "Bruno Rubess", "Sarah Stiles", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Imers Pampuri", "Dave Steele", "John Joseph Braham, Sr.", "Alisa Arnah", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Toraijs Bragss", "Serhats", "Michael Matus", "Elizabeth Hess", "Aarne Ruben", "Anna Ah\u0161arumova", "Warren Carlyle", "Andrejs Babickis", "David Scott Milton", "Kim Bauermeister"]}, "place_of_death": {"objects": ["Edinburga", "Lietuva", "Pasad\u012bna", "Manhetena", "Florida", "Manhetena", "Maskava", "Toronto", "Berl\u012bne", "Kaira", "Konstantinopole", "Bolo\u0146a", "Filadelfija"], "subjects": ["Henry Siddons", "Simons Daukants", "D\u017eons Tods", "Arthur Siegel", "Jim Chapin", "Der Scutt", "Stepans Erzja", "Pauline Mills McGibbon", "Krista Volfa", "Lotfia El Nadi", "Abdulmed\u017eids I", "Thomas Dempster", "William More Gabb"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ms.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/pl.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Boston", "Kopenhaga", "Belgia", "Boston", "Luksemburg", "Monachium", "Belgrad", "Sydney", "Tirana", "Budapeszt", "Londyn", "Warszawa", "Sztokholm", "Burlington", "Dallas", "Salzburg", "Ryga", "Berkeley", "Stade", "Serbia", "Wellington", "Warszawa", "Hawana", "Livingston", "Jamajka", "Iowa", "Warszawa", "Londyn", "Warszawa", "Londyn", "Madryt", "Zagrzeb", "Izrael", "Damaszek", "Kij\u00f3w", "Polska", 
"Wiede\u0144", "Sacramento", "Londyn", "Genua", "Sydney", "Warszawa", "Szkocja", "Moskwa", "Mediolan", "Heidelberg", "Budapeszt", "Pary\u017c", "Montreal", "Tuluza", "Mediolan", "Warszawa", "Niemcy", "Westfalia", "Carlisle", "Bow", "Lipsk", "Kensington", "Ukraina", "Milwaukee", "Lw\u00f3w", "Mediolan", "Sewilla", "Warren", "Wellington", "Budapeszt", "Neapol", "Sofia", "Lublin", "Bolonia", "Warszawa", "Rzym", "Londyn", "Neapol", "Alexandria", "Indianapolis", "Nebraska", "Manchester", "Warszawa", "Lublin", "Budapeszt", "Bradford", "Hawana", "Melbourne", "Christchurch", "Filadelfia", "Warszawa", "Anglia", "Ankara", "Rzym", "Shelby", "Praga", "Montreal", "Londyn", "Pary\u017c", "Budapeszt", "Grenada", "Praga", "Salford", "Birmingham", "Warszawa", "Nankin", "Barcelona", "Belgia", "Tulsa", "Louisville", "Tuluza", "Florencja", "Moskwa", "Coventry", "Kopenhaga", "Cumberland", "Praga", "Dublin", "Moskwa", "Budapeszt", "Turyn", "Sofia", "Sydney", "Londyn", "Anglia", "Wiede\u0144", "Praga", "Casablanca", "Salem", "Sydney", "Mediolan", "Bukareszt", "Austria", "Wenecja", "Sofia", "Tbilisi", "Rzym", "Sydney", "Berlin", "Filadelfia", "Cleveland", "Siena", "Baltimore", "Portugalia", "Tokio", "Boulder", "Split", "Neapol", "Frankfurt nad Menem", "Kopenhaga", "Chicago", "Gandawa", "Ateny", "Nevada", "Preston", "Warszawa", "Waterford", "Anglia", "Berlin", "Stuttgart", "Barcelona", "Pary\u017c", "Polska", "Pary\u017c", "Chicago", "Rzym", "Albany", "Austin", "Warszawa", "Szwecja", "Seattle", "Wan", "Toronto", "Vancouver", "Anglia", "Filadelfia", "Pary\u017c", "Buckinghamshire", "Pensylwania", "Pary\u017c", "Barcelona", "Melbourne", "Budapeszt", "Manhattan", "Kopenhaga", "Polska", "Bergen", "Limerick", "Niemcy", "Lima", "Pary\u017c", "Victoria", "Tokio", "Zagrzeb", "Preston", "Pary\u017c", "Algier", "Manchester", "Lw\u00f3w", "Ryga", "Seattle", "Pune", "Rotterdam", "Anglia", "Buffalo", "Florencja", "Nottingham", "Warszawa", "Middlesex", "Moskwa", "Zagrzeb", "Albania", "Praga", "Sofia", "Rzym", "Ipswich", "Rzym", "Stambu\u0142", "Ottawa", "Belgrad", "Portugalia", "Sztokholm", "Neapol", "Ryga", "Best", "Glasgow", "Pary\u017c", "Bristol", "Rzym", "Westminster", "Ryga", "Oslo", "Split", "Madryt", "Split", "Moskwa", "Genua", "Warszawa", "Budapeszt", "Birmingham", "Macon", "Brze\u015b\u0107", "Bukareszt", "Kalifornia", "Madryt", "Columbus", "Lyon", "Pary\u017c", "Neapol", "Budapeszt", "Antwerpia", "Kilkenny", "Pary\u017c", "Francja", "Warszawa", "Portsmouth", "Londyn", "Warszawa", "Watford", "City of Salford", "Everett", "Kij\u00f3w", "Brooklyn", "Sydney", "Dallas", "Wichita", "Exeter", "Como", "Warszawa", "Londyn", "Bari", "Czechy", "Cardiff", "Montreal", "Londyn", "Lublin", "Kaza\u0144", "Turyn", "Warszawa", "Palermo", "Sarajewo", "Wiede\u0144", "Aleksandria", "Florencja", "Boulder", "Oslo", "Fleet", "Kuba", "Polska", "Hiszpania", "Warszawa", "Sheffield", "Cumberland", "York", "Turyn", "Lw\u00f3w", "Kij\u00f3w", "Massachusetts", "Warszawa", "Devon", "Niemcy", "Arkansas", "Baltimore", "Luksemburg", "Berlin", "Chicago", "Oslo", "Toledo", "Oksford", "Surrey", "Anglia", "Belgrad", "Praga", "Hamburg", "Anglia", "Edmonton", "Oslo", "Sydney", "Moskwa", "Filadelfia", "Rosja", "Mediolan", "Wiktoria", "Berlin", "Holsztyn", "Cincinnati", "Oslo", "Neapol", "Wilno", "Tirana", "Edynburg", "Lyon", "Budapeszt", "Pary\u017c", "Mediolan", "Pary\u017c", "Londyn", "W\u0142ochy", "Kopenhaga", "Stuttgart", "Londyn", "Macon", "Dayton", "Bukareszt", "Praga", "Berlin", "Casablanca", "Wiede\u0144", "Tampa", "Lublin", "Bolonia", "Londyn", 
"Genua", "Pary\u017c", "Anglia", "Lipsk", "Croydon", "Belfast", "Berlin", "Vancouver", "Londyn", "Swindon", "Tours", "Newport", "Springfield", "Australia", "Mediolan", "Berlin", "Warszawa", "Phoenix", "Montreal", "Polska", "Lyon", "Lw\u00f3w", "Ankara", "Ankara", "Borneo", "Berlin", "Baltimore", "Ontario", "Madryt", "Moskwa", "Mediolan", "Montgomery", "Cambridge", "Pary\u017c", "Rzym", "Nassau", "Bukareszt", "Lipsk", "Pary\u017c", "Middlesex", "Budapeszt", "Albany", "Norwich", "Lyon", "Londyn", "Amsterdam", "Bratys\u0142awa", "Kopenhaga", "Toledo", "Szwecja", "Filadelfia", "Neapol", "Pittsburgh", "Lizbona", "Como", "Filadelfia", "Stuttgart", "Tbilisi", "Londyn", "Glasgow", "Edynburg", "Budapeszt", "Pary\u017c"], "subjects": ["Lucy Toulmin Smith", "Eyolf Kleven", "Marvano", "Douglas Fry", "Jean Hamilius", "Marcus Junkelmann", "Mihael Brejc", "Julia Wilson", "Ylli Bufi", "Ferenc Sipos", "Arthur Harold Stone", "Zenon Nowosz", "Greta Knutson", "Paul Daniels", "Tom Jones", "Eugen Enderlen", "Igors Vihrovs", "Patrick Daughters", "Peter Ording", "Branko Ra\u0161i\u0107", "Paula Tesoriero", "Witold Nazarewicz", "Yanitzia Canetti", "Ken Niles", "Bernard Wright", "Eric Ziebold", "Adam Buszko", "Joan Vincent Murray", "Gaba Kulka", "James William Wallack", "Fina de Calder\u00f3n", "Gordan Ko\u017eulj", "Avi Bortnick", "Bachar Kouatly", "Rusia", "Marcin Gawron", "Norbert Balatsch", "Michael Urbano", "Alain de Cadenet", "Giacomo Luigi Brignole", "Alastair Gordon", "Dariusz Lipi\u0144ski", "Michael Gallagher", "Siergiej Ord\u017conikidze", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "George Clifford Sziklai", "Marthe Chenal", "Wayne Eagling", "Roger Brunet", "Celestino Sfondrati", "Stanis\u0142aw Urban", "Aurel Codoban", "Werner M\u00fcnch", "Samuel Sterett", "Clive Brooks", "Maja Tucholke", "Arthur Kinnaird", "Cippora Laskow", "Terry Zahn", "Anna Kurska", "Paul Zuccarelli", "Francisco de Osuna", "Michael Shine", "Barry Mitcalfe", "Gy\u00f6rgy Br\u00f3dy", "Renato Caccioppoli", "Mire\u0142a Iwanowa", "J\u00f3zef Wieniawski", "Ottaviano Mascherino", "Leszek Korzeniowski", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "John Carlyle Herbert", "John Boling", "Edwin Hardin Sutherland", "Brian Callison", "Hanna O\u017cogowska", "Andrzej Ma\u0144ka", "L\u00e1szl\u00f3 Bit\u00f3", "Christfried Burmeister", "Aliuska L\u00f3pez", "Brett Hayman", "Gilbert de Clare", "Harrison Allen", "Romuald Giegiel", "Colin Groves", "Ahmet G\u00fclhan", "Mario Theodoli", "Nina Repeta", "Rudolf K\u0159es\u0165an", "David Atkinson", "Johnny Mowlem", "Patrick Lemari\u00e9", "Andrea M\u00e1tay", "Francisca Pleguezuelos", "Ivo Luka\u010dovi\u010d", "Gary Titley", "Brian Manning", "Micha\u0142 Tober", "Michael Anti", "Jacques Mehler", "Adam Gierek", "George Clark", "Alexander Pope Field", "Christine de Veyrac", "Filippo Soffici", "Jelena Bielakowa", "Martin Jacques", "Victor Borge", "Thomas Johns Perry", "Pavel \u017d\u00e1\u010dek", "Leslie Paul", "Anatolij Aleksin", "Katalin Kar\u00e1dy", "Felice Giordano", "Lubomir Iwanow", "Danielle McGrath", "David Parry", "John Mundy", "Leopold Alexander", "Milan Orlowski", "Lahcen Abrami", "James Henry Emerton", "Stephen Carr", "Guiniforte Solari", "Valeriu Stoica", "Lucas Auer", "Giovanni Francesco Commendone", "Miglena Markowa", "\u017bores Miedwiediew", "Lucjusz Werus", "Elizabeth Kell", "Leopold Casper", "Francis Davis", "Mark Buchanan", "Alessandro Frosini", "William Samuel Booze", "Jo\u00e3o de Souza Mendes", "Eugene Tzigane", "Alex Figge", "Tino Vegar", "Carlo 
Silipo", "Karl Chmielewski", "Niels Bjerrum", "No I.D.", "Johan Daisne", "Antigoni Goni", "Pat Nixon", "Robert Holden", "Pawe\u0142 Zalewski", "Michael Carney", "Donald Appleyard", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Charles Coll\u00e9", "Leon Dycian", "Marc Sangnier", "Leslie Allen", "Stefano Nolfi", "Stephen Levine", "Sahara Smith", "Aleksander \u017babczy\u0144ski", "Emma Ejwertz", "Shyril O'Steen", "Aghasi Chand\u017cian", "David Hackl", "Peter Dembicki", "Pieter de Molyn", "Jeff Chandler", "Gabriel Bertrand", "John Borlase", "Leonard Bosack", "Pierre Joxe", "Salvador Cristau Coll", "Michael Guider", "Csaba \u0150ry", "Dennis Davis", "Thomas Dausgaard", "Artur Zawisza", "Kjersti Elvik", "Sam Lynch", "Ornella Oettl Reyes", "Gunnar Samuelsson", "Claude Piel", "Roland Green", "Masataka Yanagida", "Luka Grubor", "Helen Longworth", "Martin Malvy", "Maurice Va\u00efsse", "John Mundy", "Tadeusz Browicz", "Nico Gardener", "Graham Ackerman", "Abhijit Kunte", "Mark Koevermans", "Edward Locke", "David Marusek", "Enrico Toselli", "Barry Howard", "Binem Heller", "Nathaniel Culverwel", "Jakow Murej", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "W\u0142adimir Georgiew", "Anastazy I", "Jamie Moses", "Antonio Tosti", "Aleksander Hangerli", "Alejandro Abellan", "Andrea Leki\u0107", "Francisco Roxo", "Tommy Waidelich", "Andrea Giani", "Mordehajs Dubins", "Eric Swinkels", "Ian Steel", "Robert Lecou", "William Child", "Augusto De Marsanich", "Henry Bentley", "Juris Sokolovskis", "Tommy Rustad", "Tomislav Smoljanovi\u0107", "Alonso del Arco", "Petar \u010culi\u0107", "Lew Leszczenko", "Carlo Fatuzzo", "Stanis\u0142awa Nowicka", "K\u00e1roly Varga", "Bo Weavil Jackson", "Laurence Stallings", "Natalla Hielach", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "Carlo Emery", "Zolt\u00e1n K\u00f3sz", "Hendrick Andriessen", "Eileen O\u2019Keeffe", "Andr\u00e9 L\u00e9ri", "Michael Armstrong", "Jan Szyszko", "John Randall Reding", "Bertie Felstead", "Kazimierz Flatau", "Alan MacDonald", "Michelle Rogers", "Benjamin Castleman", "Wiktor Krasin", "Joe Ascione", "Tom Kazas", "Clinton D. McKinnon", "Roger Mears", "Chris Welsby", "Luca Princiotta", "Karolina Kosi\u0144ska", "Edward Alfred Cowper", "Marcello Vernola", "Jan Beer", "Herbert Bowden", "William Reed", "Nigel Preston", "Marek Muszy\u0144ski", "Rem Urasin", "Alessio Secco", "Edmund Fetting", "Francesco Musotto", "Kemal Alispahi\u0107", "Marion Stein", "Maurice Maunoury", "Valdo Spini", "Peter Stetina", "Erik Dammann", "Gordon Coppuck", "Juan Carlos Gonz\u00e1lez Zamora", "Ma\u0142gorzata Piotrowska", "Manola Saavedra", "Andrzej Kunert", "Trevor Taylor", "George Alexander Pearre", "Adam Kowalczyk", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Abraham Mintchine", "Sarah Stiles", "Piotr Buciarski", "Neil Doncaster", "Imanu\u2019el Szefer", "Keena Rothhammer", "John Ambrose Meyer", "Claude Wiseler", "Meike Evers", "James Burnham", "Johannes Falkenberg", "Francisco Cervantes de Salazar", "Edward Stransham", "Rob Heanley", "Barry Palmer", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Janek Schaefer", "Chris Woodhead", "Jon Elster", "Joan Hartigan", "Galina Fokina", "Freddy Winnai", "Rosabelle Sinclair", "Luca Bottale", "Murray Hocking", "Max Lehmann", "Louis Gurlitt", "Tom Luken", "Erik Willoch", "Giuseppe de Majo", "Jacek Sauk", "Ymer Pampuri", "Gillian Cooke", "Ernest Lafont", "Ferenc A. 
V\u00e1li", "H\u00e9l\u00e8ne Carr\u00e8re d\u2019Encausse", "Matteo Salvini", "Ren\u00e9 Mayer", "Alice Temple", "Giancarlo Primo", "Ulla Pia", "Bernhard R\u00fchling", "William Main Page", "Lisa Sheridan", "Len Zengel", "Nicolae Herlea", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Georg Hellmesberger Jr.", "Dave Steele", "Alina Gut", "Max Angelelli", "Charles de Montalembert", "Monica Esposito", "Ren\u00e9 Renoult", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Anne Clark", "John Garland", "Awner W. Less", "Heather Davis", "Alisa Arnah", "Crowther Charlesworth", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Miros\u0142aw Maliszewski", "Jerry Pettis", "Ira Vail", "Jerzy Adamuszek", "Martine Roure", "Jerzy Lerski", "Serhat", "Ayta\u00e7 Biter", "Michael Matus", "Erich Werdermann", "John Lewis Thomas Jr.", "Elizabeth Hess", "Emilio Men\u00e9ndez del Valle", "Anna Achszarumowa", "Marisa Masullo", "Bill Endicott", "Aleksander Whitaker", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Moritz Wilhelm Drobisch", "Elsa Lunghini", "Geoffrey Alderman", "B\u00e9la Glattfelder", "Jane Stanford", "Warren Carlyle", "Didier Andr\u00e9", "Keith Wiggins", "Gijs Vermeulen", "Andrej \u0160eban", "Ludwig Drescher", "Cliff Bergere", "Kajsa Kling", "Bernie Lowe", "Diego Nargiso", "David Scott Milton", "Rui Tavares", "Giambattista Nolli", "Spencer Wishart", "Kim Bauermeister", "Gieorgij Kietojew", "Adrian Bowyer", "John Cameron", "Scott Cleverdon", "Imre Zach\u00e1r", "Marcel Bertrand"]}, "place_of_death": {"objects": ["Moskwa", "Sztokholm", "Konstantynopol", "Hawaje", "Vancouver", "Edynburg", "Francja", "Marquette", "Neapol", "Richmond", "Lyon", "Pary\u017c", "Warszawa", "Florencja", "Bolonia", "Londyn", "Warszawa", "Kolumbia", "Bejrut", "Cambridge", "Cambridge", "Turcja", "Londyn", "Caldwell", "Kalifornia", "Exeter", "Wellington", "Lyon", "Litwa", "Milton", "Bukareszt", "Wilno", "Manhattan", "Utrecht", "Rzym", "Floryda", "Tunezja", "Japonia", "Sparta", "Neapol", "Kalifornia", "Sussex", "Cambridge", "Pary\u017c", "Manhattan", "Cambridge", "Rzym", "Berlin", "Moskwa", "Helsinki", "Pary\u017c", "Oksford", "Moskwa", "Montreal", "Chicago", "Florencja", "Massachusetts", "Londyn", "Praga", "Londyn", "Pary\u017c", "Norwich", "Madryt", "Londyn", "Warszawa", "Londyn", "Lejda", "Pary\u017c", "Portugalia", "Cumberland", "Liverpool", "Sewilla", "Warszawa", "Pary\u017c", "Lipsk", "Pary\u017c", "Baltimore", "Florencja", "Warszawa", "Toronto", "Wiede\u0144", "Jokohama", "Baltimore", "Siena", "Manhattan", "Kioto", "Atlanta", "Berlin", "Edynburg", "Detroit", "Pary\u017c", "Madryt", "Warszawa", "Kair", "Berkeley", "Dover", "Lille", "Scarborough", "Vancouver", "Boston", "Konstantynopol", "Melbourne", "Sewilla", "Manila", "Nicea", "Szanghaj", "Moskwa", "Mediolan", "Dublin", "Londyn", "Bolonia", "Amsterdam", "Genua", "Filadelfia", "Aleppo", "Praga"], "subjects": ["Varvara Massalitinova", "Johan Gustaf Sandberg", "Jan I Angelos", "Donn Lewin", "Bent Peder Rasch", "Henry Siddons", "Walt Hansgen", "Frederic Baraga", "Raimondo Guarini", "Alexander William Doniphan", "Joseph Jean-Baptiste Xavier Fournet", "Alfred Nicolas Rambaud", "Adolf Dygasi\u0144ski", "Giovanni Durando", "Carlo Emery", "Pearl Richards Craigie", "Antoni Adam Piotrowski", "Rafael Pombo", "Lidija Lipkowska", "Daniel Maynadier Henry", "John Grahame Douglas Clark", "Ola Hansson", "Joshua Cristall", "Sarah Morgan Bryan Piatt", "Charles 
Constantine", "John Flavel", "Ernest Beaglehole", "Teresa Couderc", "Simonas Daukantas", "Adeline Dutton Train Whitney", "George Georgescu", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Adriaan Reland", "Giovanni Battista Caccini", "Jim Chapin", "Georges Madon", "Masanobu Okumura", "Jezus Jazon", "Giovanni Pontano", "Ethel Catherwood", "Thomas Slingsby Duncombe", "James Augustus Stewart", "Robert Desoille", "Der Scutt", "Ihor \u0160ev\u010denko", "Sebastiano Baggio", "Moritz Land\u00e9", "Niko\u0142aj Milutin", "Olavi Paavolainen", "Moshe Lewin", "Homer Hasenpflug Dubs", "Stiepan Erzia", "Edouard Gagnon", "Battling Nelson", "Philipp von Stosch", "Ferenc A. V\u00e1li", "David Merrick", "V\u00e1clav Havel", "Lucy Faithfull, Baroness Faithfull", "Aleksandr Guczkow", "Eugene Mallove", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Romola de Pulszky", "Tiberius Hemsterhuis", "Gabriel Nicolas de La Reynie", "David Croft", "George Alexander Pearre", "Jefferson Lowndes", "Jos\u00e9 Mo\u00f1ino y Redondo", "Ludwika Chopin", "Marcel Pouvanaa Oopa", "Paul Luter", "Robert Lindet", "Frank Charles Wachter", "Francis Alexander", "Stanis\u0142aw Grzesiuk", "Pauline Mills McGibbon", "Alfred von Henikstein", "Vito Positano", "William Purington Cole Jr.", "Francesco Vanni", "Enrico Donati", "Akinari Ueda", "Cesare Siepi", "Christa Wolf", "John Gillies", "Orestes Brownson", "W\u0142adys\u0142aw \u017bele\u0144ski", "Enrique Sarasola", "Wincenty Krasi\u0144ski", "Lotfia El Nadi", "Egon Petri", "Nathaniel William Wraxall", "John Shortland", "Stephen Joseph", "Jan Hulsker", "Robert Hazard", "Abd\u00fclmecid I", "Alwyn Kurts", "Felipe de Le\u00f3n", "Joaquina Maria Mercedes Barcelo Pages", "Gieorgij Adamowicz", "Liang Shiyi", "Niko\u0142aj Strunnikow", "Maria Pierina de Micheli", "Robert MacBryde", "Benjamin Dale", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Ibrahim Hananu", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ro.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Copenhaga", "Belgia", "Bucure\u0219ti", "Luxemburg", "M\u00fcnchen", "Belgrad", "Sydney", "Tirana", "Burlington", "Dallas", "Budapesta", "Stade", "Havana", "Hamburg", "Oxford", "Suedia", "Iowa", "Londra", "Madrid", "Viena", "Roma", "Sydney", "Milano", "Heidelberg", "Montr\u00e9al", "Toulouse", "Portsmouth", "Germania", "Cheshire", "Helsinki", "Bow", "Leipzig", "Kensington", "Ungaria", "Sevilla", "Wellington", "Portland", "Napoli", "Bologna", "Roma", "Londra", "Napoli", "Viena", "Bradford", "Melbourne", "Germania", "Anglia", "Montr\u00e9al", "Granada", "Praga", "Salford", "Strand", "Belgia", "Toulouse", "Anglia", "Floren\u021ba", "Copenhaga", "Praga", "Budapesta", "Torino", "Sydney", "Londra", "Yorkshire", "Anglia", "Sydney", "Bucure\u0219ti", "Austria", "Tbilisi", "Roma", "Sydney", "Madison", "Philadelphia", "Siena", "Alger", "Napoli", "Chicago", "Gent", "Bucure\u0219ti", "Nevada", "Berlin", "Stuttgart", "Barcelona", "Rom\u00e2nia", "Paris", "Roma", "Seattle", "Vancouver", "Paris", "Buckinghamshire", "Melbourne", "Budapesta", "Manhattan", "Paris", "Polonia", "Limerick", "Paris", "Zagreb", "Preston", "Alger", "Manchester", "Anglia", "Buffalo", "Nottingham", "Londra", "Columbus", "Zagreb", "Albania", "Praga", "Roma", "Paris", "Istanbul", "Belgrad", "Napoli", "Riga", 
"Leipzig", "Roma", "Cipru", "Westminster", "Split", "Split", "Moscova", "Genova", "Macon", "Bucure\u0219ti", "California", "Madrid", "Dublin", "Paris", "Tallinn", "Vilnius", "Como", "Rom\u00e2nia", "Bari", "Montr\u00e9al", "Londra", "Torino", "Palermo", "Brooklyn", "Viena", "Bucure\u0219ti", "Oslo", "Cuba", "Spania", "Berlin", "Torino", "Liov", "Massachusetts", "Devon", "Germania", "Paris", "Berlin", "Toledo", "Oxford", "Surrey", "Belgrad", "Hamburg", "Rusia", "Victoria", "Holstein", "Bucure\u0219ti", "Napoli", "Tirana", "Paris", "Milano", "Amsterdam", "Italia", "Bucure\u0219ti", "Stuttgart", "Londra", "Bucure\u0219ti", "Praga", "Berlin", "Casablanca", "Tampa", "Genova", "Anglia", "Leipzig", "Amsterdam", "Milano", "Vancouver", "Londra", "Tours", "Newport", "Berlin", "Australia", "Milano", "Berlin", "Londra", "Ankara", "Borneo", "Berlin", "Ontario", "Tallinn", "Madrid", "Milano", "Roma", "Bucure\u0219ti", "Leipzig", "Hamilton", "Budapesta", "Bournemouth", "Norwich", "Tallinn", "Amsterdam", "Philadelphia", "Napoli", "Pittsburgh", "Como", "Detroit", "Bucure\u0219ti", "Dublin", "Auckland"], "subjects": ["Eyolf Kleven", "Marvano", "Eugenia Popa", "Jean Hamilius", "Marcus Junkelmann", "Mihael Brejc", "Julia Wilson", "Ylli Bufi", "Paul Daniels", "Tom Jones", "Ilus Vay", "Peter Ording", "Yanitzia Canetti", "Caroline Beil", "Thomas Godfrey Faussett", "Leonard Gyllenhaal", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Norbert Balatsch", "Anna Maria Villani Scicolone", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Roger Brunet", "Andrew O'Neill", "Aurel Codoban", "Susan Bullock", "Taavi Vartia", "Clive Brooks", "Maja Tucholke", "Arthur Kinnaird", "Laszlo Gardony", "Francisco de Osuna", "Barry Mitcalfe", "Steve Sundholm", "Renato Caccioppoli", "Ottaviano Mascherino", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Ludwig von Wohlgemuth", "Christfried Burmeister", "Brett Hayman", "Walter Hilgers", "Colin Groves", "David Atkinson", "Francisca Pleguezuelos", "Ivo Luka\u010dovi\u010d", "Gary Titley", "Hallgeir Langeland", "Adam Gierek", "Christine de Veyrac", "Ian Hancock", "Filippo Soffici", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Katalin Kar\u00e1dy", "Felice Giordano", "Danielle McGrath", "David Parry", "Timothy Drever", "John Mundy", "Stephen Carr", "Valeriu Stoica", "Lucas Auer", "Zhores Medvedev", "Lucius Verus", "Elizabeth Kell", "Bunita Marcus", "Francis Davis", "Alessandro Frosini", "Paul Belmondo", "Carlo Silipo", "No I.D.", "Johan Daisne", "Mircea Florian", "Pat Nixon", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Romeo Niram", "Marc Sangnier", "Stefano Nolfi", "Shyril O'Steen", "Peter Dembicki", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Csaba \u0150ry", "Dennis Davis", "Jacques d'Agar", "Artur Zawisza", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Maurice Va\u00efsse", "John Mundy", "Edward Locke", "David Marusek", "Barry Howard", "Olivia Poulet", "Michael Shank", "Aleksandra Romani\u0107", "Thomas Nassi", "Joseph Wilhelm Swoboda", "Papa Anastasie I", "Adolphe Cohn", "Alexandru Hangerli", "Andrea Leki\u0107", "Andrea Giani", "Mordehajs Dubins", "Gottfried Heinrich Bach", "Augusto De Marsanich", "Chad Hartigan", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Lev Le\u0219cenco", "Carlo Fatuzzo", "Laurence Stallings", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Catherine Wellesley", "Henri de Contenson", "Martin Jervan", 
"Petras Geniu\u0161as", "Luca Princiotta", "Ephraim Hertzano", "Marcello Vernola", "William Reed", "Nigel Preston", "Alessio Secco", "Francesco Musotto", "Ralph Schoenman", "Marion Stein", "Vlad Georgescu", "Erik Dammann", "Juan Carlos Gonz\u00e1lez Zamora", "Manola Saavedra", "Ralf Wadephul", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Sarah Stiles", "Neil Doncaster", "Emmanuel Scheffer", "Charles Nicolas Aub\u00e9", "Meike Evers", "Francisco Cervantes de Salazar", "Edward Stransham", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Alma Redlinger", "Giuseppe de Majo", "Ymer Pampuri", "H\u00e9l\u00e8ne Carr\u00e8re d'Encausse", "Matteo Salvini", "Theodor Holman", "Giancarlo Primo", "Christian Wilhelm Berger", "Bernhard R\u00fchling", "William Main Page", "Nicolae Herlea", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Nyncke Beekhuyzen", "Abbondio Sangiorgio", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Albert Heinrich Brendel", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Reginald Brett, viconte de Esher", "Serhat", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Aarne Ruben", "Emilio Men\u00e9ndez", "Marisa Masullo", "Tommaso Marconi", "Loredana Errore", "Moritz Wilhelm Drobisch", "Linda Crockett", "B\u00e9la Glattfelder", "Craig Richards", "Warren Carlyle", "Martin Zobel", "Gijs Vermeulen", "Bernie Lowe", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli", "Dave Marsh", "Niculae Conovici", "John O'Conor", "Cherry Wilder"]}, "place_of_death": {"objects": ["Moscova", "Santiago de Chile", "M\u00fcnchen", "Hawaii", "Edinburgh", "Napoli", "Londra", "Beirut", "Cambridge", "Turcia", "Londra", "Bucure\u0219ti", "Vilnius", "Manhattan", "Cheltenham", "Paris", "Roma", "Florida", "Sussex", "Paris", "Manhattan", "Oxford", "Moscova", "Lisabona", "Floren\u021ba", "Londra", "Praga", "Vancouver", "Paris", "Madrid", "Londra", "Londra", "Paris", "Roma", "Toronto", "Liverpool", "Zagreb", "Manhattan", "Leipzig", "Toronto", "Bucure\u0219ti", "Yokohama", "Siena", "Atlanta", "Detroit", "Madrid", "Lille", "Scarborough", "Boston", "Constantinopol", "Sevilla", "Nisa", "Bologna", "Amsterdam", "Genova", "Philadelphia", "Praga"], "subjects": ["Varvara Massalitinova", "Marta Canales", "Tobias Andreae", "Donn Lewin", "Henry Siddons", "Raimondo Guarini", "Craigie, Pearl Mary Teresa Richards,", "Lidia Lipkovskaia", "Grahame Clark", "Ola Hansson", "Joshua Cristall", "George Georgescu", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Charles Barton", "Andr\u00e9 Chamson", "Giovanni Battista Caccini", "Jim Chapin", "Thomas Slingsby Duncombe", "Robert Desoille", "Der Scutt", "Homer Hasenpflug Dubs", "Stepan Erzia", "Murilo Mendes", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Wells Wintemute Coates", "Aleksandr Gucikov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Romola de Pulszky", "Gabriel Nicolas de la Reynie", "Johann Konrad Dorner", "Margaret Marshall Saunders", "Jefferson Lowndes", "Frane Buli\u0107", "Lois Gould", "Paul Luther", "Pauline Mills McGibbon", "Paul Everac", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Enrique Sarasola", "John Shortland", "Stephen Joseph", "Robert Hazard", "Abdul-Medjid", "Felipe de Le\u00f3n", "Gheorghi Adamovici", "Thomas Dempster", "Nicolaes de Bruyn", "Cesare Valletti", "William 
More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/sk.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["R\u00edm", "Koda\u0148", "Praha", "Mn\u00edchov", "Sydney", "Par\u00ed\u017e", "Stade", "Havana", "Budape\u0161\u0165", "Iowa", "Lond\u00fdn", "Madrid", "Izrael", "Sydney", "Mil\u00e1no", "Heidelberg", "Montreal", "Ben\u00e1tky", "Var\u0161ava", "Nemecko", "Par\u00ed\u017e", "Lipsko", "Kensington", "Dayton", "Sevilla", "Wellington", "Neapol", "R\u00edm", "Lond\u00fdn", "Neapol", "Bradford", "Melbourne", "Anglicko", "Praha", "Montreal", "Praha", "Jamajka", "Florencia", "Koda\u0148", "Anglicko", "Praha", "Tur\u00edn", "Sydney", "Anglicko", "Sydney", "Praha", "G\u00f6teborg", "R\u00edm", "Sydney", "Philadelphia", "Siena", "Neapol", "Nevada", "Waterford", "Berl\u00edn", "Barcelona", "Par\u00ed\u017e", "R\u00edm", "Wilmington", "Seattle", "Vancouver", "Anglicko", "Par\u00ed\u017e", "Buckinghamshire", "Melbourne", "Limerick", "Par\u00ed\u017e", "Z\u00e1hreb", "Al\u017e\u00edr", "Manchester", "Par\u00ed\u017e", "Rotterdam", "Anglicko", "Nottingham", "Z\u00e1hreb", "Alb\u00e1nsko", "Praha", "Praha", "R\u00edm", "Neapol", "Riga", "R\u00edm", "Westminster", "Split", "Split", "Moskva", "Bukure\u0161\u0165", "Kalifornia", "Madrid", "Lyon", "Par\u00ed\u017e", "Franc\u00fazsko", "Como", "Montreal", "Lond\u00fdn", "Kalifornia", "Tur\u00edn", "Palermo", "Viede\u0148", "Oslo", "\u0160panielsko", "J\u00e1va", "Rochester", "Edinburgh", "Tur\u00edn", "\u013dvov", "Ben\u00e1tky", "Massachusetts", "Devon", "Par\u00ed\u017e", "Berl\u00edn", "Toledo", "Surrey", "Belehrad", "Praha", "Hamburg", "Slovensko", "Rusko", "Vikt\u00f3ria", "Neapol", "Mil\u00e1no", "Par\u00ed\u017e", "Taliansko", "Lond\u00fdn", "Praha", "Berl\u00edn", "Casablanca", "Tampa", "Janov", "Anglicko", "Lipsko", "Vancouver", "Lond\u00fdn", "Tours", "Newport", "Austr\u00e1lia", "Mil\u00e1no", "Berl\u00edn", "Afrika", "Borneo", "Berl\u00edn", "Ont\u00e1rio", "Philadelphia", "Mil\u00e1no", "Par\u00ed\u017e", "R\u00edm", "Nassau", "Bukure\u0161\u0165", "Norwich", "Amsterdam", "Bratislava", "Neapol", "Pittsburgh", "Como"], "subjects": ["Eduard Ender", "Eyolf Kleven", "Alois Wachsman", "Marcus Junkelmann", "Julia Wilson", "Renaud Gagneux", "Peter Ording", "Yanitzia Canetti", "\u00c1rp\u00e1d So\u00f3s", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Avi Bortnick", "Alastair Gordon", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Rosina Storchio", "Stanis\u0142aw Urban", "Aurel Codoban", "Nikos Aliagas", "Maja Tucholke", "Arthur Kinnaird", "Kelley Deal", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Enrico Montesano", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Colin Groves", "Rudolf K\u0159es\u0165an", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Sheyla Bonnick", "Filippo Soffici", "Victor Borge", "Stephen Fox", "Pavel \u017d\u00e1\u010dek", "Felice Giordano", "Danielle McGrath", "John Mundy", "Stephen Carr", "Jan Anton\u00edn Duchoslav", "Peter Nyborg", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Alessandro Frosini", "Carlo Silipo", "Pat Nixonov\u00e1", "Michael Carney", "Guy De Saint Cyr", "Felipe Alfau", "Marc Sangnier", "Stefano Nolfi", "Collins J. 
Seitz", "Shyril O'Steen", "Peter Dembicki", "Pieter de Molyn", "Gabriel Bertrand", "John Borlase", "Michael Guider", "Sam Lynch", "Claude Piel", "Luka Grubor", "Maurice Va\u00efsse", "John Mundy", "Victor Antoine Signoret", "Mark Koevermans", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "Martin Kratochv\u00edl", "Joseph Wilhelm Swoboda", "Anast\u00e1z I.", "Andrea Giani", "Mordehajs Dubins", "Augusto De Marsanich", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Petar \u010culi\u0107", "Lev Le\u0161\u010denko", "Drago\u0219 Neagu", "Penny Lernoux", "M\u00f3nica Estarreado", "Claude Bourgelat", "Henri de Contenson", "Michael Armstrong", "Luca Princiotta", "William Reed", "Nigel Preston", "Jimmy Greenspoon", "Alessio Secco", "Francesco Musotto", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Ien Angov\u00e1", "Diane Greene", "Marcus Dods", "Nicola Campogrande", "Witold Rodzi\u0144ski", "Giulio Carpioni", "Sarah Stiles", "Neil Doncaster", "Charles Nicolas Aub\u00e9", "Meike Evers", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Ond\u0159ej Neff", "Frederick Franklin Schrader", "Pavol Polakovi\u010d", "Rosabelle Sinclair", "Murray Hocking", "Giuseppe de Majo", "Matteo Salvini", "Ren\u00e9 Mayer", "Giancarlo Primo", "William Main Page", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Migidio Bourifa", "Dave Steele", "Monica Esposito", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Claire Baxter", "Fulvio Ballabio", "Peter Lachmann", "Publius Annius Florus", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Leon Bass", "Marisa Masullo", "Fran\u00e7ois Maspero", "Tommaso Marconi", "Johnny Kemp", "Loredana Errore", "Warren Carlyle", "Gijs Vermeulen", "Andrej \u0160eban", "Diego Nargiso", "David Scott Milton", "Giambattista Nolli"]}, "place_of_death": {"objects": ["\u0160tokholm", "Edinburgh", "Neapol", "Lyon", "Viede\u0148", "Cambridge", "Lond\u00fdn", "Jeruzalem", "Vilnius", "Manhattan", "R\u00edm", "Florida", "Gent", "Sussex", "Manhattan", "Jerevan", "Oxford", "Florencia", "Lond\u00fdn", "Praha", "Madrid", "Lond\u00fdn", "Var\u0161ava", "Liverpool", "Lipsko", "Toronto", "Jokohama", "Siena", "Atlanta", "Detroit", "Madrid", "K\u00e1hira", "Lille", "Kon\u0161tant\u00ednopol", "Sevilla", "Nice", "Amsterdam", "Janov", "Philadelphia", "Praha"], "subjects": ["Johann Gustaf Sandberg", "Henry Siddons", "Raimondo Guarini", "Joseph Jean-Baptiste Xavier Fournet", "Johan Stephan Decker", "Grahame Clark", "Joshua Cristall", "Wolf Gold", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Robert van Audenaerd", "Thomas Slingsby Duncombe", "Der Scutt", "Toros Toramanjan", "Homer Hasenpflug Dubs", "Philipp von Stosch", "David Merrick", "V\u00e1clav Havel", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", "Marian Porwit", "Jefferson Lowndes", "Paul Luther", "Pauline Mills McGibbon", "Vito Positano", "Francesco Vanni", "Cesare Siepi", "Orestes Brownson", "Enrique Sarasola", "Lotfia ElNadi", "John Shortland", "Abd\u00fclmecid I.", "Felipe de Le\u00f3n", "Georgij Viktorovi\u010d Adamovi\u010d", "Nicolaes de Bruyn", "Cesare Valletti", "William More Gabb", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ta.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": 
[]}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/tr.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Boston", "Kopenhag", "M\u00fcnih", "T\u00fcrkiye", "Tiran", "Budape\u015fte", "Burlington", "\u0130stanbul", "Stade", "Iowa", "Londra", "Madrid", "Kiev", "Milano", "Heidelberg", "Montreal", "Oslo", "Var\u015fova", "Edinburgh", "Almanya", "Konstantiniyye", "\u0130stanbul", "Leipzig", "Kensington", "T\u00fcrkiye", "Sevilla", "Wellington", "Napoli", "Belfast", "Atina", "Londra", "Napoli", "Bradford", "Melbourne", "Yunanistan", "\u0130ngiltere", "Ankara", "Shelby", "Montreal", "Prag", "Durham", "T\u00fcrkiye", "Portland", "Nankin", "Paris", "Floransa", "Coventry", "Kopenhag", "Prag", "\u0130stanbul", "Boston", "Budape\u015fte", "Londra", "Londra", "Yorkshire", "\u0130ngiltere", "Tiflis", "Roma", "Philadelphia", "Arnavutluk", "\u015eikago", "Nevada", "Waterford", "Berlin", "Stuttgart", "Barselona", "T\u00fcrkiye", "Kahire", "Seattle", "\u0130stanbul", "Vancouver", "Paris", "Pomeranya", "Buckinghamshire", "Paris", "Melbourne", "\u0130svi\u00e7re", "Manhattan", "\u0130talya", "Limerick", "Paris", "Zagreb", "Preston", "Paris", "Manchester", "\u0130ngiltere", "Nottingham", "Zagreb", "Arnavutluk", "Almanya", "Prag", "Brandon", "Roma", "\u0130stanbul", "Riga", "Westminster", "Split", "Charlotte", "\u0130stanbul", "Split", "\u0130sve\u00e7", "\u0130sve\u00e7", "Birmingham", "Cincinnati", "Ba\u011fdat", "Macon", "B\u00fckre\u015f", "Madrid", "Suriye", "Columbus", "Lyon", "Paris", "\u0130ngiltere", "Fransa", "Brooklyn", "\u0130svi\u00e7re", "\u0130stanbul", "Montreal", "Kaliforniya", "Viyana", "Oslo", "\u0130spanya", "Berlin", "Lviv", "Venedik", "Leeds", "Massachusetts", "Devon", "Berlin", "\u0130stanbul", "Toledo", "Surrey", "Belgrad", "Hamburg", "Oslo", "Boston", "Brooklyn", "Rusya", "Victoria", "Holstein", "Tiran", "M\u00fcnih", "Glasgow", "Paris", "Kopenhag", "Stuttgart", "Macon", "Prag", "Berlin", "Tampa", "Ankara", "Bangkok", "\u0130ngiltere", "Leipzig", "Amsterdam", "Budape\u015fte", "\u0130stanbul", "Vancouver", "Londra", "Newport", "\u0130stanbul", "Springfield", "Avustralya", "Paris", "\u015eikago", "Ankara", "Ankara", "Borneo", "Berlin", "Ontario", "Paris", "Norwich", "Amsterdam", "Philadelphia", "Pittsburgh", "Stuttgart", "Paris", "Atina"], "subjects": ["Lucy Toulmin Smith", "Eyolf Kleven", "Marcus Junkelmann", "Cenk Renda", "Ylli Bufi", "Ferenc Sipos", "Paul Daniels", "Azra Erhat", "Peter Ording", "Eric Ziebold", "James William Wallack", "Fina de Calder\u00f3n", "Rusya", "Marcello Abbado", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Jan Jakob T\u00f8nseth", "Stanis\u0142aw Urban", "Robert Hamilton Paterson", "Aurel Codoban", "\u0130ngiliz Kemal", "Necmi S\u00f6nmez", "Maja Tucholke", "Arthur Kinnaird", "Serdar Apayd\u0131n", "Francisco de Osuna", "Barry Mitcalfe", "Renato Caccioppoli", "Hamilton Sloan", "Argiris Pedulakis", "Cliff Jones", "Warington Wilkinson Smyth", "Christfried Burmeister", "Brett Hayman", "Nektaria Karantzi", "Colin Groves", "Ahmet G\u00fclhan", "Nina Repeta", "David Atkinson", "Ivo Luka\u010dovi\u010d", "Neil Fingleton", "Murat Evliyao\u011flu", "Jesse A. 
Hamilton", "Michael Anti", "Herv\u00e9 Alphand", "Filippo Soffici", "Martin Jacques", "Victor Borge", "Pavel \u017d\u00e1\u010dek", "Tayyar Yalaz", "Francis J. Ricciardone, Jr.", "Katalin Kar\u00e1dy", "Coral Amiga", "David Parry", "Tim Robinson", "John Mundy", "Jores Medvedev", "Lucius Verus", "Francis Davis", "Hasna Xhuki\u00e7i", "No I.D.", "Pat Nixon", "Michael Carney", "Guy De Saint Cyr", "Wilhelm Boger", "Felipe Alfau", "Tolga Tekinalp", "Richard Anthony", "Shyril O'Steen", "Do\u011fa Bekleriz", "Peter Dembicki", "Gabriel Bertrand", "Martin Kosleck", "John Borlase", "Pierre Joxe", "Michael Guider", "Raymond Meier", "Dennis Davis", "Tancr\u00e8de Dumas", "Sam Lynch", "Claude Piel", "Luka Grubor", "Helen Longworth", "Martin Malvy", "John Mundy", "Edward Locke", "Barry Howard", "Aleksandra Romani\u0107", "Thomas Nassi", "G\u00f6khan Bozkaya", "Joseph Wilhelm Swoboda", "Tim Long", "I. Anastasius", "Mehmet Ali \u0130rtem\u00e7elik", "Mordehajs Dubins", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Claire Ritter", "\u00d6mer Kaner", "Petar \u010culi\u0107", "Martin Henriksson", "Staffan de Mistura", "Sam Butler", "Kay Lahusen", "Cemal Cuma", "Laurence Stallings", "Drago\u0219 Neagu", "M\u00f3nica Estarreado", "Mohammed Loay Bayazid", "Sumalee Montano", "Claude Bourgelat", "Henri de Contenson", "John Abram", "Michael Armstrong", "Joe Ascione", "Martin Gero", "Fuat G\u00fcner", "William Reed", "Jimmy Greenspoon", "Marion Stein", "Erik Dammann", "Manola Saavedra", "Ralf Wadephul", "Witold Rodzi\u0144ski", "Giulio Carpioni", "John Buckley", "Sarah Stiles", "Neil Doncaster", "Meike Evers", "M\u00fcnir G\u00f6le", "Francisco Cervantes de Salazar", "Rob Heanley", "Marinko Mad\u017egalj", "Frederick Franklin Schrader", "Jon Elster", "Sara Agnes Mclaughlin Conboy", "Stacy Barthe", "Rosabelle Sinclair", "Murray Hocking", "Louis Gurlitt", "Ymer Pampuri", "Eduard von Weber", "Lynn Faulds Wood", "Ren\u00e9 Mayer", "Ulla Pia", "Bernhard R\u00fchling", "Lisa Sheridan", "Lucie Vrbensk\u00e1", "Pawe\u0142 Nowacki", "Dave Steele", "O\u011fuz Abadan", "Kraisak Choonhavan", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Nyncke Beekhuyzen", "Adrienn Bende", "Orhan Demir", "Heather Davis", "Alisa Arnah", "Bernard Lloyd", "\u015eahan \u015eahnur", "Homer Curran", "Claire Baxter", "Jules de Gautier", "Johanna Meier", "Serhat", "Ayta\u00e7 Biter", "Michael Matus", "Erich Werdermann", "Elizabeth Hess", "Elsa Lunghini", "Warren Carlyle", "Gijs Vermeulen", "Bernie Lowe", "David Scott Milton", "Kim Bauermeister", "Jane Bathori", "Lukas Sideras"]}, "place_of_death": {"objects": ["Hawaii", "Arizona", "Edinburgh", "Kud\u00fcs", "Philadelphia", "Vilnius", "Manhattan", "Roma", "Florida", "Sussex", "Paris", "Detroit", "Manhattan", "Kahire", "Oxford", "Lizbon", "Floransa", "Mumbai", "Londra", "Prag", "Londra", "Paris", "Madrid", "Londra", "Var\u015fova", "Liverpool", "Toronto", "Yokohama", "Berlin", "Madrid", "Kahire", "Lille", "Boston", "Konstantiniyye", "Sevilla", "Philadelphia", "Halep", "Prag"], "subjects": ["Donn Lewin", "Hac\u0131 Ali", "Henry Siddons", "Yakir Geron", "George Wallace Melville", "Karol Podczaszy\u0144ski", "Arthur Siegel", "Giovanni Battista Caccini", "Jim Chapin", "Thomas Slingsby Duncombe", "Robert Desoille", "Kurken Alemshah", "Der Scutt", "Nasr Hamid Ebu Zeyd", "Homer Hasenpflug Dubs", "Murilo Mendes", "Philipp von Stosch", "Sultan Khan", "David Merrick", "V\u00e1clav Havel", "Lucy Faithfull, Baroness Faithfull", "Aleksandr Gu\u00e7kov", "Juan P\u00e9rez de Montalb\u00e1n", "Aleksy \u0106wiakowski", 
"Marian Porwit", "Jefferson Lowndes", "Pauline Mills McGibbon", "Vito Positano", "Christa Wolf", "Enrique Sarasola", "Lotfia El Nadi", "John Shortland", "Robert Hazard", "Abd\u00fclmecid", "Felipe de Le\u00f3n", "William More Gabb", "\u0130brahim Hananu", "Florentina Mall\u00e1"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/ur.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/vi.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": ["Sydney", "Burlington", "Stade", "Lu\u00e2n \u0110\u00f4n", "Sydney", "Heidelberg", "Montr\u00e9al", "Santiago de Chile", "Leipzig", "Wellington", "Napoli", "Lu\u00e2n \u0110\u00f4n", "Napoli", "Melbourne", "Anh", "Montr\u00e9al", "Sunderland", "Nam Kinh", "Vi\u1ec7t Nam", "Coventry", "Sydney", "Anh", "Sydney", "Nam Kinh", "Roma", "Sydney", "Philadelphia", "Nevada", "Waterford", "Stuttgart", "Seattle", "Van", "Vancouver", "Melbourne", "Limerick", "Preston", "Manchester", "Sheffield", "Anh", "Roma", "T\u00f4ky\u00f4", "Westminster", "Split", "Macon", "California", "Cardiff", "Montr\u00e9al", "Palermo", "Vi\u00ean", "Massachusetts", "Surrey", "Lagos", "Hamburg", "Nga", "Victoria", "Stuttgart", "Anh", "Leipzig", "Vancouver", "Lu\u00e2n \u0110\u00f4n", "Tours", "Newport", "Springfield", "\u00dac", "Ph\u00e1p", "Borneo", "Cardiff", "Ontario", "Norwich", "Pittsburgh", "Lu\u00e2n \u0110\u00f4n"], "subjects": ["Julia Wilson", "Paul Daniels", "Peter Ording", "James William Wallack", "Alastair Gordon", "Eug\u00e9nie S\u00f6derberg", "Wayne Eagling", "Ximena Armas", "Maja Tucholke", "Barry Mitcalfe", "Renato Caccioppoli", "Cliff Jones", "Warington Wilkinson Smyth", "Brett Hayman", "Colin Groves", "David Atkinson", "Arthur Andrews", "Michael Anti", "L\u1ea1i Thanh H\u00e0", "Martin Jacques", "Danielle McGrath", "John Mundy", "Stephen Carr", "Qu\u00e1ch Kim Long", "Lucius Verus", "Elizabeth Kell", "Francis Davis", "Pat Nixon", "Michael Carney", "Wilhelm Boger", "Shyril O'Steen", "Aghasi Khanjian", "Peter Dembicki", "Michael Guider", "Sam Lynch", "Helen Longworth", "John Mundy", "Alec Briggs", "Edward Locke", "Gi\u00e1o ho\u00e0ng Anastasi\u00f4 I", "Maeda Takeshi", "Henry Bentley", "Tomislav Smoljanovi\u0107", "Laurence Stallings", "Penny Lernoux", "Herbert Bowden, Baron Aylestone", "William Reed", "Francesco Musotto", "Marion Stein", "Sarah Stiles", "Rob Heanley", "Jumoke Verissimo", "Frederick Franklin Schrader", "Rosabelle Sinclair", "Murray Hocking", "Bernhard R\u00fchling", "John Joseph Braham, Sr.", "Kirsten Wenzel", "Heather Davis", "Alisa Arnah", "Ren\u00e9 Th\u00e9odore Berthon", "Bernard Lloyd", "Homer Curran", "Claire Baxter", "J.B. 
Jackson", "Michael Matus", "Simon Bowman", "Elizabeth Hess", "Warren Carlyle", "David Scott Milton", "Adrian Bowyer"]}, "place_of_death": {"objects": ["Hawaii", "Edinburgh", "Lyon", "B\u1ec9", "Cambridge", "Lu\u00e2n \u0110\u00f4n", "Manhattan", "\u0110\u00e0i B\u1eafc", "Florida", "Manhattan", "Montr\u00e9al", "Santiago de Chile", "Lu\u00e2n \u0110\u00f4n", "Liverpool", "Toronto", "Yokohama", "Berkeley", "Detroit", "Lille", "Constantinopolis", "Sevilla", "Nice", "Th\u01b0\u1ee3ng H\u1ea3i", "Bologna"], "subjects": ["Donn Lewin", "Henry Siddons", "Joseph Jean-Baptiste Xavier Fournet", "Margaret c\u1ee7a Anh, N\u1eef C\u00f4ng t\u01b0\u1edbc x\u1ee9 Brabant", "Grahame Clark", "Joshua Cristall", "Arthur Siegel", "Th\u1ea9m Ki\u1ebfm H\u1ed3ng", "Jim Chapin", "Der Scutt", "\u00c9douard Gagnon", "Nicolasa Vald\u00e9s", "David Merrick", "Jefferson Lowndes", "Pauline Mills McGibbon", "Vito Positano", "Andrew Imbrie", "Orestes Brownson", "John Shortland", "Abd\u00fcl Mecid I", "Felipe de Le\u00f3n", "Georgi Victorovich Adamovich", "L\u01b0\u01a1ng S\u0129 Di", "Thomas Dempster"]}} -------------------------------------------------------------------------------- /data/GoogleRE_objects/zh.json: -------------------------------------------------------------------------------- 1 | {"date_of_birth": {"objects": [], "subjects": []}, "place_of_birth": {"objects": [], "subjects": []}, "place_of_death": {"objects": [], "subjects": []}} -------------------------------------------------------------------------------- /dataset/cleanup.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | from utils import get_logger 5 | 6 | LOG = get_logger(__name__) 7 | 8 | 9 | def clean_triple(line): 10 | data = json.loads(line) 11 | relevant_keys = {"obj_label", "sub_label", "obj_uri", "sub_uri"} 12 | result = {k: v for k, v in data.items() if k in relevant_keys and data["from_english"] is False} 13 | return result 14 | 15 | 16 | def clean_relation(line): 17 | data = json.loads(line) 18 | relevant_keys = {"relation", "template"} 19 | result = {k: v for k, v in data.items() if k in relevant_keys} 20 | return result 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--infolder", default=None, type=str, required=True, help="") 26 | parser.add_argument("--outfolder", default=None, type=str, required=True, help="") 27 | args = parser.parse_args() 28 | 29 | langs = [x.replace("relations_", "") for x in os.listdir( 30 | os.path.join(args.infolder, "templates")) if "relations_" in x] 31 | relations = [x.replace(".jsonl", "") for x in os.listdir(os.path.join(args.infolder, "en"))] 32 | 33 | for lang in langs: 34 | os.makedirs(os.path.join(args.outfolder, lang)) 35 | 36 | for lang in langs: 37 | LOG.info(lang) 38 | # transfer triples 39 | for relation in relations: 40 | current_path = os.path.join(args.infolder, lang, relation + ".jsonl") 41 | if os.path.exists(current_path): 42 | with open(current_path) as fin: 43 | with open(os.path.join(args.outfolder, lang, relation + ".jsonl"), "w") as fout: 44 | for i, line in enumerate(fin): 45 | triple = clean_triple(line) 46 | if triple: 47 | triple["lineid"] = i 48 | fout.write("{}\n".format(json.dumps(triple))) 49 | # transfer templates 50 | with open(os.path.join(args.outfolder, lang, "templates.jsonl"), "a") as fout: 51 | if os.path.exists(os.path.join(args.infolder, "templates", "relations_{}.jsonl".format(lang))): 52 | with open(os.path.join(args.infolder, 
"templates", "relations_{}.jsonl".format(lang))) as fin: 53 | for line in fin: 54 | template = clean_relation(line) 55 | fout.write("{}\n".format(json.dumps(template))) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /dataset/download_trexentities.py: -------------------------------------------------------------------------------- 1 | from relations import Relations 2 | import argparse 3 | from typing import Text 4 | import requests 5 | import tqdm 6 | import os 7 | import json 8 | from utils import get_logger 9 | 10 | LOG = get_logger(__name__) 11 | 12 | 13 | def download_entity(url: Text, outfile: Text) -> None: 14 | try: 15 | answer = requests.get(url) 16 | with open(outfile, "w") as fp: 17 | fp.write(json.dumps(json.loads(answer.content))) 18 | except Exception as e: 19 | LOG.warning("Getting {} failed.".format(url)) 20 | LOG.warning("Exception: {}.".format(e)) 21 | 22 | 23 | def download_from_wikidata() -> None: 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--datapath", default=None, type=str, required=True, help="") 26 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 27 | parser.add_argument("--use", action="store_true", help="") 28 | args = parser.parse_args() 29 | t = Relations(args.datapath) 30 | filenames = t.get_available_filenames() 31 | t.load_data(filenames) 32 | entities = t.get_all_entities(["obj_uri", "sub_uri"]) 33 | base_url = "https://www.wikidata.org/wiki/Special:EntityData/{}.json" 34 | for entity in tqdm.tqdm(entities): 35 | download_entity(base_url.format(entity), os.path.join(args.outpath, entity + ".json")) 36 | 37 | 38 | if __name__ == '__main__': 39 | download_from_wikidata() 40 | -------------------------------------------------------------------------------- /dataset/download_wikidata.py: -------------------------------------------------------------------------------- 1 | from .data import Relations 2 | import argparse 3 | from typing import Text 4 | import requests 5 | import tqdm 6 | import os 7 | import json 8 | 9 | 10 | def download_entity(url: Text, outfile: Text) -> None: 11 | try: 12 | answer = requests.get(url) 13 | with open(outfile, "w") as fp: 14 | fp.write(json.dumps(json.loads(answer.content))) 15 | except Exception as e: 16 | print("Getting {} failed.".format(url)) 17 | print("Exception: {}.".format(e)) 18 | 19 | 20 | def download_from_wikidata() -> None: 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("--datapath", default=None, type=str, required=True, help="") 23 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 24 | parser.add_argument("--use", action="store_true", help="") 25 | args = parser.parse_args() 26 | t = Relations(args.datapath) 27 | filenames = t.get_available_filenames() 28 | t.load_data(filenames) 29 | entities = t.get_all_entities(["obj_uri", "sub_uri"]) 30 | base_url = "https://www.wikidata.org/wiki/Special:EntityData/{}.json" 31 | for entity in tqdm.tqdm(entities): 32 | download_entity(base_url.format(entity), os.path.join(args.outpath, entity + ".json")) 33 | 34 | 35 | if __name__ == '__main__': 36 | download_from_wikidata() 37 | -------------------------------------------------------------------------------- /dataset/mbertlangs.txt: -------------------------------------------------------------------------------- 1 | wiki name iso googletranslate 2 | af Afrikaans afr af 3 | sq Albanian sqi sq 4 | ar Arabic ara,arb ar 5 | an Aragonese 
arg 6 | hy Armenian hye hy 7 | ast Asturian ast 8 | az Azerbaijani aze az 9 | ba Bashkir bak 10 | eu Basque eus eu 11 | bar Bavarian bar 12 | be Belarusian bel be 13 | bn Bengali ben bn 14 | bpy Bishnupriya Manipuri bpy 15 | bs Bosnian bos bs 16 | br Breton bre 17 | bg Bulgarian bul bg 18 | my Burmese mya 19 | ca Catalan cat ca 20 | ceb Cebuano ceb ceb 21 | ce Chechen che 22 | zh Chinese (Simplified) zho zh-CN 23 | zh-classical Chinese (Traditional) lzh zh-TW 24 | cv Chuvash chv 25 | hr Croatian hrv hr 26 | cs Czech ces cs 27 | da Danish dan da 28 | nl Dutch nld nl 29 | en English eng en 30 | et Estonian est et 31 | fi Finnish fin fi 32 | fr French fra fr 33 | gl Galician glg gl 34 | ka Georgian kat ka 35 | de German deu de 36 | el Greek ell el 37 | gu Gujarati guj gu 38 | ht Haitian hat ht 39 | he Hebrew heb he 40 | hi Hindi hin hi 41 | hu Hungarian hun hu 42 | is Icelandic isl is 43 | io Ido ido 44 | id Indonesian ind id 45 | ga Irish gle ga 46 | it Italian ita it 47 | ja Japanese jpn ja 48 | jv Javanese jav jv 49 | kn Kannada kan kn 50 | kk Kazakh kaz kk 51 | ky Kirghiz kir 52 | ko Korean kor ko 53 | la Latin lat la 54 | lv Latvian lav lv 55 | lt Lithuanian lit lt 56 | lmo Lombard lmo 57 | nds Low Saxon nds 58 | lb Luxembourgish ltz lb 59 | mk Macedonian mkd mk 60 | mg Malagasy mlg,plt mg 61 | ms Malay msa ms 62 | ml Malayalam mal ml 63 | mr Marathi mar mr 64 | min Minangkabau min 65 | ne Nepali nep ne 66 | new Newar new 67 | no Norwegian (Bokmal) nob no 68 | nn Norwegian (Nynorsk) nno 69 | oc Occitan oci 70 | fa Persian (Farsi) fas,pes fa 71 | pms Piedmontese pms 72 | pl Polish pol pl 73 | pt Portuguese por pt 74 | pa Punjabi pan pa 75 | ro Romanian ron ro 76 | ru Russian rus ru 77 | sco Scots sco 78 | sr Serbian srp sr 79 | sh Serbo-Croatian hbs,srp,hrv 80 | scn Sicilian scn 81 | sk Slovak slk sk 82 | sl Slovenian slv sl 83 | azb South Azerbaijani azb 84 | es Spanish spa es 85 | su Sundanese sun su 86 | sw Swahili swh sw 87 | sv Swedish swe sv 88 | tl Tagalog tgl tl 89 | tg Tajik tgk tg 90 | ta Tamil tam ta 91 | tt Tatar tat tt 92 | te Telugu tel te 93 | tr Turkish tur tr 94 | uk Ukrainian ukr uk 95 | ur Urdu urd ur 96 | uz Uzbek uzb uz 97 | vi Vietnamese vie vi 98 | vo Volapük vol 99 | war Waray-Waray war 100 | cy Welsh cym cy 101 | fy West Frisian fry fy 102 | pnb Western Punjabi pnb,pan 103 | yo Yoruba yor yo 104 | th Thai tha th 105 | mn Mongolian mon,khk -------------------------------------------------------------------------------- /dataset/mlama.sh: -------------------------------------------------------------------------------- 1 | WORKDIR="/mounts/work/philipp/tmp/mlama" 2 | 3 | # 1. Download TREx and GoogleRE 4 | wget https://dl.fbaipublicfiles.com/LAMA/data.zip -P ${WORKDIR} 5 | unzip ${WORKDIR}/data.zip -d ${WORKDIR} && rm ${WORKDIR}/data.zip 6 | 7 | # 2. Translate TREx 8 | 9 | # download entity data 10 | mkdir -p ${WORKDIR}/data/wikidata_entities 11 | 12 | python download_trexentities.py \ 13 | --datapath ${WORKDIR}/data/TREx \ 14 | --outpath ${WORKDIR}/data/wikidata_entities 15 | 16 | # create multilingual json files 17 | mkdir -p ${WORKDIR}/data/multilingual 18 | python translate_trex.py \ 19 | --data ${WORKDIR}/data/TREx \ 20 | --entities ${WORKDIR}/data/wikidata_entities \ 21 | --outpath ${WORKDIR}/data/multilingual \ 22 | --languagemapping mbertlangs.txt 23 | 24 | 25 | # 3. 
Translate GoogleRE 26 | # You will need a valid Google Knowledge Graph API key in the environment variable `GOOGLEAPIKEY for this section 27 | mv ${WORKDIR}/data/Google_RE/date_of_birth_test.jsonl ${WORKDIR}/data/Google_RE/date_of_birth.jsonl 28 | mv ${WORKDIR}/data/Google_RE/place_of_birth_test.jsonl ${WORKDIR}/data/Google_RE/place_of_birth.jsonl 29 | mv ${WORKDIR}/data/Google_RE/place_of_death_test.jsonl ${WORKDIR}/data/Google_RE/place_of_death.jsonl 30 | 31 | for relation in date_of_birth place_of_death 32 | do 33 | python translate_googlere.py \ 34 | --inputpath ${WORKDIR}/data/Google_RE \ 35 | --relation ${relation} \ 36 | --outpath ${WORKDIR}/data/multilingual \ 37 | --languagemapping mbertlangs.txt 38 | done 39 | 40 | # 4.1. Translate Templates TREx 41 | mkdir -p ${WORKDIR}/data/multilingual/templates_original 42 | python translate_templates.py translate \ 43 | --templates ${WORKDIR}/data/relations.jsonl \ 44 | --outfile ${WORKDIR}/data/multilingual/templates_original \ 45 | --languagemapping mbertlangs.txt 46 | 47 | 48 | # 4.2. Translate Templates GoogleRE 49 | # manually copy the two googlere relations templates and translate them 50 | python translate_templates.py translate \ 51 | --templates ${WORKDIR}/data/relations_googlere.jsonl \ 52 | --outfile ${WORKDIR}/data/multilingual/templates_original \ 53 | --languagemapping mbertlangs.txt 54 | 55 | # 4.3. Clean Templates in place 56 | cp -r ${WORKDIR}/data/multilingual/templates_original ${WORKDIR}/data/multilingual/templates 57 | python translate_templates.py clean \ 58 | --templates ${WORKDIR}/data/multilingual/templates 59 | 60 | # 5. Copy each template json into the language folder 61 | mkdir -p ${WORKDIR}/data_clean 62 | python cleanup.py \ 63 | --infolder ${WORKDIR}/data/multilingual \ 64 | --outfolder ${WORKDIR}/data_clean 65 | 66 | # 6. Load mLAMA 67 | python reader.py --path ${WORKDIR}/data_clean/ 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /dataset/reader.py: -------------------------------------------------------------------------------- 1 | from typing import Text, List, Set, Any, Text, Dict 2 | import os 3 | import json 4 | 5 | 6 | class MLama(object): 7 | """docstring for MLama""" 8 | 9 | def __init__(self, path: Text) -> None: 10 | super(MLama, self).__init__() 11 | self.path = path 12 | self.data = {} 13 | 14 | def get_all_languages(self) -> List[Text]: 15 | # not for all languages templates are available. 
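        # (clarifying note, not in the original file: get_all_languages() simply lists the
        #  directories found under self.path, so it may include languages without a usable
        #  templates.jsonl; get_official_languages() below returns the 53 languages that
        #  ship with the released mLAMA data.)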
16 | return os.listdir(self.path) 17 | 18 | def get_official_languages(self) -> List[Text]: 19 | return ["ca", "az", "en", "ar", "uk", "fa", "tr", "it", "el", "ru", "hr", "hi", "sv", "sq", "fr", "ga", "eu", "de", "nl", "et", "he", "es", "bn", "ms", "sr", 20 | "hy", "ur", "hu", "la", "sl", "cs", "af", "gl", "fi", "ro", "ko", "cy", "th", "be", "id", "pt", "vi", "ka", "ja", "da", "bg", "zh", "pl", "lv", "sk", "lt", "ta", "ceb"] 21 | 22 | def get_relations(self, language) -> List[Text]: 23 | files = os.listdir(os.path.join(self.path, language)) 24 | return [file.replace(".jsonl", "") for file in files if file != "templates.jsonl"] 25 | 26 | @staticmethod 27 | def _load_templates(path: Text) -> Dict[Text, Text]: 28 | templates = {} 29 | with open(path) as fp: 30 | for line in fp: 31 | line = json.loads(line) 32 | templates[line["relation"]] = line["template"] 33 | return templates 34 | 35 | @staticmethod 36 | def _load_triples(path: Text) -> Dict[Text, Dict[Text, Text]]: 37 | triples = {} 38 | with open(path) as fp: 39 | for line in fp: 40 | line = json.loads(line) 41 | triples[line["lineid"]] = line 42 | return triples 43 | 44 | def load(self, languages: List[Text] = [], relations: List[Text] = []) -> None: 45 | self.data = {} 46 | if not languages: 47 | languages = self.get_official_languages() 48 | for language in languages: 49 | self.data[language] = {} 50 | if not relations: 51 | relations = self.get_relations(language) 52 | templates = self._load_templates(os.path.join(self.path, language, "templates.jsonl")) 53 | for relation in relations: 54 | self.data[language][relation] = {} 55 | if relation not in templates: 56 | print("Template missing for relation {} in language {}.".format(relation, language)) 57 | self.data[language][relation]["template"] = templates.get(relation, "") 58 | self.data[language][relation]["triples"] = self._load_triples( 59 | os.path.join(self.path, language, relation + ".jsonl")) 60 | 61 | @staticmethod 62 | def is_valid_template(template: Text) -> bool: 63 | return ("[X]" in template and "[Y]" in template) 64 | 65 | def _fill_templates(self, template: Text, triples: Dict[Text, Dict[Text, Text]], mode: Text) -> Dict[Text, Text]: 66 | ''' 67 | mode in ["x", "y", "xy"] 68 | ''' 69 | if not self.is_valid_template(template): 70 | print("Invalid template: {}".format(template)) 71 | return {} 72 | else: 73 | filled_templates = {} 74 | for triple_id, triple in triples.items(): 75 | filled_templates[triple_id] = template 76 | if "x" in mode: 77 | filled_templates[triple_id] = filled_templates[triple_id].replace("[X]", triple["sub_label"]) 78 | if "y" in mode: 79 | filled_templates[triple_id] = filled_templates[triple_id].replace("[Y]", triple["obj_label"]) 80 | return filled_templates 81 | 82 | def fill_all_templates(self, mode: Text): 83 | for language in self.data: 84 | for relation in self.data[language]: 85 | self.data[language][relation]["filled_templates"] = self._fill_templates( 86 | self.data[language][relation]["template"], self.data[language][relation]["triples"], mode) 87 | 88 | 89 | def view_sample(): 90 | import random 91 | # prints a part of a latex table 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument("--path", default=None, type=str, required=True, help="") 94 | args = parser.parse_args() 95 | ml = MLama(args.path) 96 | ml.load() 97 | ml.fill_all_templates("xy") 98 | for lang in ml.data: 99 | all_instances = [] 100 | for relation in ml.data[lang]: 101 | all_instances.extend(ml.data[lang][relation]["filled_templates"].values()) 102 | 
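        # draw three filled templates per language for the LaTeX sample table
        # (random.sample assumes at least three instances are available per language)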
examples = random.sample(all_instances, 3) 103 | print("\\multirow{{3}}{{0.3cm}}{{{}}}".format(lang), end="") 104 | for example in examples: 105 | print(" & {}\\\\".format(example)) 106 | 107 | 108 | if __name__ == '__main__': 109 | view_sample() 110 | -------------------------------------------------------------------------------- /dataset/relations.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Text, List, Set 3 | import collections 4 | import tqdm 5 | import json 6 | 7 | 8 | class Relations(object): 9 | """docstring for Relations""" 10 | 11 | def __init__(self, path: Text, suffix: Text = ".jsonl") -> None: 12 | self.path = path 13 | self.suffix = suffix 14 | self.data = collections.defaultdict(list) 15 | 16 | def get_available_filenames(self) -> List[Text]: 17 | filenames = [] 18 | for file in os.listdir(self.path): 19 | filenames.append(file.replace(self.suffix, "")) 20 | return filenames 21 | 22 | def load_data(self, filenames: List[Text]) -> None: 23 | for filename in tqdm.tqdm(filenames): 24 | with open(os.path.join(self.path, filename + self.suffix)) as fp: 25 | for line in fp: 26 | if line: 27 | self.data[filename].append(json.loads(line)) 28 | 29 | def get_all_entities(self, fields: List[Text]) -> Set[Text]: 30 | entities = set() 31 | for filename, triples in self.data.items(): 32 | for triple in triples: 33 | for field in fields: 34 | if field in triple: 35 | entities.add(triple[field].strip()) 36 | return entities 37 | 38 | -------------------------------------------------------------------------------- /dataset/requirements.txt: -------------------------------------------------------------------------------- 1 | googletrans>=3.1.0a0 -------------------------------------------------------------------------------- /dataset/translate_googlere.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Text, Dict, Set, Any 3 | import json 4 | import requests 5 | import os 6 | from tqdm import tqdm 7 | from utils import get_logger, load_languagemapping 8 | 9 | LOG = get_logger(__name__) 10 | 11 | 12 | 13 | 14 | 15 | def translate(kgids: Set[Text], targetlang: Text, key: Text) -> Dict[Text, Text]: 16 | translations = {} 17 | kgids = list(kgids) 18 | kgids = [x for x in kgids if x.startswith("/m/")] 19 | batch_size = 16 20 | for i in tqdm(range(0, len(kgids), batch_size)): 21 | response = requests.get("https://kgsearch.googleapis.com/v1/entities:search", {"key": key, "languages": targetlang, "ids": kgids[i:i + batch_size]}) 22 | if response.status_code == 200: 23 | result = json.loads(response.content) 24 | for elem in result['itemListElement']: 25 | kgid = elem["result"]["@id"].replace("kg:", "") 26 | name = elem["result"]["name"] 27 | translations[kgid] = name 28 | else: 29 | LOG.warning("Wrong status code: {}".format(response)) 30 | break 31 | return translations 32 | 33 | 34 | def get_translation(current_id: Text, translations: Dict[Text, Text], triple: Dict[Text, Any]) -> Dict[Text, Any]: 35 | if current_id.startswith("/m/"): 36 | if current_id in translations: 37 | sub_translated = translations[current_id] 38 | else: 39 | sub_translated = None 40 | else: 41 | if triple["sub"] != triple["sub_label"]: 42 | sub_translated = None 43 | else: 44 | sub_translated = triple["sub"] 45 | return sub_translated 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--inputpath", default=None, type=str, 
required=True, help="") 51 | parser.add_argument("--relation", default=None, type=str, required=True, help="") 52 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 53 | parser.add_argument("--languagemapping", default=None, type=str, required=True, help="") 54 | args = parser.parse_args() 55 | key = os.environ["GOOGLEAPIKEY"] 56 | lang2translateid = load_languagemapping(args.languagemapping) 57 | triples = [] 58 | with open(os.path.join(args.inputpath, args.relation + ".jsonl")) as fp: 59 | for line in fp: 60 | if line.strip(): 61 | triples.append(json.loads(line)) 62 | 63 | kgids = set() 64 | for triple in triples: 65 | if "sub" in triple: 66 | kgids.add(triple["sub"]) 67 | if "obj" in triple: 68 | kgids.add(triple["obj"]) 69 | 70 | for langid, googleid in lang2translateid.items(): 71 | LOG.info(langid) 72 | translations = translate(kgids, googleid, key) 73 | result = [] 74 | for triple in triples: 75 | if "sub" not in triple or "obj" not in triple or "sub_label" not in triple or "obj_label" not in triple: 76 | triple["from_english"] = True 77 | result.append(triple) 78 | else: 79 | subid = triple["sub"] 80 | objid = triple["obj"] 81 | sub_translated = get_translation(subid, translations, triple) 82 | obj_translated = get_translation(objid, translations, triple) 83 | if sub_translated is None or obj_translated is None: 84 | triple["from_english"] = True 85 | result.append(triple) 86 | else: 87 | triple["from_english"] = False 88 | triple["sub_label"] = sub_translated 89 | triple["obj_label"] = obj_translated 90 | result.append(triple) 91 | os.makedirs(os.path.join(args.outpath, langid), exist_ok=True) 92 | with open(os.path.join(args.outpath, langid, args.relation + ".jsonl"), "w") as fout: 93 | for triple in result: 94 | fout.write(json.dumps(triple) + "\n") 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /dataset/translate_templates.py: -------------------------------------------------------------------------------- 1 | from googletrans import Translator 2 | import json 3 | import os 4 | import argparse 5 | from utils import get_logger 6 | 7 | LOG = get_logger(__name__) 8 | 9 | 10 | def fix_template(template, lang): 11 | # general rules 12 | if "[X]" not in template: 13 | template = template.replace("X", "[X]", 1) 14 | if "[Y]" not in template: 15 | template = template.replace("Y", "[Y]", 1) 16 | template = template.replace("[Y ]", "[Y] ", 1) 17 | 18 | if lang == "tl": 19 | template = template.replace("Naglalaro ang [X] sa posisyon.", "Naglalaro si [X] sa posisyon na [Y]", 1) 20 | template = template.replace("Sumali sa [X] ang [X].", "Sumali ang [X] sa [Y].", 1) 21 | template = template.replace("Naglalaro ang [X] ng musika.", "Naglalaro si [X] ng [Y] musika.", 1) 22 | template = template.replace("Naglalaro ang [X].", "Ginawa ni [X] ang [Y].", 1) 23 | if lang == "el": 24 | template = template.replace("[Χ]", "[X]", 1) 25 | template = template.replace("[Υ]", "[Y]", 1) 26 | if "[Y]" in template and "[X]" not in template: 27 | template = template.replace("[Ο]", "[X]", 1) 28 | if "[X]" in template and "[Y]" not in template: 29 | template = template.replace("[Ο]", "[Y]", 1) 30 | if lang == "ceb": 31 | # to be checked 32 | template = template.replace("Natawo sa [Y].", "Natawo ang [X] sa [Y].", 1) 33 | template = template.replace("Nag-apil sa [X] ang [X].", "Ang [X] miapil sa [Y].", 1) 34 | 35 | if lang == "pa": 36 | template = template.replace("[ਐਕਸ]", "[X]", 1) 37 | 
template = template.replace("[ਵਾਈ]", "[Y]", 1) 38 | if lang == "ta": 39 | template = template.replace("[எக்ஸ்]", "[X]", 1) 40 | template = template.replace("[ஒய்]", "[Y]", 1) 41 | if lang == "mg": 42 | template = template.replace( 43 | "Tamin'ny voalohany, nalefan'i [Y] tany am-boalohany.", "Tamin'ny voalohany, ny X [X] dia nalefa tamin'ny [Y].", 1) 44 | if lang == "gu": 45 | template = template.replace("[એક્સ]", "[X]", 1) 46 | template = template.replace("[વાય]", "[Y]", 1) 47 | if lang == "mr": 48 | template = template.replace("[एक्स]", "[X]", 1) 49 | template = template.replace("[वाई]", "[Y]", 1) 50 | template = template.replace("[वाय]", "[Y]", 1) 51 | if lang == "sr": 52 | template = template.replace("[Кс]", "[X]", 1) 53 | template = template.replace("[И]", "[Y]", 1) 54 | template = template.replace("[X] је рођен у И.", "[X] је рођен у [Y].", 1) 55 | if lang == "kk": 56 | template = template.replace("[Х] университетте білім алған.", "[X] [Y] университетінде білім алған.", 1) 57 | template = template.replace("Ана тілі [Х] болып табылады.", "[Х] -дің ана тілі - [Y].", 1) 58 | template = template.replace("[Х]", "[X]", 1) 59 | template = template.replace("[Y]", "[Y]", 1) 60 | if lang == "kn": 61 | template = template.replace("[ಎಕ್ಸ್]", "[X]", 1) 62 | template = template.replace("[ವೈ]", "[Y]", 1) 63 | if lang == "ne": 64 | template = template.replace("[एक्स]", "[X]", 1) 65 | template = template.replace("[Y]", "[Y]", 1) 66 | if lang == "hy": 67 | template = template.replace("[X]", "[X]", 1) 68 | template = template.replace("[Յ]", "[Y]", 1) 69 | if lang == "uz": 70 | template = template.replace("[X] universitetida tahsil olgan.", "[X] [Y] universitetida tahsil olgan.", 1) 71 | template = template.replace("[X] din bilan bog'liq.", "[X] [Y] diniga mansub.", 1) 72 | if lang == "tg": 73 | template = template.replace("[X] аз рӯи касб аст.", "[X] аз рӯи касб [Y] аст.", 1) 74 | template = template.replace("[Ю]", "[Y]", 1) 75 | template = template.replace("[Х]", "[X]", 1) 76 | template = template.replace("[Y]", "[Y]", 1) 77 | if lang == "lt": 78 | template = template.replace( 79 | "Buvo įgijęs išsilavinimą [Y] universitete.", "[X] įgijo išsilavinimą [Y] universitete.", 1) 80 | if lang == "bn": 81 | template = template.replace("[এক্স]", "[X]", 1) 82 | template = template.replace("[ওয়াই]", "[Y]", 1) 83 | if lang == "la": 84 | template = template.replace("[K]", "[Y]", 1) 85 | template = template.replace("[A]", "[Y]", 1) 86 | template = template.replace("[N]", "[Y]", 1) 87 | template = template.replace("[V]", "[Y]", 1) 88 | template = template.replace("[ego]", "[Y]", 1) 89 | template = template.replace("[Ego]", "[Y]", 1) 90 | if lang == "hi": 91 | if "[X]" not in template: 92 | template = template.replace("[एक्स]", "[X]", 1) 93 | if "[Y]" not in template: 94 | template = template.replace("[वाई]", "[Y]", 1) 95 | return template 96 | 97 | 98 | def clean(args): 99 | to_fix = [] 100 | broken = 0 101 | for file in os.listdir(args.templates): 102 | with open(os.path.join(args.templates, file), "r") as fp: 103 | for line in fp: 104 | if line: 105 | template = json.loads(line) 106 | #lang = file.replace(".jsonl", "").split("_")[-1] 107 | #template["template"] = fix_template(template["template"], lang) 108 | if template["template"].count("[X]") != 1 or template["template"].count("[Y]") != 1: 109 | LOG.warning("Broken Template {} {} {}".format(file, template["relation"], template["template"])) 110 | to_fix.append(file) 111 | broken += 1 112 | to_fix = set(to_fix) 113 | LOG.info("Fixing {} broken templates across 
{} languages.".format(broken, len(to_fix))) 114 | for file in to_fix: 115 | with open(os.path.join(args.templates, file), "r") as fp: 116 | fixed_templates = [] 117 | for line in fp: 118 | if line: 119 | template = json.loads(line) 120 | lang = file.replace(".jsonl", "").split("_")[-1] 121 | if template["template"].count("[X]") != 1 or template["template"].count("[Y]") != 1: 122 | template["template"] = fix_template(template["template"], lang) 123 | fixed_templates.append(template) 124 | with open(os.path.join(args.templates, file), "w") as fp: 125 | for line in fixed_templates: 126 | fp.write(json.dumps(line) + "\n") 127 | 128 | 129 | def translate(args): 130 | lang2translateid = {} 131 | with open(args.languagemapping) as fp: 132 | next(fp) 133 | for line in fp: 134 | if line: 135 | wikiid, _, _, googleid = line.split("\t") 136 | if not googleid: 137 | # try the other id and see what comes out of goole translate 138 | googleid = wikiid 139 | lang2translateid[wikiid.strip()] = googleid.strip() 140 | 141 | templates = [] 142 | with open(args.templates) as fp: 143 | for line in fp: 144 | if line: 145 | templates.append(json.loads(line)) 146 | 147 | # get translations 148 | for wikiid, googleid in lang2translateid.items(): 149 | LOG.info("TRANSLATING {}".format(wikiid)) 150 | translated = [] 151 | for template in templates: 152 | try: 153 | translator = Translator() 154 | result = translator.translate(template["template"], src="en", dest=googleid) 155 | translated_template = template.copy() 156 | translated_template["template"] = result.text 157 | translated.append(translated_template) 158 | except Exception as e: 159 | LOG.info("Exception: {}".format(e)) 160 | if len(translated) != len(templates): 161 | LOG.warning("Not all translations succesful!") 162 | LOG.warning("Skipping language") 163 | else: 164 | # write out 165 | with open(os.path.join(args.outfile, "relations_{}.jsonl".format(wikiid)), "w") as fout: 166 | for template in translated: 167 | fout.write("{}\n".format(json.dumps(template))) 168 | 169 | 170 | if __name__ == '__main__': 171 | parser = argparse.ArgumentParser() 172 | subparsers = parser.add_subparsers() 173 | 174 | parser_translate = subparsers.add_parser('translate') 175 | parser_translate.set_defaults(func=translate) 176 | parser_translate.add_argument("--templates", default=None, type=str, required=True, help="") 177 | parser_translate.add_argument("--languagemapping", default=None, type=str, required=True, help="") 178 | parser_translate.add_argument("--outfile", default=None, type=str, required=True, help="") 179 | 180 | parser_clean = subparsers.add_parser('clean') 181 | parser_clean.set_defaults(func=clean) 182 | parser_clean.add_argument("--templates", default=None, type=str, required=True, help="") 183 | 184 | args = parser.parse_args() 185 | args.func(args) 186 | -------------------------------------------------------------------------------- /dataset/translate_trex.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from relations import Relations 3 | from typing import Text 4 | import tqdm 5 | import os 6 | import json 7 | import collections 8 | from utils import get_logger, load_languagemapping 9 | 10 | LOG = get_logger(__name__) 11 | 12 | 13 | def get_entity_surface(basepath: Text, uri: Text, language: Text) -> Text: 14 | try: 15 | with open(os.path.join(basepath, uri + ".json")) as fp: 16 | data = json.load(fp) 17 | 18 | surfaces = data['entities'][uri]['labels'] 19 | if language in surfaces: 20 | if 
surfaces[language]["language"] != language: 21 | raise Warning("Language mismatch in data: {}".format(surfaces)) 22 | return surfaces[language]["value"] 23 | else: 24 | return "" 25 | except Exception as e: 26 | print("Exception: {} (probably entity file does not exist).".format(e)) 27 | return "" 28 | 29 | 30 | def main(): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--data", default=None, type=str, required=True, help="") 33 | parser.add_argument("--entities", default=None, type=str, required=True, help="") 34 | parser.add_argument("--outpath", default=None, type=str, required=True, help="") 35 | parser.add_argument("--languagemapping", default=None, type=str, required=True, help="") 36 | args = parser.parse_args() 37 | lang2translateid = load_languagemapping(args.languagemapping) 38 | 39 | for lang in lang2translateid: 40 | t = Relations(args.data) 41 | filenames = t.get_available_filenames() 42 | t.load_data(filenames) 43 | count = collections.Counter() 44 | logfile = open(os.path.join(args.outpath, lang + ".log"), "w") 45 | for filename, relations in t.data.items(): 46 | LOG.info("Processing relation: {}".format(filename)) 47 | outdirectory = os.path.join(args.outpath, lang) 48 | os.makedirs(outdirectory, exist_ok=True) 49 | with open(os.path.join(outdirectory, filename + ".jsonl"), "w") as fout: 50 | for relation in relations: 51 | count["in_file"] += 1 52 | if ("sub_uri" in relation and "obj_uri" in relation and "sub_label" in relation and "obj_label" in relation): 53 | count["available"] += 1 54 | obj_uri = relation["obj_uri"] 55 | sub_uri = relation["sub_uri"] 56 | # load entitiy information 57 | obj_surface = get_entity_surface(args.entities, obj_uri, lang) 58 | sub_surface = get_entity_surface(args.entities, sub_uri, lang) 59 | # write out 60 | if obj_surface and sub_surface: 61 | count["converted"] += 1 62 | to_write = {"sub_uri": sub_uri, "obj_uri": obj_uri, 63 | "obj_label": obj_surface, "sub_label": sub_surface, "from_english": False} 64 | else: 65 | # use english surface forms 66 | to_write = {"sub_uri": sub_uri, "obj_uri": obj_uri, 67 | "obj_label": relation["obj_label"], "sub_label": relation["sub_label"], "from_english": True} 68 | fout.write(json.dumps(to_write) + "\n") 69 | summary = "{}|{}|{}|(converted/available/in_file)".format(count["converted"], count["available"], count["in_file"]) 70 | LOG.info(summary) 71 | logfile.write("{}|{}\n".format(filename, summary)) 72 | logfile.close() 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /dataset/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import logging 3 | 4 | 5 | def rec_dd(): 6 | return collections.defaultdict(rec_dd) 7 | 8 | 9 | def load_languagemapping(path): 10 | lang2translateid = {} 11 | with open(path) as fp: 12 | next(fp) 13 | for line in fp: 14 | if line: 15 | wikiid, _, _, googleid = line.split("\t") 16 | if not googleid: 17 | # try the other id and see what comes out of google translate 18 | googleid = wikiid 19 | lang2translateid[wikiid.strip()] = googleid.strip() 20 | return lang2translateid 21 | 22 | 23 | def get_logger(name, filename=None, level=logging.DEBUG): 24 | logger = logging.getLogger(name) 25 | logger.setLevel(level) 26 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 27 | 28 | ch = logging.StreamHandler() 29 | ch.setLevel(level) 30 | ch.setFormatter(formatter) 31 | 
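    # attach the console handler; a file handler is added below only if a filename is given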
logger.addHandler(ch) 32 | 33 | if filename is not None: 34 | fh = logging.FileHandler(filename) 35 | fh.setLevel(level) 36 | fh.setFormatter(formatter) 37 | logger.addHandler(fh) 38 | return logger -------------------------------------------------------------------------------- /mlama/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mlama/build_encoded_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import pickle as pkl 8 | from tqdm import tqdm 9 | try: 10 | import ujson as json 11 | except ImportError: 12 | import json 13 | import collections 14 | import torch 15 | from mlama.modules import build_model_by_name 16 | 17 | # A tuple containing a single example from the input dataset with sentences 18 | # mapped into a sequence of vectors: 19 | # embeddings: tensor with shape (some_length, embedding_dim). 20 | # Note that some_length differs from example to example, while 21 | # embedding_dim is the same for all examples for the encoded dataset. 22 | EncodedSentence = collections.namedtuple('EncodedSentence', 23 | 'embedding, length, tokens') 24 | 25 | 26 | class EncodedDataset(torch.utils.data.Dataset): 27 | 28 | def __init__(self, encoded_sentences=None): 29 | if encoded_sentences: 30 | # make sure encoded_sentences is a list of (embedding, length, tokens) tuples 31 | assert isinstance(encoded_sentences, list) 32 | sample = encoded_sentences[0] 33 | assert len(sample) == 3 34 | assert isinstance(sample[0], torch.Tensor) 35 | self._encodings = encoded_sentences 36 | else: 37 | self._encodings = [] 38 | 39 | def __len__(self): 40 | return len(self._encodings) 41 | 42 | def __getitem__(self, idx): 43 | encoding = self._encodings[idx] 44 | embedding, sent_length, tokens = encoding 45 | 46 | return EncodedSentence(embedding=embedding, length=sent_length, tokens=tokens) 47 | 48 | def save(self, path): 49 | with open(path, 'wb') as f: 50 | pkl.dump(self._encodings, f) 51 | 52 | def load(self, path): 53 | """ Read precomputed contextual embeddings from file 54 | 55 | :param path: path to the embedding file (pickle format) 56 | """ 57 | with open(path, 'rb') as f: 58 | self._encodings = pkl.load(f) 59 | 60 | 61 | def load_encoded_dataset(path): 62 | dataset = EncodedDataset() 63 | dataset.load(path) 64 | return dataset 65 | 66 | 67 | def _batchify(sentences, batch_size): 68 | start = 0 69 | while start < len(sentences): 70 | yield sentences[start:start + batch_size] 71 | start += batch_size 72 | 73 | 74 | def _aggregate_layers(embeddings): 75 | """ Average over all layers """ 76 | new_embed = torch.stack(embeddings, 0) # [#layers, #batchsize, #max_sent_len, #dim] 77 | agg_embed = torch.mean(new_embed, 0) # [#batchsize, #max_sent_len, #dim] 78 | return agg_embed 79 | 80 | 81 | def encode(args, sentences, sort_input=False): 82 | """Create an EncodedDataset from a list of sentences 83 | 84 | Parameters: 85 | sentences (list[list[string]]): list of elements.
Each element is a list 86 | that contains either a single sentence 87 | or two sentences 88 | sort_input (bool): if true, sort sentences by number of tokens in them 89 | 90 | Returns: 91 | dataset (EncodedDataset): an object that contains the contextual 92 | representations of the input sentences 93 | """ 94 | print("Language Models: {}".format(args.lm)) 95 | model = build_model_by_name(args.lm, args) 96 | 97 | # sort sentences by number of tokens so that all batches contain 98 | # sentences with a similar number of tokens 99 | if sort_input: 100 | sentences = sorted(sentences, key=lambda k: len(" ".join(k).split())) 101 | 102 | encoded_sents = [] 103 | for current_batch in tqdm(_batchify(sentences, args.batch_size)): 104 | embeddings, sent_lens, tokenized_sents = model.get_contextual_embeddings(current_batch) 105 | 106 | agg_embeddings = _aggregate_layers(embeddings) # [#batchsize, #max_sent_len, #dim] 107 | sent_embeddings = [agg_embeddings[i, :l] for i, l in enumerate(sent_lens)] 108 | encoded_sents.extend(list(zip(sent_embeddings, sent_lens, tokenized_sents))) 109 | 110 | dataset = EncodedDataset(encoded_sents) 111 | return dataset -------------------------------------------------------------------------------- /mlama/eval_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | from lama.utils import print_sentence_predictions, load_vocab 9 | import lama.options as options 10 | import lama.evaluation_metrics as evaluation_metrics 11 | 12 | 13 | def main(args): 14 | 15 | if not args.text and not args.interactive: 16 | msg = "ERROR: either run LAMA eval_generation with the " \ 17 | "interactive option (--i) or pass a piece of text as input (--t)" 18 | raise ValueError(msg) 19 | 20 | stopping_condition = True 21 | 22 | print("Language Models: {}".format(args.models_names)) 23 | 24 | models = {} 25 | for lm in args.models_names: 26 | models[lm] = build_model_by_name(lm, args) 27 | 28 | vocab_subset = None 29 | if args.common_vocab_filename is not None: 30 | common_vocab = load_vocab(args.common_vocab_filename) 31 | print("common vocabulary size: {}".format(len(common_vocab))) 32 | vocab_subset = [x for x in common_vocab] 33 | 34 | while stopping_condition: 35 | if args.text: 36 | text = args.text 37 | stopping_condition = False 38 | else: 39 | text = input("insert text:") 40 | 41 | if args.split_sentence: 42 | import spacy 43 | # use spacy to tokenize input sentence 44 | nlp = spacy.load(args.spacy_model) 45 | tokens = nlp(text) 46 | print(tokens) 47 | sentences = [] 48 | for s in tokens.sents: 49 | print(" - {}".format(s)) 50 | sentences.append(s.text) 51 | else: 52 | sentences = [text] 53 | 54 | if len(sentences) > 2: 55 | print("WARNING: only the first two sentences in the text will be considered!") 56 | sentences = sentences[:2] 57 | 58 | for model_name, model in models.items(): 59 | print("\n{}:".format(model_name)) 60 | original_log_probs_list, [token_ids], [masked_indices] = model.get_batch_generation([sentences], try_cuda=False) 61 | 62 | index_list = None 63 | if vocab_subset is not None: 64 | # filter log_probs 65 | filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs(vocab_subset) 66 | filtered_log_probs_list =
model.filter_logprobs(original_log_probs_list, filter_logprob_indices) 67 | else: 68 | filtered_log_probs_list = original_log_probs_list 69 | 70 | # rank over the subset of the vocab (if defined) for the SINGLE masked tokens 71 | if masked_indices and len(masked_indices) > 0: 72 | evaluation_metrics.get_ranking(filtered_log_probs_list[0], masked_indices, model.vocab, index_list=index_list) 73 | 74 | # prediction and perplexity for the whole softmax 75 | print_sentence_predictions(original_log_probs_list[0], token_ids, model.vocab, masked_indices=masked_indices) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = options.get_eval_generation_parser() 80 | args = options.parse_args(parser) 81 | main(args) 82 | -------------------------------------------------------------------------------- /mlama/evaluation_metrics_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import torch 8 | import numpy as np 9 | import scipy 10 | 11 | 12 | def __max_probs_values_indices(masked_indices, log_probs, topk=1000): 13 | 14 | masked_index = masked_indices 15 | 16 | objects = candidates[len(masked_index)] 17 | 18 | log_probs = log_probs[masked_index] 19 | 20 | value_max_probs, index_max_probs = torch.topk(input=log_probs,k=topk,dim=1) 21 | index_max_probs = index_max_probs.numpy().astype(int) 22 | value_max_probs = value_max_probs.detach().numpy() 23 | 24 | return log_probs, index_max_probs, value_max_probs 25 | 26 | 27 | def __print_top_k(value_max_probs, index_max_probs, vocab, mask_topk, index_list, candidates_obj, max_printouts = 10): 28 | result = [] 29 | msg = "\n| Top{} predictions\n".format(max_printouts) 30 | for i in range(mask_topk): 31 | idx_joined = [] 32 | word_form_joined = [] 33 | 34 | for n_mask in range(len(value_max_probs)): 35 | filtered_idx = index_max_probs[n_mask][i].item() 36 | 37 | if index_list is not None: 38 | # the softmax layer has been filtered using the vocab_subset 39 | # the original idx should be retrieved 40 | idx = index_list[filtered_idx] 41 | else: 42 | idx = filtered_idx 43 | 44 | log_prob = value_max_probs[n_mask][i].item() 45 | word_form = vocab[idx] 46 | 47 | word_form_joined.append(word_form) 48 | idx_joined.append(idx) 49 | if i < max_printouts: 50 | msg += "{:<8d}{:<20s}{:<12.3f}\n".format( 51 | i, 52 | word_form, 53 | log_prob 54 | ) 55 | element = {'i' : i, 'token_idx': idx_joined, 'log_prob': log_prob, 'token_word_form': word_form_joined} 56 | result.append(element) 57 | return result, msg 58 | 59 | def get_prediction(log_probs, masked_indices, vocab, label_index = None, index_list = None, topk = 1000, P_AT = 10, print_generation=True): 60 | 61 | experiment_result = {} 62 | 63 | # score only first mask 64 | masked_indices = masked_indices[:1] 65 | 66 | masked_index = masked_indices[0] 67 | log_probs = log_probs[masked_index] 68 | 69 | value_max_probs, index_max_probs = torch.topk(input=log_probs,k=topk,dim=0) 70 | index_max_probs = index_max_probs.numpy().astype(int) 71 | value_max_probs = value_max_probs.detach().numpy() 72 | 73 | result_masked_topk, return_msg = __print_top_k(value_max_probs, index_max_probs, vocab, topk, index_list) 74 | 75 | return result_masked_topk, return_msg 76 | 77 | 78 | def get_ranking(log_probs, sample, masked_indices, vocab, candidates, label_index = None, 
index_list = None, topk = 10, P_AT = 10, print_generation=True): 79 | experiment_result = {} 80 | dict_probs = {} 81 | return_msg = "" 82 | objects_true = sample["obj_label"] 83 | 84 | for i, num_masks in enumerate(candidates): 85 | if len(masked_indices) >1: 86 | masked_idx = masked_indices[i] 87 | else: 88 | masked_idx = [masked_indices[i]] 89 | predictions = log_probs[i][masked_idx] 90 | 91 | for object in candidates[num_masks]: 92 | probs = [] 93 | for id, prediction in zip(candidates[num_masks][object], predictions): 94 | #print(id) 95 | #print("pred", prediction) 96 | probs.append(prediction[id]) 97 | dict_probs[object] = np.mean(probs) 98 | object_keys = np.array(list(dict_probs.keys())) 99 | object_values = np.array(list(dict_probs.values())) 100 | 101 | idx_true = np.argwhere(objects_true == object_keys)[0][0] 102 | idcs = np.argsort(object_values) 103 | rank = len(object_values) - np.argwhere(idcs==idx_true)[0][0] 104 | 105 | experiment_result["rank"] = rank - 1 106 | experiment_result["prob_true"] = dict_probs[objects_true] 107 | experiment_result["predicted"] = object_keys[idcs] 108 | experiment_result["probs"] = object_values[idcs] 109 | 110 | return experiment_result, return_msg 111 | -------------------------------------------------------------------------------- /mlama/get_contextual_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | import lama.options as options 9 | 10 | def main(args): 11 | sentences = [ 12 | ["the cat is on the table ."], # single-sentence instance 13 | ["the dog is sleeping on the sofa .", "he makes happy noises ."], # two-sentence 14 | ] 15 | 16 | print("Language Models: {}".format(args.models_names)) 17 | 18 | models = {} 19 | for lm in args.models_names: 20 | models[lm] = build_model_by_name(lm, args) 21 | 22 | for model_name, model in models.items(): 23 | print("\n{}:".format(model_name)) 24 | if args.cuda: 25 | model.try_cuda() 26 | contextual_embeddings, sentence_lengths, tokenized_text_list = model.get_contextual_embeddings( 27 | sentences) 28 | 29 | # contextual_embeddings is a list of tensors, one tensor for each layer. 30 | # Each element contains one layer of the representations with shape 31 | # (x, y, z). 32 | # x - the batch size 33 | # y - the sequence length of the batch 34 | # z - the length of each layer vector 35 | 36 | print(f'Number of layers: {len(contextual_embeddings)}') 37 | for layer_id, layer in enumerate(contextual_embeddings): 38 | print(f'Layer {layer_id} has shape: {layer.shape}') 39 | 40 | print("sentence_lengths: {}".format(sentence_lengths)) 41 | print("tokenized_text_list: {}".format(tokenized_text_list)) 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = options.get_general_parser() 46 | parser.add_argument('--cuda', action='store_true', help='Try to run on GPU') 47 | args = options.parse_args(parser) 48 | main(args) 49 | -------------------------------------------------------------------------------- /mlama/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from .bert_connector import Bert 8 | 9 | def build_model_by_name(lm, args, verbose=True): 10 | """Load a model by name and args. 11 | 12 | Note, args.lm is not used for model selection. args are only passed to the 13 | model's initializer. 14 | """ 15 | MODEL_NAME_TO_CLASS = dict( 16 | bert=Bert 17 | ) 18 | if lm not in MODEL_NAME_TO_CLASS: 19 | raise ValueError("Unrecognized Language Model: %s." % lm) 20 | if verbose: 21 | print("Loading %s model..." % lm) 22 | return MODEL_NAME_TO_CLASS[lm](args) 23 | -------------------------------------------------------------------------------- /mlama/modules/base_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import re 8 | import torch 9 | 10 | MASK = "[MASK]" 11 | BERT_UNK = "[UNK]" 12 | BERT_CLS = "[CLS]" 13 | BERT_SEP = "[SEP]" 14 | BERT_PAD = "[PAD]" 15 | ELMO_UNK = "<UNK>" 16 | ELMO_START_SENTENCE = "<S>" 17 | ELMO_END_SENTENCE = "</S>" 18 | OPENAI_UNK = "<unk>" 19 | OPENAI_EOS = "<eos>" 20 | ROBERTA_MASK = "<mask>" 21 | ROBERTA_START_SENTENCE = "<s>" 22 | ROBERTA_END_SENTENCE = "</s>" 23 | ROBERTA_VOCAB_SIZE = 50266 24 | 25 | SPECIAL_SYMBOLS = [ 26 | MASK, 27 | BERT_UNK, 28 | BERT_CLS, 29 | BERT_SEP, 30 | BERT_PAD, 31 | ELMO_UNK, 32 | ELMO_START_SENTENCE, 33 | ELMO_END_SENTENCE, 34 | OPENAI_UNK, 35 | OPENAI_EOS 36 | ] 37 | 38 | SPACE_NORMALIZER = re.compile(r"\s+") 39 | 40 | 41 | def default_tokenizer(line): 42 | """Default tokenizer for models that don't have one 43 | 44 | Args: 45 | line: a string representing a sentence 46 | 47 | Returns: 48 | A list of tokens 49 | """ 50 | 51 | line = SPACE_NORMALIZER.sub(" ", line) 52 | line = line.strip() 53 | line = line.replace(MASK, " "+str(MASK)+" ") # make sure MASK is correctly split 54 | 55 | # fix tokenization for parentheses 56 | line = line.replace('(', " ( ") 57 | line = line.replace(')', " ) ") 58 | 59 | # fix tokenization for comma 60 | line = line.replace(',', " , ") 61 | 62 | # fix tokenization for -- (e.g., 1954--1988) 63 | line = line.replace('--', " -- ") 64 | 65 | result = line.split() 66 | return result 67 | 68 | 69 | class Base_Connector(): 70 | 71 | def __init__(self): 72 | 73 | # these variables should be initialized 74 | self.vocab = None 75 | 76 | # This defines the device the model is on. Changed by try_cuda.
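        # (The connector starts on CPU; try_cuda() below moves the model if a GPU is available.)
        # Subclasses are also expected to populate the vocabulary; the Bert connector in this
        # package, for example, does roughly:
        #   self.vocab = list(self.tokenizer.ids_to_tokens.values())
        #   self._init_inverse_vocab()
        # which is what init_indices_for_filter_logprobs() and filter_logprobs() rely on.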
77 | self._model_device = 'cpu' 78 | 79 | def optimize_top_layer(self, vocab_subset): 80 | """ 81 | optimization for some LM 82 | """ 83 | pass 84 | 85 | def _init_inverse_vocab(self): 86 | self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)} 87 | 88 | def try_cuda(self): 89 | """Move model to GPU if one is available.""" 90 | if torch.cuda.is_available(): 91 | if self._model_device != 'cuda': 92 | print('Moving model to CUDA') 93 | self._cuda() 94 | self._model_device = 'cuda' 95 | else: 96 | print('No CUDA found') 97 | 98 | def _cuda(self): 99 | """Move model to GPU.""" 100 | raise NotImplementedError 101 | 102 | def init_indices_for_filter_logprobs(self, vocab_subset, logger=None): 103 | index_list = [] 104 | new_vocab_subset = [] 105 | for word in vocab_subset: 106 | if word in self.inverse_vocab: 107 | inverse_id = self.inverse_vocab[word] 108 | index_list.append(inverse_id) 109 | new_vocab_subset.append(word) 110 | else: 111 | msg = "word {} from vocab_subset not in model vocabulary!".format(word) 112 | if logger is not None: 113 | logger.warning(msg) 114 | else: 115 | print("WARNING: {}".format(msg)) 116 | 117 | # 1. gather correct indices 118 | indices = torch.as_tensor(index_list) 119 | return indices, index_list 120 | 121 | def filter_logprobs(self, log_probs, indices): 122 | new_log_probs = log_probs.index_select(dim=2 , index=indices) 123 | return new_log_probs 124 | 125 | def get_id(self, string): 126 | raise NotImplementedError() 127 | 128 | def get_generation(self, sentences, logger=None): 129 | [log_probs], [token_ids], [masked_indices] = self.get_batch_generation( 130 | [sentences], logger=logger, try_cuda=False) 131 | return log_probs, token_ids, masked_indices 132 | 133 | def get_batch_generation(self, sentences_list, logger= None, try_cuda=True): 134 | raise NotImplementedError() 135 | 136 | def get_contextual_embeddings(self, sentences): 137 | """Compute the contextual embeddings of a list of sentences 138 | 139 | Parameters: 140 | sentences (list[list[string]]): list of elements. Each element is a list 141 | that contains either a single sentence 142 | or two sentences 143 | 144 | Returns: 145 | encoder_layers (list(Tensor)): a list of the full sequences of encoded-hidden-states 146 | at the end of each attention block (e.g., 12 full 147 | sequences for BERT-base,), each encoded-hidden-state 148 | is a torch.FloatTensor of size [batch_size, 149 | sequence_length, hidden_size] 150 | sentence_lengths (list[int]): list of lenghts for the sentences in the 151 | batch 152 | tokenized_text_list: (list[list[string]]): tokenized text for the sentences 153 | in the batch 154 | """ 155 | raise NotImplementedError() 156 | -------------------------------------------------------------------------------- /mlama/modules/bert_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import torch 8 | import pytorch_pretrained_bert.tokenization as btok 9 | from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM, BasicTokenizer, BertModel 10 | import numpy as np 11 | from mlama.modules.base_connector import * 12 | import torch.nn.functional as F 13 | 14 | 15 | class CustomBaseTokenizer(BasicTokenizer): 16 | 17 | def tokenize(self, text): 18 | """Tokenizes a piece of text.""" 19 | text = self._clean_text(text) 20 | # This was added on November 1st, 2018 for the multilingual and Chinese 21 | # models. This is also applied to the English models now, but it doesn't 22 | # matter since the English models were not trained on any Chinese data 23 | # and generally don't have any Chinese data in them (there are Chinese 24 | # characters in the vocabulary because Wikipedia does have some Chinese 25 | # words in the English Wikipedia.). 26 | text = self._tokenize_chinese_chars(text) 27 | orig_tokens = btok.whitespace_tokenize(text) 28 | split_tokens = [] 29 | for token in orig_tokens: 30 | 31 | # pass MASK forward 32 | if MASK in token: 33 | split_tokens.append(MASK) 34 | if token != MASK: 35 | remaining_chars = token.replace(MASK,"").strip() 36 | if remaining_chars: 37 | split_tokens.append(remaining_chars) 38 | continue 39 | 40 | if self.do_lower_case: 41 | token = token.lower() 42 | token = self._run_strip_accents(token) 43 | split_tokens.extend(self._run_split_on_punc(token)) 44 | 45 | output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) 46 | return output_tokens 47 | 48 | 49 | class Bert(Base_Connector): 50 | 51 | def __init__(self, args, vocab_subset = None): 52 | super().__init__() 53 | 54 | bert_model_name = args.bert_model_name 55 | dict_file = bert_model_name 56 | 57 | if args.bert_model_dir is not None: 58 | # load bert model from file 59 | bert_model_name = str(args.bert_model_dir) + "/" 60 | dict_file = bert_model_name+args.bert_vocab_name 61 | self.dict_file = dict_file 62 | print("loading BERT model from {}".format(bert_model_name)) 63 | else: 64 | # load bert model from huggingface cache 65 | pass 66 | 67 | # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer 68 | do_lower_case = False 69 | if 'uncased' in bert_model_name: 70 | do_lower_case=True 71 | #print(do_lower_case) 72 | # Load pre-trained model tokenizer (vocabulary) 73 | self.tokenizer = BertTokenizer.from_pretrained(dict_file) 74 | 75 | # original vocab 76 | self.map_indices = None 77 | self.vocab = list(self.tokenizer.ids_to_tokens.values()) 78 | self._init_inverse_vocab() 79 | 80 | # Add custom tokenizer to avoid splitting the ['MASK'] token 81 | custom_basic_tokenizer = CustomBaseTokenizer(do_lower_case = do_lower_case) 82 | self.tokenizer.basic_tokenizer = custom_basic_tokenizer 83 | 84 | # Load pre-trained model (weights) 85 | # ... to get prediction/generation 86 | self.masked_bert_model = BertForMaskedLM.from_pretrained(bert_model_name) 87 | 88 | self.masked_bert_model.eval() 89 | 90 | # ... 
to get hidden states 91 | self.bert_model = self.masked_bert_model.bert 92 | 93 | self.pad_id = self.inverse_vocab[BERT_PAD] 94 | 95 | self.unk_index = self.inverse_vocab[BERT_UNK] 96 | 97 | def get_id(self, string): 98 | tokenized_text = self.tokenizer.tokenize(string) 99 | indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) 100 | if self.map_indices is not None: 101 | # map indices to subset of the vocabulary 102 | indexed_string = self.convert_ids(indexed_string) 103 | 104 | return indexed_string 105 | 106 | def __get_input_tensors_batch(self, sentences_list): 107 | tokens_tensors_list = [] 108 | segments_tensors_list = [] 109 | masked_indices_list = [] 110 | tokenized_text_list = [] 111 | max_tokens = 0 112 | for sentences in sentences_list: 113 | tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) 114 | tokens_tensors_list.append(tokens_tensor) 115 | segments_tensors_list.append(segments_tensor) 116 | masked_indices_list.append(masked_indices) 117 | tokenized_text_list.append(tokenized_text) 118 | # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) 119 | if (tokens_tensor.shape[1] > max_tokens): 120 | max_tokens = tokens_tensor.shape[1] 121 | # print("MAX_TOKENS: {}".format(max_tokens)) 122 | # apply padding and concatenate tensors 123 | # use [PAD] for tokens and 0 for segments 124 | final_tokens_tensor = None 125 | final_segments_tensor = None 126 | final_attention_mask = None 127 | for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): 128 | dim_tensor = tokens_tensor.shape[1] 129 | pad_lenght = max_tokens - dim_tensor 130 | attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) 131 | if pad_lenght>0: 132 | pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) 133 | pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) 134 | attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) 135 | tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) 136 | segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) 137 | attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) 138 | if final_tokens_tensor is None: 139 | final_tokens_tensor = tokens_tensor 140 | final_segments_tensor = segments_tensor 141 | final_attention_mask = attention_tensor 142 | else: 143 | final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) 144 | final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0) 145 | final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) 146 | # print(final_tokens_tensor) 147 | # print(final_segments_tensor) 148 | # print(final_attention_mask) 149 | # print(final_tokens_tensor.shape) 150 | # print(final_segments_tensor.shape) 151 | # print(final_attention_mask.shape) 152 | return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list 153 | 154 | def __get_input_tensors(self, sentences): 155 | 156 | if len(sentences) > 2: 157 | print(sentences) 158 | raise ValueError("BERT accepts maximum two sentences in input for each data point") 159 | 160 | first_tokenized_sentence = self.tokenizer.tokenize(sentences[0]) 161 | first_segment_id = np.zeros(len(first_tokenized_sentence), dtype=int).tolist() 162 | 163 | # add [SEP] token at the end 164 | first_tokenized_sentence.append(BERT_SEP) 165 | first_segment_id.append(0) 166 | 167 | if len(sentences)>1 : 168 | second_tokenized_sentece = self.tokenizer.tokenize(sentences[1]) 
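            # Illustrative example (word pieces shortened): for the pair
            # ["the cat sat", "it [MASK] ."] this branch builds
            #   tokenized_text = [CLS] the cat sat [SEP] it [MASK] . [SEP]
            #   segments_ids   =   0    0   0   0    0   1    1    1   1
            # and masked_indices (collected below) holds the position of [MASK].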
169 | second_segment_id = np.full(len(second_tokenized_sentece),1, dtype=int).tolist() 170 | 171 | # add [SEP] token at the end 172 | second_tokenized_sentece.append(BERT_SEP) 173 | second_segment_id.append(1) 174 | 175 | tokenized_text = first_tokenized_sentence + second_tokenized_sentece 176 | segments_ids = first_segment_id + second_segment_id 177 | else: 178 | tokenized_text = first_tokenized_sentence 179 | segments_ids = first_segment_id 180 | 181 | # add [CLS] token at the beginning 182 | tokenized_text.insert(0,BERT_CLS) 183 | segments_ids.insert(0,0) 184 | 185 | # look for masked indices 186 | masked_indices = [] 187 | for i in range(len(tokenized_text)): 188 | token = tokenized_text[i] 189 | if token == MASK: 190 | masked_indices.append(i) 191 | 192 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 193 | 194 | # Convert inputs to PyTorch tensors 195 | tokens_tensor = torch.tensor([indexed_tokens]) 196 | segments_tensors = torch.tensor([segments_ids]) 197 | 198 | return tokens_tensor, segments_tensors, masked_indices, tokenized_text 199 | 200 | def __get_token_ids_from_tensor(self, indexed_string): 201 | token_ids = [] 202 | if self.map_indices is not None: 203 | # map indices to subset of the vocabulary 204 | indexed_string = self.convert_ids(indexed_string) 205 | token_ids = np.asarray(indexed_string) 206 | else: 207 | token_ids = indexed_string 208 | return token_ids 209 | 210 | def _cuda(self): 211 | self.masked_bert_model.cuda() 212 | 213 | def get_batch_generation(self, sentences_list, logger= None, 214 | try_cuda=True): 215 | #print("see") 216 | if not sentences_list: 217 | return None 218 | if try_cuda: 219 | self.try_cuda() 220 | 221 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 222 | 223 | if logger is not None: 224 | logger.debug("\n{}\n".format(tokenized_text_list)) 225 | 226 | with torch.no_grad(): 227 | logits = self.masked_bert_model( 228 | input_ids=tokens_tensor.to(self._model_device), 229 | token_type_ids=segments_tensor.to(self._model_device), 230 | attention_mask=attention_mask_tensor.to(self._model_device), 231 | ) 232 | 233 | log_probs = F.log_softmax(logits, dim=-1).cpu() 234 | #print(logits.shape) 235 | token_ids_list = [] 236 | for indexed_string in tokens_tensor.numpy(): 237 | token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) 238 | 239 | return log_probs, token_ids_list, masked_indices_list 240 | 241 | def get_contextual_embeddings(self, sentences_list, try_cuda=True): 242 | 243 | # assume in input 1 or 2 sentences - in general, it considers only the first 2 sentences 244 | if not sentences_list: 245 | return None 246 | if try_cuda: 247 | self.try_cuda() 248 | 249 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 250 | 251 | with torch.no_grad(): 252 | all_encoder_layers, _ = self.bert_model( 253 | tokens_tensor.to(self._model_device), 254 | segments_tensor.to(self._model_device)) 255 | 256 | all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] 257 | 258 | sentence_lengths = [len(x) for x in tokenized_text_list] 259 | 260 | # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end 261 | # of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each 262 | # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] 263 | return all_encoder_layers, sentence_lengths, tokenized_text_list 264 | -------------------------------------------------------------------------------- /mlama/modules/bert_connector_.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import torch 8 | import pytorch_pretrained_bert.tokenization as btok 9 | from pytorch_pretrained_bert.tokenization import BertTokenizer, BertForMaskedLM, BasicTokenizer, BertModel 10 | import numpy as np 11 | from lama.modules.base_connector import * 12 | import torch.nn.functional as F 13 | from transformers import AutoTokenizer, AutoModelWithLMHead 14 | 15 | class CustomBaseTokenizer(BasicTokenizer): 16 | 17 | def tokenize(self, text): 18 | """Tokenizes a piece of text.""" 19 | text = self._clean_text(text) 20 | # This was added on November 1st, 2018 for the multilingual and Chinese 21 | # models. This is also applied to the English models now, but it doesn't 22 | # matter since the English models were not trained on any Chinese data 23 | # and generally don't have any Chinese data in them (there are Chinese 24 | # characters in the vocabulary because Wikipedia does have some Chinese 25 | # words in the English Wikipedia.). 26 | text = self._tokenize_chinese_chars(text) 27 | orig_tokens = btok.whitespace_tokenize(text) 28 | split_tokens = [] 29 | for token in orig_tokens: 30 | 31 | # pass MASK forward 32 | if MASK in token: 33 | split_tokens.append(MASK) 34 | if token != MASK: 35 | remaining_chars = token.replace(MASK,"").strip() 36 | if remaining_chars: 37 | split_tokens.append(remaining_chars) 38 | continue 39 | 40 | if self.do_lower_case: 41 | token = token.lower() 42 | token = self._run_strip_accents(token) 43 | split_tokens.extend(self._run_split_on_punc(token)) 44 | 45 | output_tokens = btok.whitespace_tokenize(" ".join(split_tokens)) 46 | return output_tokens 47 | 48 | 49 | class Bert(Base_Connector): 50 | 51 | def __init__(self, args, vocab_subset = None): 52 | super().__init__() 53 | 54 | bert_model_name = args.bert_model_name 55 | dict_file = bert_model_name 56 | 57 | if args.bert_model_dir is not None: 58 | # load bert model from file 59 | bert_model_name = str(args.bert_model_dir) + "/" 60 | dict_file = bert_model_name+args.bert_vocab_name 61 | self.dict_file = dict_file 62 | print("loading BERT model from {}".format(bert_model_name)) 63 | else: 64 | # load bert model from huggingface cache 65 | dict_file = args.bert_model_name 66 | self.dict_file = dict_file 67 | # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer 68 | do_lower_case = False 69 | if 'uncased' in bert_model_name: 70 | do_lower_case=True 71 | print(do_lower_case) 72 | # Load pre-trained model tokenizer (vocabulary) 73 | self.tokenizer = BertTokenizer.from_pretrained(dict_file) 74 | #self.tokenizer = AutoTokenizer.from_pretrained(dict_file) 75 | # original vocab 76 | self.map_indices = None 77 | self.vocab = list(self.tokenizer.ids_to_tokens.values()) 78 | self._init_inverse_vocab() 79 | 80 | # Add custom tokenizer to avoid splitting the ['MASK'] token 81 | #custom_basic_tokenizer = CustomBaseTokenizer(do_lower_case = 
do_lower_case) 82 | #self.tokenizer.basic_tokenizer = custom_basic_tokenizer 83 | 84 | # Load pre-trained model (weights) 85 | # ... to get prediction/generation 86 | self.masked_bert_model = BertForMaskedLM.from_pretrained(bert_model_name) 87 | #self.masked_bert_model = AutoModelWithLMHead.from_pretrained(bert_model_name) 88 | self.masked_bert_model.eval() 89 | 90 | # ... to get hidden states 91 | self.bert_model = self.masked_bert_model.bert 92 | 93 | self.pad_id = self.inverse_vocab[BERT_PAD] 94 | 95 | self.unk_index = self.inverse_vocab[BERT_UNK] 96 | 97 | def get_id(self, string): 98 | tokenized_text = self.tokenizer.tokenize(string) 99 | indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) 100 | if self.map_indices is not None: 101 | # map indices to subset of the vocabulary 102 | indexed_string = self.convert_ids(indexed_string) 103 | 104 | return indexed_string 105 | 106 | def __get_input_tensors_batch(self, sentences_list): 107 | tokens_tensors_list = [] 108 | segments_tensors_list = [] 109 | masked_indices_list = [] 110 | tokenized_text_list = [] 111 | max_tokens = 0 112 | for sentences in sentences_list: 113 | tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) 114 | tokens_tensors_list.append(tokens_tensor) 115 | segments_tensors_list.append(segments_tensor) 116 | masked_indices_list.append(masked_indices) 117 | tokenized_text_list.append(tokenized_text) 118 | # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) 119 | if (tokens_tensor.shape[1] > max_tokens): 120 | max_tokens = tokens_tensor.shape[1] 121 | # print("MAX_TOKENS: {}".format(max_tokens)) 122 | # apply padding and concatenate tensors 123 | # use [PAD] for tokens and 0 for segments 124 | final_tokens_tensor = None 125 | final_segments_tensor = None 126 | final_attention_mask = None 127 | for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): 128 | dim_tensor = tokens_tensor.shape[1] 129 | pad_lenght = max_tokens - dim_tensor 130 | attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) 131 | if pad_lenght>0: 132 | pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) 133 | pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) 134 | attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) 135 | tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) 136 | segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) 137 | attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) 138 | if final_tokens_tensor is None: 139 | final_tokens_tensor = tokens_tensor 140 | final_segments_tensor = segments_tensor 141 | final_attention_mask = attention_tensor 142 | else: 143 | final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) 144 | final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0) 145 | final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) 146 | # print(final_tokens_tensor) 147 | # print(final_segments_tensor) 148 | # print(final_attention_mask) 149 | # print(final_tokens_tensor.shape) 150 | # print(final_segments_tensor.shape) 151 | # print(final_attention_mask.shape) 152 | return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list 153 | 154 | def __get_input_tensors(self, sentences): 155 | 156 | if len(sentences) > 2: 157 | print(sentences) 158 | raise ValueError("BERT accepts maximum two sentences in input for each data point") 159 | 160 | 
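        # As in bert_connector.py: the sentences are WordPiece-tokenized, wrapped in
        # [CLS] ... [SEP] (with a second segment for sentence pairs), and the method
        # returns (tokens_tensor, segments_tensors, masked_indices, tokenized_text),
        # where masked_indices lists the positions of [MASK] in tokenized_text.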
first_tokenized_sentence = self.tokenizer.tokenize(sentences[0]) 161 | first_segment_id = np.zeros(len(first_tokenized_sentence), dtype=int).tolist() 162 | 163 | # add [SEP] token at the end 164 | first_tokenized_sentence.append(BERT_SEP) 165 | first_segment_id.append(0) 166 | 167 | if len(sentences) > 1: 168 | second_tokenized_sentence = self.tokenizer.tokenize(sentences[1]) 169 | second_segment_id = np.full(len(second_tokenized_sentence),1, dtype=int).tolist() 170 | 171 | # add [SEP] token at the end 172 | second_tokenized_sentence.append(BERT_SEP) 173 | second_segment_id.append(1) 174 | 175 | tokenized_text = first_tokenized_sentence + second_tokenized_sentence 176 | segments_ids = first_segment_id + second_segment_id 177 | else: 178 | tokenized_text = first_tokenized_sentence 179 | segments_ids = first_segment_id 180 | 181 | # add [CLS] token at the beginning 182 | tokenized_text.insert(0,BERT_CLS) 183 | segments_ids.insert(0,0) 184 | 185 | # look for masked indices 186 | masked_indices = [] 187 | for i in range(len(tokenized_text)): 188 | token = tokenized_text[i] 189 | if token == MASK: 190 | masked_indices.append(i) 191 | 192 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 193 | 194 | # Convert inputs to PyTorch tensors 195 | tokens_tensor = torch.tensor([indexed_tokens]) 196 | segments_tensors = torch.tensor([segments_ids]) 197 | 198 | return tokens_tensor, segments_tensors, masked_indices, tokenized_text 199 | 200 | def __get_token_ids_from_tensor(self, indexed_string): 201 | token_ids = [] 202 | if self.map_indices is not None: 203 | # map indices to subset of the vocabulary 204 | indexed_string = self.convert_ids(indexed_string) 205 | token_ids = np.asarray(indexed_string) 206 | else: 207 | token_ids = indexed_string 208 | return token_ids 209 | 210 | def _cuda(self): 211 | self.masked_bert_model.cuda() 212 | 213 | def get_batch_generation(self, sentences_list, logger= None, 214 | try_cuda=True): 215 | if not sentences_list: 216 | return None 217 | if try_cuda: 218 | self.try_cuda() 219 | 220 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 221 | 222 | if logger is not None: 223 | logger.debug("\n{}\n".format(tokenized_text_list)) 224 | 225 | with torch.no_grad(): 226 | logits = self.masked_bert_model( 227 | input_ids=tokens_tensor.to(self._model_device), 228 | token_type_ids=segments_tensor.to(self._model_device), 229 | attention_mask=attention_mask_tensor.to(self._model_device), 230 | ) 231 | 232 | log_probs = F.log_softmax(logits, dim=-1).cpu() 233 | token_ids_list = [] 234 | for indexed_string in tokens_tensor.numpy(): 235 | token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) 236 | 237 | return log_probs, token_ids_list, masked_indices_list 238 | 239 | def get_contextual_embeddings(self, sentences_list, try_cuda=True): 240 | 241 | # assumes 1 or 2 sentences as input - in general, only the first 2 sentences are considered 242 | if not sentences_list: 243 | return None 244 | if try_cuda: 245 | self.try_cuda() 246 | 247 | tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) 248 | 249 | with torch.no_grad(): 250 | all_encoder_layers, _ = self.bert_model( 251 | tokens_tensor.to(self._model_device), 252 | segments_tensor.to(self._model_device)) 253 | 254 | all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] 255 | 256 | sentence_lengths = 
[len(x) for x in tokenized_text_list] 257 | 258 | # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end 259 | # of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each 260 | # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] 261 | return all_encoder_layers, sentence_lengths, tokenized_text_list 262 | -------------------------------------------------------------------------------- /mlama/options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import argparse 8 | 9 | 10 | def get_general_parser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument( 13 | "--language-models", 14 | "--lm", 15 | dest="models", 16 | help="comma separated list of language models", 17 | required=True, 18 | ) 19 | parser.add_argument( 20 | "--spacy_model", 21 | "--sm", 22 | dest="spacy_model", 23 | default="en_core_web_sm", 24 | help="spacy model file path", 25 | ) 26 | parser.add_argument( 27 | "--common-vocab-filename", 28 | "--cvf", 29 | dest="common_vocab_filename", 30 | help="common vocabulary filename", 31 | ) 32 | parser.add_argument( 33 | "--interactive", 34 | "--i", 35 | dest="interactive", 36 | action="store_true", 37 | help="perform the evaluation interactively", 38 | ) 39 | parser.add_argument( 40 | "--max-sentence-length", 41 | dest="max_sentence_length", 42 | type=int, 43 | default=100, 44 | help="max sentence length", 45 | ) 46 | __add_bert_args(parser) 47 | __add_elmo_args(parser) 48 | __add_gpt_args(parser) 49 | __add_transformerxl_args(parser) 50 | __add_roberta_args(parser) 51 | return parser 52 | 53 | 54 | def get_eval_generation_parser(): 55 | parser = get_general_parser() 56 | parser.add_argument( 57 | "--text", "--t", dest="text", help="text to compute the generation for" 58 | ) 59 | parser.add_argument( 60 | "--split_sentence", 61 | dest="split_sentence", 62 | action="store_true", 63 | help="split the input text into sentences", 64 | ) 65 | return parser 66 | 67 | 68 | def get_eval_KB_completion_parser(): 69 | parser = get_general_parser() 70 | parser.add_argument( 71 | "--dataset-filename", 72 | "--df", 73 | dest="dataset_filename", 74 | help="filename containing dataset", 75 | ) 76 | parser.add_argument( 77 | "--logdir", 78 | dest="logdir", 79 | default="../experiments_logs/", 80 | help="logging directory", 81 | ) 82 | parser.add_argument( 83 | "--full-logdir", 84 | help="Full path to the logging folder. 
If set, will override --logdir.", 85 | ) 86 | parser.add_argument( 87 | "--template", dest="template", default="", help="template for surface relation" 88 | ) 89 | parser.add_argument( 90 | "--batch-size", dest="batch_size", type=int, default=32, help="batch size" 91 | ) 92 | parser.add_argument( 93 | "--lowercase", 94 | "--lower", 95 | dest="lowercase", 96 | action="store_true", 97 | help="perform the evaluation using lowercase text", 98 | ) 99 | parser.add_argument( 100 | "--threads", 101 | dest="threads", 102 | type=int, 103 | default=-1, 104 | help="number of threads for evaluation metrics computation (default: all available)", 105 | ) 106 | return parser 107 | 108 | 109 | def __add_bert_args(parser): 110 | group = parser.add_argument_group("BERT") 111 | group.add_argument( 112 | "--bert-model-dir", 113 | "--bmd", 114 | dest="bert_model_dir", 115 | help="directory that contains the BERT pre-trained model and the vocabulary", 116 | ) 117 | group.add_argument( 118 | "--bert-model-name", 119 | "--bmn", 120 | dest="bert_model_name", 121 | default="bert-base-cased", 122 | help="name of the BERT pre-trained model (default = 'bert-base-cased')", 123 | ) 124 | group.add_argument( 125 | "--bert-vocab-name", 126 | "--bvn", 127 | dest="bert_vocab_name", 128 | default="vocab.txt", 129 | help="name of vocabulary used to pre-train the BERT model (default = 'vocab.txt')", 130 | ) 131 | return group 132 | 133 | 134 | def __add_roberta_args(parser): 135 | group = parser.add_argument_group("RoBERTa") 136 | group.add_argument( 137 | "--roberta-model-dir", 138 | "--rmd", 139 | dest="roberta_model_dir", 140 | help="directory that contains the ROBERTA pre-trained model and the vocabulary", 141 | ) 142 | group.add_argument( 143 | "--roberta-model-name", 144 | "--rmn", 145 | dest="roberta_model_name", 146 | default="model.pt", 147 | help="name of the ROBERTA pre-trained model (default = 'model.pt')", 148 | ) 149 | group.add_argument( 150 | "--roberta-vocab-name", 151 | "--rvn", 152 | dest="roberta_vocab_name", 153 | default="dict.txt", 154 | help="name of vocabulary used to pre-train the ROBERTA model (default = 'dict.txt')", 155 | ) 156 | return group 157 | 158 | 159 | def __add_gpt_args(parser): 160 | group = parser.add_argument_group("GPT") 161 | group.add_argument( 162 | "--gpt-model-dir", 163 | "--gmd", 164 | dest="gpt_model_dir", 165 | help="directory that contains the gpt pre-trained model and the vocabulary", 166 | ) 167 | group.add_argument( 168 | "--gpt-model-name", 169 | "--gmn", 170 | dest="gpt_model_name", 171 | default="openai-gpt", 172 | help="name of the gpt pre-trained model (default = 'openai-gpt')", 173 | ) 174 | return group 175 | 176 | 177 | def __add_transformerxl_args(parser): 178 | group = parser.add_argument_group("Transformer-XL") 179 | group.add_argument( 180 | "--transformerxl-model-dir", 181 | "--tmd", 182 | help="directory that contains the pre-trained model and the vocabulary", 183 | ) 184 | group.add_argument( 185 | "--transformerxl-model-name", 186 | "--tmn", 187 | default="transfo-xl-wt103", 188 | help="name of the pre-trained model (default = 'transfo-xl-wt103')", 189 | ) 190 | return group 191 | 192 | 193 | def __add_elmo_args(parser): 194 | group = parser.add_argument_group("ELMo") 195 | group.add_argument( 196 | "--elmo-model-dir", 197 | "--emd", 198 | dest="elmo_model_dir", 199 | help="directory that contains the ELMo pre-trained model and the vocabulary", 200 | ) 201 | group.add_argument( 202 | "--elmo-model-name", 203 | "--emn", 204 | dest="elmo_model_name", 205 | 
default="elmo_2x4096_512_2048cnn_2xhighway", 206 | help="name of the ELMo pre-trained model (default = 'elmo_2x4096_512_2048cnn_2xhighway')", 207 | ) 208 | group.add_argument( 209 | "--elmo-vocab-name", 210 | "--evn", 211 | dest="elmo_vocab_name", 212 | default="vocab-2016-09-10.txt", 213 | help="name of vocabulary used to pre-train the ELMo model (default = 'vocab-2016-09-10.txt')", 214 | ) 215 | group.add_argument( 216 | "--elmo-warm-up-cycles", 217 | dest="elmo_warm_up_cycles", 218 | type=int, 219 | default=5, 220 | help="ELMo warm up cycles", 221 | ) 222 | return group 223 | 224 | 225 | def parse_args(parser): 226 | args = parser.parse_args() 227 | args.models_names = [x.strip().lower() for x in args.models.split(",")] 228 | if "fconv" in args.models_names: 229 | if args.data is None: 230 | raise ValueError( 231 | "to use fconv you should specify the directory that contains " 232 | "the pre-trained model and the vocabulary with the option --fconv-model-dir/--fmd\n" 233 | "you can also specify the fconv model name with the option --fconv-model-name/--fmn (default = 'wiki103.pt')\n" 234 | "the vocabulary should be in the provided fconv-model-dir and be named dict.txt" 235 | ) 236 | if "bert" in args.models_names: 237 | # use the default shortcut name of a Google AI's pre-trained model (default = 'bert-base-cased') 238 | pass 239 | if "elmo" in args.models_names: 240 | if args.elmo_model_dir is None: 241 | raise ValueError( 242 | "to use elmo you should specify the directory that contains " 243 | "the pre-trained model and the vocabulary with the option --elmo-model-dir/--emd\n" 244 | "you can also specify the elmo model name with the option --elmo-model-name/--emn (default = 'elmo_2x4096_512_2048cnn_2xhighway')\n" 245 | "and the elmo vocabulary name with the option --elmo-vocab-name/--evn (default = 'vocab-2016-09-10.txt')" 246 | ) 247 | 248 | return args 249 | -------------------------------------------------------------------------------- /mlama/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import torch 8 | from colorama import init 9 | from termcolor import colored 10 | import numpy as np 11 | import mlama.modules.base_connector as base 12 | 13 | 14 | def __exclude_tokens(token_ids, vocab): 15 | indices_to_exclude = [] 16 | for i, tok in enumerate(token_ids): 17 | word_form = vocab[tok] 18 | if (word_form in base.SPECIAL_SYMBOLS): 19 | indices_to_exclude.append(i) 20 | return indices_to_exclude 21 | 22 | 23 | def __print_generation(positional_scores, token_ids, vocab, rank_dict, 24 | index_max_probs, value_max_probs, topk, 25 | indices_to_exclude, masked_indices, print_on_console): 26 | init() # colorful output 27 | msg = "" 28 | dash = '-' * 82 29 | msg += dash + "\n" 30 | msg += '{:<8s}{:<20s}{:<12s}{:<20}{:<12s}{:<12s}'.format( 31 | "index", "token", "log_prob", "prediction", 32 | "log_prob", "rank@{}".format(topk)) 33 | msg += "\n" + dash 34 | if print_on_console: 35 | print(msg) 36 | msg += '\n' 37 | 38 | for idx, tok in enumerate(token_ids): 39 | 40 | word_form = vocab[tok] 41 | 42 | rank = -1 43 | if idx in rank_dict: 44 | rank = rank_dict[idx] 45 | index_max_prob = index_max_probs[idx] 46 | 47 | predicted_token_id = index_max_prob[0] 48 | 49 | value_max_prob = value_max_probs[idx] 50 | string_to_print = '{:<8d}{:<20s}{:<12.3f}{:<20s}{:<12.3f}{:<12d}'.format( 51 | idx, 52 | str(word_form), 53 | positional_scores[idx], 54 | str(vocab[predicted_token_id]), 55 | value_max_prob[0], 56 | rank 57 | ) 58 | 59 | if print_on_console: 60 | if masked_indices is not None and idx in masked_indices: 61 | print(colored(string_to_print, 'grey', 'on_yellow')) 62 | elif indices_to_exclude is not None and idx in indices_to_exclude: 63 | print(colored(string_to_print, 'grey', 'on_grey')) 64 | else: 65 | print(string_to_print) 66 | msg += string_to_print + "\n" 67 | 68 | return msg 69 | 70 | 71 | def __get_topk(log_probs, topk): 72 | value_max_probs, index_max_probs = torch.topk(input=log_probs, k=topk, dim=1) 73 | index_max_probs = index_max_probs.numpy() 74 | value_max_probs = value_max_probs.detach().numpy() 75 | return value_max_probs, index_max_probs 76 | 77 | 78 | def print_sentence_predictions(log_probs, token_ids, vocab, 79 | masked_indices=None, print_generation=True, 80 | topk=1000): 81 | 82 | msg = "\n" 83 | log_probs = log_probs[:len(token_ids)] 84 | value_max_probs, index_max_probs = __get_topk(log_probs, topk) 85 | 86 | # remove special symbols from token_ids 87 | excluded_indices = __exclude_tokens([t for t in token_ids], vocab) 88 | 89 | # score only first mask 90 | #masked_indices = masked_indices[:1] 91 | 92 | tokens = torch.from_numpy(np.asarray(token_ids)) 93 | 94 | # get ranking position in topk 95 | query = tokens.squeeze().data.unsqueeze(-1) 96 | query = query.repeat(1, topk) 97 | 98 | ranking_position = (index_max_probs == query.numpy()).nonzero() 99 | 100 | rank_dict = dict(zip(*ranking_position)) 101 | 102 | # get positional score of the correct token 103 | token_probs = log_probs.gather( 104 | dim=1, 105 | index=tokens.view(-1, 1), 106 | ) 107 | positional_scores = token_probs.squeeze(-1).detach().numpy() 108 | 109 | score_sum = 0. 
110 | count = 0 111 | for idx, score in enumerate(positional_scores): 112 | if idx not in excluded_indices: 113 | score_sum += score 114 | count += 1 115 | 116 | if count > 0: 117 | avg_nll_loss = - (score_sum / count) 118 | else: 119 | avg_nll_loss = 0.0 120 | perplexity = np.exp(avg_nll_loss) 121 | 122 | # print("positional_scores: {}".format(positional_scores)) 123 | # print("avg_nll_loss: {}".format(avg_nll_loss)) 124 | 125 | __print_generation(positional_scores, token_ids, vocab, rank_dict, 126 | index_max_probs, value_max_probs, topk, 127 | excluded_indices, masked_indices, print_generation) 128 | 129 | # msg += return_msg 130 | msg += '| Perplexity: {:.3f}\n'.format(perplexity) 131 | 132 | if print_generation: 133 | print("\n"+msg+"\n") 134 | 135 | return perplexity, msg 136 | 137 | 138 | def load_vocab(vocab_filename): 139 | with open(vocab_filename, "r") as f: 140 | lines = f.readlines() 141 | vocab = [x.strip() for x in lines] 142 | return vocab 143 | -------------------------------------------------------------------------------- /mlama/vocab_intersection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from lama.modules import build_model_by_name 8 | from tqdm import tqdm 9 | import argparse 10 | import spacy 11 | import lama.modules.base_connector as base 12 | 13 | 14 | CASED_MODELS = [ 15 | # { 16 | # # "FAIRSEQ WIKI103" 17 | # "lm": "fairseq", 18 | # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", 19 | # "fairseq_model_name": "wiki103.pt", 20 | # "task": "language_modeling", 21 | # "cpu": True, 22 | # "output_dictionary_size": -1 23 | # }, 24 | { 25 | # "TransformerXL" 26 | "lm": "transformerxl", 27 | "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", 28 | }, 29 | { 30 | # "ELMO ORIGINAL" 31 | "lm": "elmo", 32 | "elmo_model_dir": "pre-trained_language_models/elmo/original", 33 | "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", 34 | "elmo_vocab_name": "vocab-2016-09-10.txt", 35 | "elmo_warm_up_cycles": 5 36 | }, 37 | { 38 | # "ELMO ORIGINAL 5.5B" 39 | "lm": "elmo", 40 | "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", 41 | "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", 42 | "elmo_vocab_name": "vocab-enwiki-news-500000.txt", 43 | "elmo_warm_up_cycles": 5 44 | }, 45 | { 46 | # "BERT BASE CASED" 47 | "lm": "bert", 48 | "bert_model_name": "bert-base-cased", 49 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12/", 50 | "bert_vocab_name": "vocab.txt" 51 | }, 52 | { 53 | # "BERT LARGE CASED" 54 | "lm" : "bert", 55 | "bert_model_name": "bert-large-cased", 56 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/", 57 | "bert_vocab_name": "vocab.txt" 58 | } 59 | ] 60 | 61 | CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt" 62 | 63 | LOWERCASED_MODELS = [ 64 | { 65 | # "BERT BASE UNCASED" 66 | "lm": "bert", 67 | "bert_model_name": "bert-base-uncased", 68 | "bert_model_dir": None, 69 | "bert_vocab_name": "vocab.txt" 70 | }, 71 | { 72 | # "BERT LARGE UNCASED" 73 | "lm": "bert", 74 | "bert_model_name": "bert-large-uncased", 75 | "bert_model_dir": None, 76 | "bert_vocab_name": "vocab.txt" 77 | }, 78 | { 79 | # "OpenAI GPT" 80 | "lm": "gpt", 81 | 
"gpt_model_dir": None, 82 | "gpt_model_name": "openai-gpt" 83 | } 84 | ] 85 | 86 | LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt" 87 | 88 | 89 | def __vocab_intersection(models, filename): 90 | 91 | vocabularies = [] 92 | 93 | for arg_dict in models: 94 | 95 | args = argparse.Namespace(**arg_dict) 96 | print(args) 97 | model = build_model_by_name(args.lm, args) 98 | 99 | vocabularies.append(model.vocab) 100 | print(type(model.vocab)) 101 | 102 | if len(vocabularies) > 0: 103 | common_vocab = set(vocabularies[0]) 104 | for vocab in vocabularies: 105 | common_vocab = common_vocab.intersection(set(vocab)) 106 | 107 | # no special symbols in common_vocab 108 | for symbol in base.SPECIAL_SYMBOLS: 109 | if symbol in common_vocab: 110 | common_vocab.remove(symbol) 111 | 112 | # remove stop words 113 | from spacy.lang.en.stop_words import STOP_WORDS 114 | for stop_word in STOP_WORDS: 115 | if stop_word in common_vocab: 116 | print(stop_word) 117 | common_vocab.remove(stop_word) 118 | 119 | common_vocab = list(common_vocab) 120 | 121 | # remove punctuation and symbols 122 | nlp = spacy.load('en') 123 | manual_punctuation = ['(', ')', '.', ','] 124 | new_common_vocab = [] 125 | for i in tqdm(range(len(common_vocab))): 126 | word = common_vocab[i] 127 | doc = nlp(word) 128 | token = doc[0] 129 | if(len(doc) != 1): 130 | print(word) 131 | for idx, tok in enumerate(doc): 132 | print("{} - {}".format(idx, tok)) 133 | elif word in manual_punctuation: 134 | pass 135 | elif token.pos_ == "PUNCT": 136 | print("PUNCT: {}".format(word)) 137 | elif token.pos_ == "SYM": 138 | print("SYM: {}".format(word)) 139 | else: 140 | new_common_vocab.append(word) 141 | # print("{} - {}".format(word, token.pos_)) 142 | common_vocab = new_common_vocab 143 | 144 | # store common_vocab on file 145 | with open(filename, 'w') as f: 146 | for item in sorted(common_vocab): 147 | f.write("{}\n".format(item)) 148 | 149 | 150 | def main(): 151 | # cased version 152 | __vocab_intersection(CASED_MODELS, CASED_COMMON_VOCAB_FILENAME) 153 | # lowercased version 154 | __vocab_intersection(LOWERCASED_MODELS, LOWERCASED_COMMON_VOCAB_FILENAME) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.2 2 | numpy==1.15.1 3 | torch==1.0.1 4 | pytorch-pretrained-bert==0.6.1 5 | allennlp==0.8.5 6 | spacy==2.1.8 7 | tqdm==4.26.0 8 | termcolor==1.1.0 9 | pandas==0.23.4 10 | fairseq==0.8.0 11 | colorama==0.4.1 12 | scipy==1.3.2 13 | -------------------------------------------------------------------------------- /scripts/batch_eval_KB_completion_mBERT_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | from mlama.modules import build_model_by_name 8 | import mlama.utils as utils 9 | from mlama.utils import print_sentence_predictions, load_vocab 10 | import mlama.options as options 11 | from tqdm import tqdm 12 | from random import shuffle 13 | import os 14 | import json 15 | import spacy 16 | import mlama.modules.base_connector as base 17 | from pprint import pprint 18 | import logging.config 19 | import logging 20 | import pickle 21 | from multiprocessing.pool import ThreadPool 22 | import multiprocessing 23 | import mlama.evaluation_metrics_ranked as metrics 24 | import time, sys 25 | import torch 26 | import numpy as np 27 | 28 | def load_file(filename): 29 | data = [] 30 | with open(filename, "r") as f: 31 | for line in f.readlines(): 32 | data.append(json.loads(line)) 33 | return data 34 | 35 | 36 | def create_logdir_with_timestamp(base_logdir, modelname): 37 | timestr = time.strftime("%Y%m%d_%H%M%S") 38 | 39 | # create new directory 40 | log_directory = "{}/{}_{}/".format(base_logdir, modelname, timestr) 41 | os.makedirs(log_directory) 42 | 43 | path = "{}/last".format(base_logdir) 44 | try: 45 | os.unlink(path) 46 | except Exception: 47 | pass 48 | os.symlink(log_directory, path) 49 | return log_directory 50 | 51 | 52 | def parse_template(template, subject_label, object_label): 53 | SUBJ_SYMBOL = "[X]" 54 | OBJ_SYMBOL = "[Y]" 55 | template = template.replace(SUBJ_SYMBOL, subject_label) 56 | template = template.replace(OBJ_SYMBOL, object_label) 57 | return [template] 58 | 59 | 60 | def init_logging(log_directory): 61 | logger = logging.getLogger("LAMA") 62 | logger.setLevel(logging.DEBUG) 63 | 64 | os.makedirs(log_directory, exist_ok=True) 65 | 66 | # logging format 67 | # "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 68 | formatter = logging.Formatter( 69 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 70 | ) 71 | 72 | # file handler 73 | fh = logging.FileHandler(str(log_directory) + "/info.log") 74 | fh.setLevel(logging.DEBUG) 75 | fh.setFormatter(formatter) 76 | 77 | # console handler 78 | ch = logging.StreamHandler(sys.stdout) 79 | ch.setLevel(logging.WARNING) 80 | ch.setFormatter(formatter) 81 | 82 | logger.addHandler(fh) 83 | logger.addHandler(ch) 84 | 85 | logger.propagate = False 86 | 87 | return logger 88 | 89 | 90 | def batchify(data, batch_size): 91 | msg = "" 92 | list_samples_batches = [] 93 | list_sentences_batches = [] 94 | current_samples_batch = [] 95 | current_sentences_batches = [] 96 | c = 0 97 | 98 | # sort to group together sentences with similar length 99 | for sample in sorted( 100 | data, key=lambda k: len(" ".join(k["masked_sentences"]).split()) 101 | ): 102 | masked_sentences = sample["masked_sentences"] 103 | current_samples_batch.append(sample) 104 | current_sentences_batches.append(masked_sentences) 105 | c += 1 106 | if c >= batch_size: 107 | list_samples_batches.append(current_samples_batch) 108 | list_sentences_batches.append(current_sentences_batches) 109 | current_samples_batch = [] 110 | current_sentences_batches = [] 111 | c = 0 112 | 113 | # last batch 114 | if current_samples_batch and len(current_samples_batch) > 0: 115 | list_samples_batches.append(current_samples_batch) 116 | list_sentences_batches.append(current_sentences_batches) 117 | 118 | return list_samples_batches, list_sentences_batches, msg 119 | 120 | 121 | def run_thread(arguments): 122 | 123 | msg = "" 124 | 125 | # 1. 
compute the ranking metrics on the filtered log_probs tensor 126 | experiment_result, return_msg = metrics.get_ranking( 127 | arguments["filtered_log_probs"], 128 | arguments["sample"], 129 | arguments["masked_indices"], 130 | arguments["vocab"], 131 | arguments["candidates"], 132 | label_index=arguments["label_index"], 133 | index_list=arguments["index_list"], 134 | print_generation=arguments["interactive"], 135 | topk=10, 136 | ) 137 | msg += "\n" + return_msg 138 | 139 | return experiment_result, msg 140 | 141 | 142 | def lowercase_samples(samples, use_negated_probes=False): 143 | new_samples = [] 144 | for sample in samples: 145 | sample["obj_label"] = sample["obj_label"].lower() 146 | sample["sub_label"] = sample["sub_label"].lower() 147 | lower_masked_sentences = [] 148 | for sentence in sample["masked_sentences"]: 149 | sentence = sentence.lower() 150 | sentence = sentence.replace(base.MASK.lower(), base.MASK) 151 | lower_masked_sentences.append(sentence) 152 | sample["masked_sentences"] = lower_masked_sentences 153 | 154 | new_samples.append(sample) 155 | return new_samples 156 | 157 | 158 | def filter_samples(model, samples, vocab_subset, max_sentence_length, template): 159 | msg = "" 160 | new_samples = [] 161 | samples_excluded = 0 162 | for sample in samples: 163 | excluded = False 164 | if "obj_label" in sample and "sub_label" in sample: 165 | 166 | obj_label_ids = model.get_id(sample["obj_label"]) 167 | 168 | if obj_label_ids: 169 | reconstructed_word = " ".join( 170 | [model.vocab[x] for x in obj_label_ids] 171 | ).strip() 172 | else: 173 | reconstructed_word = None 174 | 175 | excluded = False 176 | if not template or len(template) == 0: 177 | masked_sentences = sample["masked_sentences"] 178 | text = " ".join(masked_sentences) 179 | if len(text.split()) > max_sentence_length: 180 | msg += "\tEXCLUDED for exceeding max sentence length: {}\n".format( 181 | masked_sentences 182 | ) 183 | samples_excluded += 1 184 | excluded = True 185 | """if sample['from_english']: 186 | msg += "\tEXCLUDED not in language \n" 187 | excluded = True 188 | samples_excluded += 1""" 189 | # MAKE SURE THAT obj_label IS IN VOCABULARIES 190 | if vocab_subset: 191 | for x in sample["obj_label"].split(" "): 192 | if x not in vocab_subset: 193 | excluded = True 194 | msg += "\tEXCLUDED object label {} not in vocab subset\n".format( 195 | sample["obj_label"] 196 | ) 197 | samples_excluded += 1 198 | break 199 | if excluded: 200 | pass 201 | elif obj_label_ids is None: 202 | msg += "\tEXCLUDED object label {} is None\n".format( 203 | sample["obj_label"] 204 | ) 205 | samples_excluded += 1 206 | 207 | # samples_excluded+=1 208 | elif "judgments" in sample: 209 | # only for Google-RE 210 | num_no = 0 211 | num_yes = 0 212 | for x in sample["judgments"]: 213 | if x["judgment"] == "yes": 214 | num_yes += 1 215 | else: 216 | num_no += 1 217 | if num_no > num_yes: 218 | # SKIP NEGATIVE EVIDENCE 219 | pass 220 | else: 221 | new_samples.append(sample) 222 | else: 223 | new_samples.append(sample) 224 | else: 225 | msg += "\tEXCLUDED since 'obj_label' or 'sub_label' not in sample: {}\n".format( 226 | sample 227 | ) 228 | samples_excluded += 1 229 | msg += "samples excluded: {}\n".format(samples_excluded) 230 | return new_samples, msg 231 | 232 | 233 | def main(args, NUM_MASK, candidates, shuffle_data=True, model=None): 234 | 235 | if len(args.models_names) > 1: 236 | raise ValueError('Please specify a single language model (e.g., --lm "bert").') 237 | 238 | msg = "" 239 | 240 | [model_type_name] = 
args.models_names 241 | 242 | if model is None: 243 | model = build_model_by_name(model_type_name, args) 244 | 245 | if model_type_name == "fairseq": 246 | model_name = "fairseq_{}".format(args.fairseq_model_name) 247 | elif model_type_name == "bert": 248 | model_name = "BERT_{}".format(args.bert_model_name) 249 | elif model_type_name == "elmo": 250 | model_name = "ELMo_{}".format(args.elmo_model_name) 251 | else: 252 | model_name = model_type_name.title() 253 | 254 | # initialize logging 255 | if args.full_logdir: 256 | log_directory = args.full_logdir 257 | else: 258 | log_directory = create_logdir_with_timestamp(args.logdir, model_name) 259 | logger = init_logging(log_directory) 260 | msg += "model name: {}\n".format(model_name) 261 | 262 | # deal with vocab subset 263 | vocab_subset = None 264 | index_list = None 265 | msg += "args: {}\n".format(args) 266 | if args.common_vocab_filename is not None: 267 | vocab_subset = load_vocab(args.common_vocab_filename) 268 | msg += "common vocabulary size: {}\n".format(len(vocab_subset)) 269 | 270 | # optimization for some LM (such as ELMo) 271 | model.optimize_top_layer(vocab_subset) 272 | 273 | filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs( 274 | vocab_subset, logger 275 | ) 276 | 277 | logger.info("\n" + msg + "\n") 278 | 279 | # dump arguments on file for log 280 | with open("{}/args.json".format(log_directory), "w") as outfile: 281 | json.dump(vars(args), outfile) 282 | 283 | data = load_file(args.dataset_filename) 284 | 285 | if args.lowercase: 286 | # lowercase all samples 287 | logger.info("lowercasing all samples...") 288 | all_samples = lowercase_samples( 289 | data, use_negated_probes=args.use_negated_probes 290 | ) 291 | else: 292 | # keep samples as they are 293 | all_samples = data 294 | 295 | 296 | # create uuid if not present 297 | i = 0 298 | for sample in all_samples: 299 | sample["uuid"] = i 300 | i += 1 301 | 302 | 303 | 304 | 305 | all_samples, ret_msg = filter_samples( 306 | model, data, vocab_subset, args.max_sentence_length, args.template 307 | ) 308 | 309 | # OUT_FILENAME = "{}.jsonl".format(args.dataset_filename) 310 | # with open(OUT_FILENAME, 'w') as outfile: 311 | # for entry in all_samples: 312 | # json.dump(entry, outfile) 313 | # outfile.write('\n') 314 | 315 | logger.info("\n" + ret_msg + "\n") 316 | 317 | 318 | # if template is active (1) use a single example for (sub,obj) and (2) ... 
319 | if args.template and args.template != "": 320 | facts = [] 321 | for sample in all_samples: 322 | sub = sample["sub_label"] 323 | obj = sample["obj_label"] 324 | uuid = sample["uuid"] 325 | if (sub, obj, uuid) not in facts: 326 | facts.append((sub, obj, uuid)) 327 | local_msg = "distinct template facts: {}".format(len(facts)) 328 | logger.info("\n" + local_msg + "\n") 329 | print(local_msg) 330 | all_samples = [] 331 | for fact in facts: 332 | (sub, obj, uuid) = fact 333 | sample = {"sub_label": sub, "obj_label": obj, "uuid": uuid} 334 | # substitute all sentences with a standard template 335 | sample["masked_sentences"] = parse_template( 336 | args.template.strip(), sample["sub_label"].strip(), base.MASK 337 | ) 338 | 339 | all_samples.append(sample) 340 | 341 | # shuffle data 342 | if shuffle_data: 343 | shuffle(all_samples) 344 | 345 | samples_batches, sentences_batches, ret_msg = batchify(all_samples, args.batch_size) 346 | logger.info("\n" + ret_msg + "\n") 347 | 348 | # ThreadPool 349 | num_threads = args.threads 350 | if num_threads <= 0: 351 | # use all available threads 352 | num_threads = multiprocessing.cpu_count() 353 | pool = ThreadPool(num_threads) 354 | list_of_results = [] 355 | 356 | for i in tqdm(range(len(samples_batches))): 357 | 358 | samples_b = samples_batches[i] 359 | sentences_b = [] 360 | current_batch_size = len(samples_b) 361 | for i, sample in enumerate(samples_b): 362 | masked_sentences = [] 363 | for num_mask in range(1, NUM_MASK+1): 364 | sentence = sample["masked_sentences"][0] 365 | sentence = sentence.replace(base.MASK, base.MASK * num_mask) 366 | sentence = sentence.replace("][", "] [") 367 | masked_sentences.append(sentence) 368 | sentences_b.append([sentence]) 369 | samples_b[i]["masked_sentences"] = masked_sentences 370 | ( 371 | original_log_probs_list, 372 | token_ids_list, 373 | masked_indices_list, 374 | ) = model.get_batch_generation(sentences_b, logger=logger) 375 | 376 | if vocab_subset is not None: 377 | # filter log_probs 378 | filtered_log_probs_list = model.filter_logprobs( 379 | original_log_probs_list, filter_logprob_indices 380 | ) 381 | else: 382 | filtered_log_probs_list = original_log_probs_list 383 | 384 | label_index_list = [] 385 | for sample in samples_b: 386 | obj_label_id = model.get_id(sample["obj_label"]) 387 | 388 | # MAKE SURE THAT obj_label IS IN VOCABULARIES 389 | if obj_label_id is None: 390 | raise ValueError( 391 | "object label id {} is None".format( 392 | sample["obj_label"] 393 | ) 394 | ) 395 | 396 | label_index_list.append(obj_label_id) 397 | 398 | dim_reshape = (current_batch_size, int(original_log_probs_list.shape[0]/current_batch_size), original_log_probs_list.shape[1], original_log_probs_list.shape[2]) 399 | original_log_probs_list = torch.reshape(original_log_probs_list, dim_reshape) 400 | filtered_log_probs_list = torch.reshape(filtered_log_probs_list, dim_reshape) 401 | 402 | masked_indices_list = np.reshape(np.array(masked_indices_list), (current_batch_size, int(len(masked_indices_list)/current_batch_size))) 403 | arguments = [ 404 | { 405 | "original_log_probs": original_log_probs, 406 | "filtered_log_probs": filtered_log_probs, 407 | "token_ids": token_ids, 408 | "vocab": model.vocab, 409 | "label_index": label_index, 410 | "masked_indices": masked_indices, 411 | "interactive": args.interactive, 412 | "index_list": index_list, 413 | "sample": sample, 414 | "candidates": candidates, 415 | } 416 | for sample, original_log_probs, filtered_log_probs, token_ids, label_index, masked_indices in zip( 417 | 
samples_b, original_log_probs_list, filtered_log_probs_list, token_ids_list, label_index_list, masked_indices_list, 418 | ) 419 | ] 420 | 421 | # multithread 422 | res = pool.map(run_thread, arguments) 423 | 424 | for idx, result in enumerate(res): 425 | 426 | result_masked_topk, msg = result 427 | 428 | logger.info("\n" + msg + "\n") 429 | 430 | sample = samples_b[idx] 431 | 432 | element = {"sample": sample, "uuid": sample["uuid"], "token_ids": token_ids_list[0], 433 | "masked_indices": masked_indices_list[0], "label_index": label_index_list[0], 434 | "masked_topk": result_masked_topk} 435 | 436 | list_of_results.append(element) 437 | 438 | pool.close() 439 | pool.join() 440 | 441 | # dump pickle with the result of the experiment 442 | all_results = dict( 443 | list_of_results=list_of_results 444 | ) 445 | with open("{}/result.pkl".format(log_directory), "wb") as f: 446 | pickle.dump(all_results, f) 447 | 448 | 449 | if __name__ == "__main__": 450 | parser = options.get_eval_KB_completion_parser() 451 | args = options.parse_args(parser) 452 | main(args) 453 | -------------------------------------------------------------------------------- /scripts/eval.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import numpy as np 4 | 5 | problem = [] 6 | f_out = open("./output/mbert_ranked.csv", "w") 7 | output_path = "./output/results/mbert_base/" 8 | path_compare = "./output/results/bert_base/en/" 9 | languages = list(os.walk(output_path))[0][1:-1][0] 10 | dict_languages_total = {} 11 | dict_languages_P = {} 12 | 13 | for lang in languages: 14 | print(lang) 15 | P_all = [] 16 | P_all_eng = [] 17 | total_all = [] 18 | relations = list(os.walk(output_path + lang + "/"))[0][1:-1][0] 19 | for relation in relations: 20 | if "date" in relation: 21 | continue 22 | P = 0.0 23 | P_eng = 0.0 24 | total = 0.0 25 | 26 | with open(output_path + lang + "/" + relation + "/" + 'result.pkl', 'rb') as f: 27 | data = pickle.load(f) 28 | 29 | with open(path_compare + relation + "/" + 'result.pkl', 'rb') as f: 30 | data_eng = pickle.load(f) 31 | 32 | if len(data["list_of_results"]) >0: 33 | eng_dict = {} 34 | for d in data_eng["list_of_results"]: 35 | rank = 0.0 36 | if d['masked_topk']["rank"]==0: 37 | rank = 1.0 38 | eng_dict[d["sample"]["uuid"]] = [rank, d["sample"]] 39 | for d in data["list_of_results"]: 40 | rank = 0.0 41 | if d['masked_topk']["rank"]==0: 42 | rank = 1.0 43 | P += rank 44 | total += 1.0 45 | idx = int(d["sample"]["uuid"]) 46 | if idx in eng_dict: 47 | P_eng += eng_dict[idx][0] 48 | 49 | P_all.append(P/total) 50 | P_all_eng.append(P_eng/total) 51 | total_all.append(total) 52 | 53 | f_out.write(lang) 54 | f_out.write(",") 55 | f_out.write(str(np.sum(total_all))) 56 | f_out.write(",") 57 | f_out.write(str(np.mean(P_all))) 58 | f_out.write(",") 59 | f_out.write(str(np.mean(P_all_eng))) 60 | f_out.write("\n") 61 | f_out.close() 62 | -------------------------------------------------------------------------------- /scripts/run_experiments_mBERT_ranked.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | import argparse 8 | from batch_eval_KB_completion_mBERT_ranked import main as run_evaluation 9 | from batch_eval_KB_completion_mBERT_ranked import load_file 10 | from mlama.modules import build_model_by_name 11 | import pprint 12 | import statistics 13 | from os import listdir 14 | import os 15 | from os.path import isfile, join 16 | from shutil import copyfile 17 | from collections import defaultdict 18 | import json 19 | 20 | LMs = [ 21 | { 22 | "lm": "bert", 23 | "label": "mbert_base", 24 | "models_names": ["bert"], 25 | "bert_model_name": "bert-base-multilingual-cased", 26 | "bert_model_dir": None 27 | }, 28 | ] 29 | 30 | 31 | def run_experiments( 32 | relations, 33 | data_path_pre, 34 | data_path_post, 35 | language, 36 | input_param={ 37 | "lm": "bert", 38 | "label": "bert_large", 39 | "models_names": ["bert"], 40 | "bert_model_name": "bert-large-cased", 41 | "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", 42 | }, 43 | ): 44 | model = None 45 | pp = pprint.PrettyPrinter(width=41, compact=True) 46 | if "P" in relations[0]["relation"]: 47 | object_path = "./data/TREx_multilingual_objects/" + language + ".json" 48 | else: 49 | object_path = "./data/GoogleRE_objects/" + language + ".json" 50 | 51 | with open(object_path) as f: 52 | candidates = json.load(f) 53 | 54 | for relation in relations: 55 | pp.pprint(relation) 56 | PARAMETERS = { 57 | "dataset_filename": "{}{}{}".format( 58 | data_path_pre, relation["relation"], data_path_post 59 | ), 60 | "common_vocab_filename": None, 61 | "template": "", 62 | "bert_vocab_name": "vocab.txt", 63 | "batch_size": 4, 64 | "logdir": "output", 65 | "full_logdir": "output/results/{}/{}/{}".format( 66 | input_param["label"], language, relation["relation"] 67 | ), 68 | "lowercase": False, 69 | "max_sentence_length": 100, 70 | "threads": -1, 71 | "interactive": False, 72 | } 73 | 74 | if "template" in relation: 75 | PARAMETERS["template"] = relation["template"] 76 | 77 | PARAMETERS.update(input_param) 78 | print(PARAMETERS) 79 | 80 | args = argparse.Namespace(**PARAMETERS) 81 | 82 | # see if file exists 83 | try: 84 | data = load_file(args.dataset_filename) 85 | except Exception as e: 86 | print("Relation {} excluded.".format(relation["relation"])) 87 | print("Exception: {}".format(e)) 88 | continue 89 | 90 | if model is None: 91 | [model_type_name] = args.models_names 92 | model = build_model_by_name(model_type_name, args) 93 | 94 | max_length = 0 95 | dict_num_mask = {} 96 | for obj in candidates[relation["relation"]]["objects"]: 97 | if len(model.tokenizer.tokenize(obj)) > max_length: 98 | max_length = len(model.tokenizer.tokenize(obj)) 99 | for l in range(1, max_length+1): 100 | dict_num_mask[l] = {} 101 | for obj in candidates[relation["relation"]]["objects"]: 102 | dict_num_mask[len(model.tokenizer.tokenize(obj))][obj] = model.get_id(obj) 103 | 104 | run_evaluation(args, max_length, dict_num_mask, shuffle_data=False, model=model) 105 | 106 | 107 | def get_TREx_parameters(data_path_pre="data/"): 108 | relations = load_file("{}relations.jsonl".format(data_path_pre)) 109 | data_path_pre += "TREx/" 110 | data_path_post = ".jsonl" 111 | return relations, data_path_pre, data_path_post 112 | 113 | 114 | def get_GoogleRE_parameters(): 115 | relations = [ 116 | { 117 | "relation": "place_of_birth", 118 | "template": "[X] was born in [Y] .", 119 | "template_negated": "[X] was not born in [Y] .", 120 | }, 121 | { 122 | "relation": "date_of_birth", 123 | "template": "[X] (born [Y]).", 124 | "template_negated": "[X] (not 
born [Y]).", 125 | }, 126 | { 127 | "relation": "place_of_death", 128 | "template": "[X] died in [Y] .", 129 | "template_negated": "[X] did not die in [Y] .", 130 | }, 131 | ] 132 | data_path_pre = "data/Google_RE/" 133 | data_path_post = "_test.jsonl" 134 | return relations, data_path_pre, data_path_post 135 | 136 | 137 | def get_MultiLingual_parameters(data_path_pre="./data/mlama1.1/", language=""): 138 | relations = load_file("{}/{}/templates.jsonl".format(data_path_pre, language)) 139 | data_path_pre += language + "/" 140 | data_path_post = ".jsonl" 141 | return relations, data_path_pre, data_path_post, language 142 | 143 | 144 | """def get_MultiLingual_parameters_GoogleRe(data_path_pre="./data/", language=""): 145 | relations = load_file("{}/templates.jsonl".format(data_path_pre, language)) 146 | data_path_pre += language + "/" 147 | data_path_post = "_test.jsonl" 148 | return relations, data_path_pre, data_path_post, language""" 149 | 150 | 151 | def run_all_LMs(parameters): 152 | for ip in LMs: 153 | print(ip["label"]) 154 | run_experiments(*parameters, input_param=ip) 155 | 156 | 157 | def main(): 158 | parser = argparse.ArgumentParser() 159 | parser.add_argument('--lang', '-l', type=str, default="fr", help='language') 160 | 161 | args = parser.parse_args() 162 | 163 | l = args.lang 164 | print(l) 165 | parameters = get_MultiLingual_parameters(language=l) 166 | run_all_LMs(parameters) 167 | 168 | if __name__ == "__main__": 169 | main() 170 | --------------------------------------------------------------------------------