├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── dataset-construction ├── .gitignore ├── README.md ├── configs │ ├── expand_objects.json │ ├── expand_subject.json │ ├── filter_objects.json │ ├── filter_subjects.json │ ├── for_v1.5 │ │ ├── NDB Relation Templates - P106.csv │ │ ├── NDB Relation Templates - P108.csv │ │ ├── NDB Relation Templates - P1082.csv │ │ ├── NDB Relation Templates - P1092.csv │ │ ├── NDB Relation Templates - P1110.csv │ │ ├── NDB Relation Templates - P1174.csv │ │ ├── NDB Relation Templates - P118.csv │ │ ├── NDB Relation Templates - P1198.csv │ │ ├── NDB Relation Templates - P1867.csv │ │ ├── NDB Relation Templates - P19.csv │ │ ├── NDB Relation Templates - P20.csv │ │ ├── NDB Relation Templates - P21.csv │ │ ├── NDB Relation Templates - P22.csv │ │ ├── NDB Relation Templates - P23.csv │ │ ├── NDB Relation Templates - P26.csv │ │ ├── NDB Relation Templates - P27.csv │ │ ├── NDB Relation Templates - P35.csv │ │ ├── NDB Relation Templates - P38.csv │ │ ├── NDB Relation Templates - P47.csv │ │ ├── NDB Relation Templates - P50.csv │ │ ├── NDB Relation Templates - P54.csv │ │ ├── NDB Relation Templates - P57.csv │ │ ├── NDB Relation Templates - P58.csv │ │ ├── NDB Relation Templates - P6.csv │ │ ├── NDB Relation Templates - P61.csv │ │ └── NDB Relation Templates - P69.csv │ └── generate_v1.5.json ├── requirements.txt ├── scripts │ ├── initial_sample.sh │ ├── make_databases.sh │ ├── make_questions.sh │ └── make_v2.4.sh └── src │ └── ndb_data │ ├── __init__.py │ ├── construction │ ├── __init__.py │ ├── make_database_finalize.py │ ├── make_database_initial.py │ ├── make_database_initial_cache.py │ └── make_questions.py │ ├── data_import │ ├── __init__.py │ ├── fix_sitelinks.py │ ├── kelm_data.py │ └── wikidata_index.py │ ├── dataset_statistics.py │ ├── generation │ ├── __init__.py │ ├── describe_db_facts.py │ ├── describe_dbs.py │ ├── filter_db_facts.py │ ├── finalize_hypothesis.py │ ├── map_kelm.py │ ├── plot_db_sizes.py │ ├── question_to_db.py │ └── template_first_db.py │ ├── sample_questions.py │ ├── sample_questions_100.py │ ├── sample_questions_1000.py │ ├── sample_questions_250.py │ ├── sample_questions_50.py │ ├── sample_questions_500.py │ ├── util │ ├── __init__.py │ └── build_json.py │ └── wikidata_common │ ├── __init__.py │ ├── common_mongo.py │ ├── kelm.py │ ├── wikidata.py │ └── wikpedia.py ├── modelling ├── .gitignore ├── README.md ├── requirements.txt ├── scripts │ ├── baselines │ │ ├── retrieve.sh │ │ ├── train_longformer.sh │ │ ├── train_t5.sh │ │ └── train_t5_retriever.sh │ ├── convert_ssg_predictions.sh │ ├── experiments_baselines.sh │ ├── experiments_ours.sh │ └── ours │ │ ├── predict_spj.sh │ │ ├── predict_spj_rand_sweep.sh │ │ └── train_spj.sh ├── setup.py ├── src │ ├── __init__.py │ └── neuraldb │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── run.cpython-38.pyc │ │ ├── convert_legacy_predictions.py │ │ ├── convert_spj_to_predictions.py │ │ ├── convert_ssg_predictions.py │ │ ├── dataset │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── data_collator_seq2seq.cpython-38.pyc │ │ │ ├── neuraldb_file_reader.cpython-38.pyc │ │ │ ├── neuraldb_parser.cpython-38.pyc │ │ │ └── seq2seq_dataset.cpython-38.pyc │ │ ├── data_collator_seq2seq.py │ │ ├── instance_generator │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-38.pyc │ │ │ │ ├── instance_generator.cpython-38.pyc │ │ │ │ ├── perfectir_generator.cpython-38.pyc │ │ │ │ ├── spj_generator.cpython-38.pyc │ │ │ │ └── wholedb_generator.cpython-38.pyc │ │ │ ├── externalir_generator.py │ │ │ ├── externalir_generator_maxtok.py │ │ │ ├── instance_generator.py │ │ │ ├── perfectir_generator.py │ │ │ ├── spj_generator.py │ │ │ ├── subsampler.py │ │ │ └── wholedb_generator.py │ │ ├── neuraldb_file_reader.py │ │ ├── neuraldb_parser.py │ │ └── seq2seq_dataset.py │ │ ├── dataset_statistics.py │ │ ├── evaluation │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── postprocess_baselines.cpython-38.pyc │ │ │ └── scoring_functions.cpython-38.pyc │ │ ├── postprocess_baselines.py │ │ ├── postprocess_spj.py │ │ └── scoring_functions.py │ │ ├── final_scoring.py │ │ ├── final_scoring_with_dbsize.py │ │ ├── final_scoring_with_dbsize_sweep.py │ │ ├── modelling │ │ ├── __init__.py │ │ └── neuraldb_trainer.py │ │ ├── retriever │ │ ├── __init__.py │ │ ├── dpr.py │ │ └── tfidf.py │ │ ├── run.py │ │ └── util │ │ ├── __init__.py │ │ └── log_helper.py └── tests │ └── test_evaluation.py ├── overview.png ├── requirements-dev.txt ├── setup.cfg └── ssg ├── README.md ├── evaluate_set_ssg.py ├── requirements.txt ├── ssg_prediction.py ├── ssg_utils.py └── train_ssg.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to NeuralDB 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to NeuralDB, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Database Reasoning over Text 2 | 3 | This repository contains the code for the [Database Reasoning Over Text](https://arxiv.org/pdf/2106.01074.pdf) paper, 4 | to appear at ACL2021. Work is performed in collaboration with James Thorne, Majid Yazdani, Marzieh Saeidi, Fabrizio Silvestri, Sebastian Riedel, and Alon Halevy. 5 | 6 | 7 | ![Overview Image](overview.png) 8 | 9 | 10 | ## Data 11 | The completed NeuralDB datasets can be downloaded [here](https://dl.fbaipublicfiles.com/neuraldb/fb-data-WikiNLDB.zip) and are released under a [CC BY-SA 3.0 license](https://creativecommons.org/licenses/by-sa/2.0/). 12 | 13 | The dataset includes entity names from Wikidata which are released under a [CC BY-SA 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/). 14 | This dataset includes sentences from the KELM corpus. KELM is released under the [CC BY-SA 2.0 license](https://creativecommons.org/licenses/by-sa/2.0/) 15 | 16 | 17 | ## Repository Structure 18 | The repository is structured in 3 sub-folders: 19 | 20 | * Tools for mapping the KELM data to Wikidata identifiers are provided in the [dataset construction](dataset-construction/) folder , 21 | * The information retrieval system for the support set generator are provided in the [ssg](ssg/) folder 22 | * The models for Neural SPJ, the baseline retrieval (TF-IDF and DPR), and evaluation scripts are provided in the [modelling folder](modelling/). 23 | 24 | Instructions for running each component are provided in the README files in the respective sub-folders. 25 | 26 | ## Setup 27 | 28 | All sub-folders were set up with one Python environment per folder. Requirements for each environment can be installed by 29 | running a pip install: 30 | 31 | ``` 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | In the `dataset-construction` and `modelling` folders, the `src` folder should be included in the python path. 36 | 37 | ``` 38 | export PYTHONPATH=src 39 | ``` 40 | 41 | ## License 42 | 43 | The code in this repository is released under the [Apache 2.0 license](LICENSE) 44 | -------------------------------------------------------------------------------- /dataset-construction/.gitignore: -------------------------------------------------------------------------------- 1 | resources/ 2 | work/ 3 | -------------------------------------------------------------------------------- /dataset-construction/configs/expand_objects.json: -------------------------------------------------------------------------------- 1 | { 2 | "P54": { 3 | "P17": [ 4 | "What is the country of the player who" 5 | ], 6 | "P118": [ 7 | "What is the league of the player who" 8 | ] 9 | }, 10 | "P50": { 11 | "P27": [ 12 | "What is the country of the person that" 13 | ], 14 | "P21": [ 15 | "What is the gender of the person that" 16 | ], 17 | "P108": [ 18 | "What is the employer of the person that" 19 | ], 20 | "P106": [ 21 | "What is the occupation of the person that" 22 | ], 23 | "P39": [ 24 | "What is the position of the person that" 25 | ], 26 | "P166": [ 27 | "What prize did the person that $X win?" 28 | ], 29 | "P463": [ 30 | "What is the affiliation of the person that" 31 | ], 32 | "P937": [ 33 | "What is the field that $X works in?" 34 | ] 35 | }, 36 | "P61": { 37 | "P27": [ 38 | "What is the country of the person that" 39 | ], 40 | "P21": [ 41 | "What is the gender of the person that" 42 | ], 43 | "P108": [ 44 | "What is the employer of the person that" 45 | ], 46 | "P106": [ 47 | "What is the occupation of the person that" 48 | ], 49 | "P39": [ 50 | "What is the position of the person that" 51 | ], 52 | "P166": [ 53 | "What prize did the person that $X win?" 54 | ], 55 | "P463": [ 56 | "What is the affiliation of the person that" 57 | ], 58 | "P937": [ 59 | "What is the field that $X works in?" 60 | ] 61 | }, 62 | "P69": { 63 | "P27": [ 64 | "What country is the institution of the person who has" 65 | ], 66 | "P463": [ 67 | "What affiliation is the institution of the person that has" 68 | ], 69 | "P937": [ 70 | "What is the field of the institution of the person that has" 71 | ] 72 | } 73 | } -------------------------------------------------------------------------------- /dataset-construction/configs/expand_subject.json: -------------------------------------------------------------------------------- 1 | { 2 | "P54": { 3 | "P27": ["What is the citizenship of the player that"], 4 | "P2067": ["What is the weight of the player that"], 5 | "P2048": ["What is the height of the player that"], 6 | "P569": ["What is the date of birth of the player that"] 7 | }, 8 | "P50": { 9 | "P136": ["What is the genre of the work of the"], 10 | "P495": ["Where did the work that $X originate from?"], 11 | "P577": ["When was the work that $X published"] 12 | }, 13 | "P58": { 14 | "P136": ["What is the genre of the work that"], 15 | "P495": ["Where did the work that $X originate from?"], 16 | "P577": ["When was the work that $X published"] 17 | }, 18 | "P69": { 19 | "P27": ["What is the country of the person that went to"], 20 | "P21": ["What is the gender of the person that went to"], 21 | "P108": ["What is the employer of the person that went to"], 22 | "P106": ["What is the occupation of the person that went to"], 23 | "P39": ["What is the position of the person that went to"], 24 | "P166": ["What prize did the person that went to $X win?"], 25 | "P463": ["What is the affiliation of the person that went to"], 26 | "P937": ["What field does the person that went to $X work in?"] 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /dataset-construction/configs/filter_objects.json: -------------------------------------------------------------------------------- 1 | { 2 | "P54": { 3 | "P17": ["a club located in $AO", "a team located in $AO"], 4 | "P118": ["a team in $AO"] 5 | }, 6 | "P50": { 7 | 8 | "P27": ["someone from $AO"], 9 | "P21": ["someone who is a $AO"], 10 | "P108": ["someone who is employed by $AO"], 11 | "P106": ["someone who works as a $AO"], 12 | "P39": ["someone who is a $AO"], 13 | "P166": ["someone who won a $AO"], 14 | "P463": ["someone who is a member of $AO"], 15 | "P937": ["someone who works in $AO"] 16 | }, 17 | "P58": { 18 | 19 | "P27": ["someone from $AO"], 20 | "P21": ["someone who is a $AO"], 21 | "P108": ["someone who is employed by $AO"], 22 | "P106": ["someone who works as a $AO"], 23 | "P39": ["someone who is a $AO"], 24 | "P166": ["someone who won a $AO"], 25 | "P463": ["someone who is a member of $AO"], 26 | "P937": ["someone who works in $AO"] 27 | }, 28 | "P61": { 29 | "P27": ["someone from $AO"], 30 | "P21": ["someone who is a $AO"], 31 | "P108": ["someone who is employed by $AO"], 32 | "P106": ["someone who works as a $AO"], 33 | "P39": ["someone who is a $AO"], 34 | "P166": ["someone who won a $AO"], 35 | "P463": ["someone who is a member of $AO"], 36 | "P937": ["someone who works in $AO"] 37 | }, 38 | "P69": { 39 | "P17": ["an institution in $AO"] 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /dataset-construction/configs/filter_subjects.json: -------------------------------------------------------------------------------- 1 | { 2 | "P54": { 3 | "P2067": ["someone who weighs $AO"], 4 | "P2048": ["someone $AO tall"], 5 | "P21": ["someone who is a $AO"], 6 | "P413": ["someone who plays $AO"], 7 | "P569": ["someone born on $AO"], 8 | "P27": ["someone from $AO"] 9 | }, 10 | "P50": { 11 | "P136": ["a work with the genre $AO"], 12 | "P495": ["a work originating from $AO"], 13 | "P577": ["a work published on $AO"] 14 | }, 15 | "P58": { 16 | "P136": ["a work with the genre $AO"], 17 | "P495": ["a work originating from $AO"], 18 | "P577": ["a work published on $AO"] 19 | }, 20 | "P69": { 21 | "P27": ["someone from $AO"], 22 | "P21": ["someone who is a $AO"], 23 | "P108": ["someone who is employed by $AO"], 24 | "P106": ["someone who works as a $AO"], 25 | "P39": ["someone who is a $AO"], 26 | "P166": ["someone who won a $AO"], 27 | "P463": ["someone who is a member of $AO"], 28 | "P937": ["someone who works in $AO"] 29 | } 30 | } -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P106.csv: -------------------------------------------------------------------------------- 1 | 106,https://www.wikidata.org/wiki/Property:P106,$s person has job $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s works as a $o,Yuri Gagarin works as a astronaut,Is $s an $o?,Is Yuri Gagarin an astronaut?,TRUE,TRUE,What jobs does $s have?,What jobs does Yuri Gagarin have?,$o,astronaut,How many jobs has $s had?,How many jobs has Yuri Gagarin had?,$o,astronaut,What is the least popular job?,What is the least popular job?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,What is the most popular job?,What is the most popular job?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,,,,,,,, 4 | $s is a $o,Yuri Gagarin is a astronaut,Does $s work as an $o?,Does Yuri Gagarin work as an astronaut?,TRUE,TRUE,Who works as a $o?,Who works as a astronaut?,$s,Yuri Gagarin,How many people are $o?,How many people are astronaut?,$s,Yuri Gagarin,What job has the fewest number of people working there?,What job has the fewest number of people working there?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,What job has had the highest number of people working there?,What job has had the highest number of people working there?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,,,,,,,, 5 | $s's job is a $o,Yuri Gagarin's job is a astronaut,Is $s's job a $o?,Is Yuri Gagarin's job a astronaut?,TRUE,TRUE,What does $s do?,What does Yuri Gagarin do?,$o,astronaut,,,,,Who has had the fewest jobs?,Who has had the fewest jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,Who has had the most jobs?,Who has had the most jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,,,,,,,, 6 | ,,,,,,What is $s's job?,What is Yuri Gagarin's job?,$o,astronaut,,,,,Who has had the least number of jobs?,Who has had the least number of jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,Who has had the highest number of jobs?,Who has had the highest number of jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,,,,,,,, 7 | ,,,,,,What are $s's jobs?,What are Yuri Gagarin's jobs?,$o,astronaut,,,,,,,,,,,,,,,,,,,, 8 | ,,,,,,Who is a $o?,Who is a astronaut?,$s,Yuri Gagarin,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,Who has a career as a $o?,Who has a career as a astronaut?,$s,Yuri Gagarin,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P108.csv: -------------------------------------------------------------------------------- 1 | 108,https://www.wikidata.org/wiki/Property:P106,$s person has job at $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s is employed by $o,Neil Armstrong is employed by NASA,Is $s employed by $o?,Is Neil Armstrong employed by NASA?,TRUE,TRUE,Who does $s work for?,Who does Neil Armstrong work for?,$o,NASA,How many places has $s worked?,How many places has Neil Armstrong worked?,$o,NASA,,,,,,,,,,,,,,,, 4 | The employer of $s is $o,The employer of Neil Armstrong is NASA,Does $s work for $o?,Does Neil Armstrong work for NASA?,TRUE,TRUE,Who works for $o?,Who works for NASA?,$s,Neil Armstrong,How many people work for $o?,How many people work for NASA?,$s,Neil Armstrong,,,,,,,,,,,,,,,, 5 | $s works for $o,Neil Armstrong works for NASA,Is $o the employer of $s?,Is NASA the employer of Neil Armstrong?,TRUE,TRUE,Who is employed by $o?,Who is employed by NASA?,$s,Neil Armstrong,,,,,,,,,,,,,,,,,,,, 6 | ,,,,,,What company is $s employed by?,What company is Neil Armstrong employed by?,$o,NASA,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1082.csv: -------------------------------------------------------------------------------- 1 | 1082,https://www.wikidata.org/wiki/Property:P1082,"Population of $s is $o",,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | There are $o inhabitants of $s,There are 5079791 inhabitants of San Francisco,Is the population of $s $o?,Is the population of San Francisco 5079791?,TRUE,TRUE,,,,,How many people live in $s?,How many people live in San Francisco?,$o,5079791,Which place has the smallest population?,Which place has the smallest population?,$s [SEP] $o,San Francisco [SEP] 5079791,Which place has the biggest population?,Which place has the biggest population?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the smallest population?,What is the smallest population?,$o,5079791,What is the largest population?,What is the largest population?,$o,5079791 4 | The population of $s is $o,The population of San Francisco is 5079791,Are there $o people living in $s?,Are there 5079791 people living in San Francisco?,TRUE,TRUE,,,,,What is the population of $s?,What is the population of San Francisco?,$o,5079791,What is the least inhabited place?,What is the least inhabited place?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the most inhabited place?,What is the most inhabited place?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the lowest population?,What is the lowest population?,$o,5079791,What is the highest population?,What is the highest population?,$o,5079791 5 | There are $o people living in $s,There are 5079791 people living in San Francisco,Is the number of people living in $s $o?,Is the number of people living in San Francisco 5079791?,TRUE,TRUE,,,,,,,,,What place has the fewest people living there?,What place has the fewest people living there?,$s [SEP] $o,San Francisco [SEP] 5079791,What place has the most people living there?,What place has the most people living there?,$s [SEP] $o,San Francisco [SEP] 5079791,,,,,,,, 6 | ,,Are there $o inhabitants of $s?,Are there 5079791 inhabitants of San Francisco?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1092.csv: -------------------------------------------------------------------------------- 1 | 1092,https://www.wikidata.org/wiki/Property:P1092,$ was made $o times,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | There were $o $s made,There were 125 CC-150 Polaris made,,,,,What items are there?,What items are there?,$s,CC-150 Polaris,How many $s are there?,How many CC-150 Polaris are there?,$o,125,What item has been made the least?,What item has been made the least?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What item has been made the most?,What item has been made the most?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What was the smallest production run?,What was the smallest production run?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What was the largest production run?,What was the largest production run?,$s [SEP] $o,CC-150 Polaris [SEP] 125 4 | $s was made $o times,CC-150 Polaris was made 125 times,,,,,List all items,List all items,$s,CC-150 Polaris,How many $s exist?,How many CC-150 Polaris exist?,$o,125,What is the meast manufactured item?,What is the meast manufactured item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most manufactured item?,What is the most manufactured item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,, 5 | There are $o $s in existence ,There are 125 CC-150 Polaris in existence ,,,,,What items exist?,What items exist?,$s,CC-150 Polaris,How many $s were manufactured?,How many CC-150 Polaris were manufactured?,$o,125,What is the least common item?,What is the least common item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most common item?,What is the most common item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,, 6 | ,,,,,,,,,,How many items were made?,How many items were made?,$o,125,What is the rarest item?,What is the rarest item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most abundant item?,What is the most abundant item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,, 7 | ,,,,,,,,,,How many things were manufactured?,How many things were manufactured?,$o,125,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1110.csv: -------------------------------------------------------------------------------- 1 | 1110,https://www.wikidata.org/wiki/Property:P1110,$s has attendance of $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | The attendance of $s was $o,"The attendance of 1982 FIFA World Cup was 2,109,723",,,,,What were the attendances of all events?,What were the attendances of all events?,$o,"2,109,723",How many people went to $o?,"How many people went to 2,109,723?",$o,"2,109,723",What was the least popular event?,What was the least popular event?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What was the most popular event?,What was the most popular event?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",How many people went to the smallest event?,How many people went to the smallest event?,$o,"2,109,723",How many people went to the largest event?,How many people went to the largest event?,$o,"2,109,723" 4 | The attendance of $s is $o,"The attendance of 1982 FIFA World Cup is 2,109,723",,,,,What was the attendance of $s?,What was the attendance of 1982 FIFA World Cup?,$o,"2,109,723",How many people went to events?,How many people went to events?,$o,"2,109,723",Which event was least well attended?,Which event was least well attended?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event had the best attendance?,What event had the best attendance?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",How many went to the smallest event?,How many went to the smallest event?,$o,"2,109,723",How many went to the largest event?,How many went to the largest event?,$o,"2,109,723" 5 | $o people went to $s,"2,109,723 people went to 1982 FIFA World Cup",,,,,,,,,,,,,What event had the smallest turnout?,What event had the smallest turnout?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event had the largest turnout?,What event had the largest turnout?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What was the smallest crowd?,What was the smallest crowd?,$o,"2,109,723",What was the largest crowd?,What was the largest crowd?,$o,"2,109,723" 6 | The number of people at $s was $o,"The number of people at 1982 FIFA World Cup was 2,109,723",,,,,,,,,,,,,What event had the lowest attendance?,What event had the lowest attendance?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event was most well attended?,What event was most well attended?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",,,,,,,, 7 | $o went to $s,"2,109,723 went to 1982 FIFA World Cup",,,,,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1174.csv: -------------------------------------------------------------------------------- 1 | 1174,https://www.wikidata.org/wiki/Property:P1174,$s has $o visitors per year,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s has $o visitors per year,"Eiffel Tower has 7,000,000 visitors per year",,,,,,,,,How many people visit attractions every year?,How many people visit attractions every year?,$o,"7,000,000",What is the least attended attraction?,What is the least attended attraction?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the most attended attraction?,What is the most attended attraction?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest attendance?,What is the smallest attendance?,$o,"7,000,000",What is the largest attendance?,What is the largest attendance?,$o,"7,000,000" 4 | $o people visit $s every year,"7,000,000 people visit Eiffel Tower every year",,,,,,,,,How many people visit $s every year?,How many people visit Eiffel Tower every year?,$o,"7,000,000",What is the attraction with the lowest visitor count per year?,What is the attraction with the lowest visitor count per year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the attraction with the highest visitor count per year?,What is the attraction with the highest visitor count per year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest yearly attendance?,What is the smallest yearly attendance?,$o,"7,000,000",What is the largest yearly attendance?,What is the largest yearly attendance?,$o,"7,000,000" 5 | $o people visit $s annually ,"7,000,000 people visit Eiffel Tower annually ",,,,,,,,,,,,,What has the lowest number of visitors each year?,What has the lowest number of visitors each year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What has the highest number of visitors each year?,What has the highest number of visitors each year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest annual attendance?,What is the smallest annual attendance?,$o,"7,000,000",What is the largest annual attendance?,What is the largest annual attendance?,$o,"7,000,000" 6 | The yearly visitor count at $s is $o,"The yearly visitor count at Eiffel Tower is 7,000,000",,,,,,,,,,,,,,,,,,,,,,,,,,,, 7 | $o people go to $s each year,"7,000,000 people go to Eiffel Tower each year",,,,,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P118.csv: -------------------------------------------------------------------------------- 1 | 118,https://www.wikidata.org/wiki/Property:P118,$s is in league $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s plays in $o,Bastian Schweinsteiger plays in Bundesliga,Does $s play in $o?,Does Bastian Schweinsteiger play in Bundesliga?,TRUE,TRUE,What league does $s play for?,What league does Bastian Schweinsteiger play for?,$o,Bundesliga,How many people play in $o?,How many people play in Bundesliga?,$s,Bastian Schweinsteiger,What is the smallest league?,What is the smallest league?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,What is the largest league?,What is the largest league?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,,,,,,,, 4 | $s is a player in $o,Bastian Schweinsteiger is a player in Bundesliga,Is the league that $s plays for $o?,Is the league that Bastian Schweinsteiger plays for Bundesliga?,TRUE,TRUE,Who plays in $o?,Who plays in Bundesliga?,$s,Bastian Schweinsteiger,How many leagues has $s played for?,How many leagues has Bastian Schweinsteiger played for?,$o,Bundesliga,Who has played in the fewest leagues?,Who has played in the fewest leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,Who has played for the most leagues?,Who has played for the most leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,,,,,,,, 5 | The league that $s plays in is $o,The league that Bastian Schweinsteiger plays in is Bundesliga,Does $s participate in $o?,Does Bastian Schweinsteiger participate in Bundesliga?,TRUE,TRUE,What league is $s participant of?,What league is Bastian Schweinsteiger participant of?,$o,Bundesliga,How many leagues has $s participated in?,How many leagues has Bastian Schweinsteiger participated in?,$o,Bundesliga,Who has participated in the least number of leagues?,Who has participated in the least number of leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,Who has participated in the most number of leagues?,Who has participated in the most number of leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,,,,,,,, 6 | $s participates in $o,Bastian Schweinsteiger participates in Bundesliga,Is $s a participant of $o?,Is Bastian Schweinsteiger a participant of Bundesliga?,TRUE,TRUE,Who participates in $o?,Who participates in Bundesliga?,$s,Bastian Schweinsteiger,How many participants are there in $o?,How many participants are there in Bundesliga?,$s,Bastian Schweinsteiger,What league has the fewest participants?,What league has the fewest participants?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,What league has the most participants?,What league has the most participants?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1198.csv: -------------------------------------------------------------------------------- 1 | 1198,https://www.wikidata.org/wiki/Property:P1198,The unemployment rate of $s is $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | The unemployment rate of $s is $o,The unemployment rate of United States of America is 6.7 percent,Is the unpemployment rate of $s $o?,Is the unpemployment rate of United States of America 6.7 percent?,TRUE,TRUE,What is unemployment rate of $s?,What is unemployment rate of United States of America?,$o,6.7 percent,,,,,Where does it have the lowest unemployment rate?,Where does it have the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where does it have the highest unemployment rate?,Where does it have the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,What is the lowest unemployment rate?,What is the lowest unemployment rate?,$o,6.7 percent,What is the highest unemployment rate?,What is the highest unemployment rate?,$o,6.7 percent 4 | There are $o of people without jobs in $s,There are 6.7 percent of people without jobs in United States of America,Are there $o people unemployed in $s?,Are there 6.7 percent people unemployed in United States of America?,TRUE,TRUE,List unemployment rates,List unemployment rates,,,,,,,Where is the lowest unemployment rate?,Where is the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where is the highest unemployment rate?,Where is the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,, 5 | There are $o of people without a job in $s,There are 6.7 percent of people without a job in United States of America,Are $o people out of work in $s?,Are 6.7 percent people out of work in United States of America?,,,,,,,,,,,Where is the lowest unemployment?,Where is the lowest unemployment?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where is the highest unemployment?,Where is the highest unemployment?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,, 6 | $s's unemployment rate is $o,United States of America's unemployment rate is 6.7 percent,,,,,,,,,,,,,Which places have the lowest unemployment rate?,Which places have the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Which places have the highest unemployment rate?,Which places have the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,, 7 | There are $o people out work in $s,There are 6.7 percent people out work in United States of America,,,,,,,,,,,,,Which places have the least people out of work?,Which places have the least people out of work?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Which places have the most people out of work?,Which places have the most people out of work?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P1867.csv: -------------------------------------------------------------------------------- 1 | 1867,https://www.wikidata.org/wiki/Property:P1831,$s has number of qualified/eligible voters $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | There are $o registered voters in $s,There are 1235532 registered voters in Scotland,Does $s have $o voters?,Does Scotland have 1235532 voters?,TRUE,TRUE,,,,,How many registered voters are there?,How many registered voters are there?,$o,1235532,Which place has the fewest registered voters?,Which place has the fewest registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,Which place has the most registered voters?,Which place has the most registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What is the lowest voter registration?,What is the lowest voter registration?,$o,1235532,What is the highest voter regitration?,What is the highest voter regitration?,$o,1235532 4 | There were $o registered voters in $s,There were 1235532 registered voters in Scotland,Are there $o voters in $s?,Are there 1235532 voters in Scotland?,TRUE,TRUE,,,,,How many registered voters are there in $s?,How many registered voters are there in Scotland?,$o,1235532,Where is the lowest number of registered voters?,Where is the lowest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,Where is the highest number of registered voters?,Where is the highest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What is the smallest number of registered voters?,What is the smallest number of registered voters?,$o,1235532,What is the largest number of voters?,What is the largest number of voters?,$o,1235532 5 | The number of eligible voters in $s is $o,The number of eligible voters in Scotland is 1235532,Are there $o registered voters in $s?,Are there 1235532 registered voters in Scotland?,TRUE,TRUE,,,,,,,,,What places have the lowest number of registered voters?,What places have the lowest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What places have the highest number of registered voters?,What places have the highest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,,,,,,,, 6 | The number of eligible voters in $s was $o,The number of eligible voters in Scotland was 1235532,Does $s have $o eligible voters?,Does Scotland have 1235532 eligible voters?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,, 7 | $s has $o eligible voters,Scotland has 1235532 eligible voters,,,,,,,,,,,,,,,,,,,,,,,,,,,, 8 | The size of the electorate in $s is $o,The size of the electorate in Scotland is 1235532,,,,,,,,,,,,,,,,,,,,,,,,,,,, 9 | There are $o registered voters in $s,There are 1235532 registered voters in Scotland,,,,,,,,,,,,,,,,,,,,,,,,,,,, 10 | There are $o voters in $s,There are 1235532 voters in Scotland,,,,,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P19.csv: -------------------------------------------------------------------------------- 1 | 19,https://www.wikidata.org/wiki/Property:P19,$s was born in $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s was born in $o,Julius Caesar was born in Rome,Was $s born in $o?,Was Julius Caesar born in Rome?,TRUE,TRUE,Who was born in $o?,Who was born in Rome?,$s,Julius Caesar,How many people were born in $o?,How many people were born in Rome?,$s,Julius Caesar,Which place is the birthplace of the fewest people?,Which place is the birthplace of the fewest people?,$o [SEP] $s,Rome [SEP] Julius Caesar,Which place is the birthplace of the most people?,Which place is the birthplace of the most people?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,, 4 | $o is the place of birth of $s,Rome is the place of birth of Julius Caesar,Is $o the place of birth of $s?,Is Rome the place of birth of Julius Caesar?,TRUE,TRUE,Who is from $o?,Who is from Rome?,$s,Julius Caesar,Count the number of people born in $o,Count the number of people born in Rome,$s,Julius Caesar,Which place had the least people born there?,Which place had the least people born there?,$o [SEP] $s,Rome [SEP] Julius Caesar,Which places has had the highest number of births?,Which places has had the highest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,, 5 | $o is where $s was born,Rome is where Julius Caesar was born,Is $o where $s was born?,Is Rome where Julius Caesar was born?,TRUE,TRUE,Where was $s born?,Where was Julius Caesar born?,$o,Rome,,,,,What is the place with the fewest number of births?,What is the place with the fewest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,What is the place with the highest number of births?,What is the place with the highest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,, 6 | ,,Was $o the birthplace of $s?,Was Rome the birthplace of Julius Caesar?,TRUE,TRUE,Where is $s from?,Where is Julius Caesar from?,$o,Rome,,,,,What is the place with the least number of births?,What is the place with the least number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,What is the place with the most births?,What is the place with the most births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,, 7 | ,,,,,,List everyone born in $o,List everyone born in Rome,$s,Julius Caesar,,,,,,,,,What place has the most people born there?,What place has the most people born there?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,, 8 | ,,,,,,List people born in $o,List people born in Rome,$s,Julius Caesar,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P20.csv: -------------------------------------------------------------------------------- 1 | 20,https://www.wikidata.org/wiki/Property:P20,$s was died in $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s died in $o,René Descartes died in Stockholm,Did $s die in $o?,Did René Descartes die in Stockholm?,TRUE,TRUE,Where did $s die?,Where did René Descartes die?,$o,Stockholm,How many people have died in $o?,How many people have died in Stockholm?,$s,René Descartes,Which places have the fewest people die there?,Which places have the fewest people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,Which places have the most people die there?,Which places have the most people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,, 4 | The place of death for $s was $o,The place of death for René Descartes was Stockholm,Is $o where $s died?,Is Stockholm where René Descartes died?,TRUE,TRUE,Who died in $o?,Who died in Stockholm?,$s,René Descartes,How many people died in $o?,How many people died in Stockholm?,$s,René Descartes,Which place has had the fewest people die there?,Which place has had the fewest people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,Which place has had the most people die there?,Which place has had the most people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,, 5 | $o is where $s died,Stockholm is where René Descartes died,Is $o the place of death of $s?,Is Stockholm the place of death of René Descartes?,TRUE,TRUE,Which people have died in $o?,Which people have died in Stockholm?,$s,René Descartes,Count the number of people who have died in $o,Count the number of people who have died in Stockholm,$s,René Descartes,Where is the place where the fewest people have died?,Where is the place where the fewest people have died?,$o [SEP] $s,Stockholm [SEP] René Descartes,Where is the place where the most people have died?,Where is the place where the most people have died?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,, 6 | ,,,,,,Who has died in $o?,Who has died in Stockholm?,$s,René Descartes,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P21.csv: -------------------------------------------------------------------------------- 1 | 21,https://www.wikidata.org/wiki/Property:P21,"$s , gender, $o ",,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s is $o,Tintin is male,Is $s $o?,Is Tintin male?,TRUE,TRUE,Who is $o?,Who is male?,$s,Tintin,How many people are $o?,How many people are male?,$s,Tintin,Which gender has the fewest people?,Which gender has the fewest people?,$o [SEP] $s,male [SEP] Tintin,Which gender has the most people?,Which gender has the most people?,$o [SEP] $s,male [SEP] Tintin,,,,,,,, 4 | The gender of $s is $o,The gender of Tintin is male,,,,,Which people are $o?,Which people are male?,$s,Tintin,,,,,What is the least popular gender?,What is the least popular gender?,$o [SEP] $s,male [SEP] Tintin,What is the most popular gender?,What is the most popular gender?,$o [SEP] $s,male [SEP] Tintin,,,,,,,, 5 | $s is a $o,Tintin is a male,,,,,List everyone who is $o,List everyone who is male,$s,Tintin,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P22.csv: -------------------------------------------------------------------------------- 1 | 22,https://www.wikidata.org/wiki/Property:P22,$s has father $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $o is the father of $s,George Bush is the father of Jenna Bush,Is $o a parent of $s?,Is George Bush a parent of Jenna Bush?,TRUE,TRUE,Who are the children of $o?,Who are the children of George Bush?,$s,Jenna Bush,How many children does $o have?,How many children does George Bush have?,$s,Jenna Bush,Which parent has the fewest children?,Which parent has the fewest children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Which parent has the most children?,Which parent has the most children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,, 4 | $s's father is $o,Jenna Bush's father is George Bush,Is $o $s's parent?,Is George Bush Jenna Bush's parent?,TRUE,TRUE,Who are the parents of $s?,Who are the parents of Jenna Bush?,$o,George Bush,How many parents does $s have?,How many parents does Jenna Bush have?,$o,George Bush,Which parent doesn't have many children?,Which parent doesn't have many children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Which parent has the highest number of children?,Which parent has the highest number of children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,, 5 | $o is $s's dad,George Bush is Jenna Bush's dad,Is $o $s's father?,Is George Bush Jenna Bush's father?,TRUE,TRUE,Who is the father of $s?,Who is the father of Jenna Bush?,$o,George Bush,How many people are children of $o?,How many people are children of George Bush?,$s,Jenna Bush,Who has the fewest children?,Who has the fewest children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Who has the most children?,Who has the most children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,, 6 | $o is $s's father,George Bush is Jenna Bush's father,Is $o $s's dad?,Is George Bush Jenna Bush's dad?,TRUE,TRUE,Who are $o's children?,Who are George Bush's children?,$s,Jenna Bush,,,,,Who has the least kids?,Who has the least kids?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Who has the most kids?,Who has the most kids?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,, 7 | $s is a child of $o,Jenna Bush is a child of George Bush,Is $o the father of $s?,Is George Bush the father of Jenna Bush?,TRUE,TRUE,,,,,,,,,Who has the least children?,Who has the least children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,,,,, 8 | $o is a parent of $s,George Bush is a parent of Jenna Bush,Is $s a parent of $s?,Is Jenna Bush a parent of Jenna Bush?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 9 | ,,Is $s $o's parent?,Is Jenna Bush George Bush's parent?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 10 | ,,Is $s $o's father?,Is Jenna Bush George Bush's father?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 11 | ,,Is $s $o's dad?,Is Jenna Bush George Bush's dad?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 12 | ,,Is $s the father of $o?,Is Jenna Bush the father of George Bush?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P23.csv: -------------------------------------------------------------------------------- 1 | 23,https://www.wikidata.org/wiki/Property:P22,$s has mother $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $o is the mother of $s,Marge Simpson is the mother of Lisa Simpson,Is $o a parent of $s?,Is Marge Simpson a parent of Lisa Simpson?,TRUE,TRUE,Who are the children of $o?,Who are the children of Marge Simpson?,$s,Lisa Simpson,How many children does $o have?,How many children does Marge Simpson have?,$s,Lisa Simpson,Which parent has the fewest children?,Which parent has the fewest children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Which parent has the most children?,Which parent has the most children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,, 4 | $s's mother is $o,Lisa Simpson's mother is Marge Simpson,Is $o $s's parent?,Is Marge Simpson Lisa Simpson's parent?,TRUE,TRUE,Who are the parents of $s?,Who are the parents of Lisa Simpson?,$o,Marge Simpson,How many parents does $s have?,How many parents does Lisa Simpson have?,$o,Marge Simpson,Which parent doesn't have many children?,Which parent doesn't have many children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Which parent has the highest number of children?,Which parent has the highest number of children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,, 5 | $o is $s's mum,Marge Simpson is Lisa Simpson's mum,Is $o $s's mother?,Is Marge Simpson Lisa Simpson's mother?,TRUE,TRUE,Who is the mother of $s?,Who is the mother of Lisa Simpson?,$o,Marge Simpson,How many people are children of $o?,How many people are children of Marge Simpson?,$s,Lisa Simpson,Who has the fewest children?,Who has the fewest children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Who has the most children?,Who has the most children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,, 6 | $o is $s's mom,Marge Simpson is Lisa Simpson's mom,Is $o $s's mom?,Is Marge Simpson Lisa Simpson's mom?,TRUE,TRUE,Who are $o's children?,Who are Marge Simpson's children?,$s,Lisa Simpson,,,,,Who has the least kids?,Who has the least kids?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Who has the most kids?,Who has the most kids?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,, 7 | $o is a child of $s,Marge Simpson is a child of Lisa Simpson,Is $o $s's mum?,Is Marge Simpson Lisa Simpson's mum?,TRUE,TRUE,Who is $s's mom?,Who is Lisa Simpson's mom?,,,,,,,Who has the least children?,Who has the least children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,,,,, 8 | $s is a parent of $o,Lisa Simpson is a parent of Marge Simpson,Is $o the mother of $s?,Is Marge Simpson the mother of Lisa Simpson?,TRUE,TRUE,Who is $s's mum?,Who is Lisa Simpson's mum?,,,,,,,,,,,,,,,,,,,,,, 9 | $o is $s's mother,Marge Simpson is Lisa Simpson's mother,Is $o a parent of $s?,Is Marge Simpson a parent of Lisa Simpson?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 10 | ,,Is $o $s's parent?,Is Marge Simpson Lisa Simpson's parent?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 11 | ,,Is $o $s's mother?,Is Marge Simpson Lisa Simpson's mother?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 12 | ,,Is $o $s's mom?,Is Marge Simpson Lisa Simpson's mom?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 13 | ,,Is $o $s's mum?,Is Marge Simpson Lisa Simpson's mum?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, 14 | ,,Is $o the mother of $s?,Is Marge Simpson the mother of Lisa Simpson?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P26.csv: -------------------------------------------------------------------------------- 1 | 26,https://www.wikidata.org/wiki/Property:P26,$s has spouse $o (symmetric),,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s has spouse $o,John has spouse Mary,Is $s partner of $o?,Is John partner of Mary?,TRUE,TRUE,Who are $s's spouses?,Who are John's spouses?,$o,Mary,How many people did $s marry?,How many people did John marry?,$o,Mary,,,,,,,,,,,,,,,, 4 | $s is $o's spouse,John is Mary's spouse,Is $s $o's partner?,Is John Mary's partner?,TRUE,TRUE,Who is $s's spouse?,Who is John's spouse?,$o,Mary,How many partners has $s had?,How many partners has John had?,$o,Mary,,,,,,,,,,,,,,,, 5 | $s is married to $o,John is married to Mary,Is $s $o's spouse?,Is John Mary's spouse?,TRUE,TRUE,List the spouses of $s,List the spouses of John,$o,Mary,How many partners does $s have?,How many partners does John have?,$o,Mary,,,,,,,,,,,,,,,, 6 | $s is $o's partner,John is Mary's partner,,,,,Who is the spouse of $s?,Who is the spouse of John?,$o,Mary,,,,,,,,,,,,,,,,,,,, 7 | ,,,,,,Who is $s married to?,Who is John married to?,$o,Mary,,,,,,,,,,,,,,,,,,,, 8 | ,,,,,,Who married $s?,Who married John?,$o,Mary,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,List the partners of $s,List the partners of John,$o,Mary,,,,,,,,,,,,,,,,,,,, 10 | ,,,,,,List who $s is married to,List who John is married to,$o,Mary,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P27.csv: -------------------------------------------------------------------------------- 1 | 27,https://www.wikidata.org/wiki/Property:P27,$s is a national of $o.,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $o recognizes $s as its citizen.,First French Empire recognizes Napoleon as its citizen.,Is $s a citizen of $o?,Is Napoleon a citizen of First French Empire?,TRUE,TRUE,What are the nationalities of $s?,What are the nationalities of Napoleon?,$o,First French Empire,How many nationalities does $s have?,How many nationalities does Napoleon have?,$o,First French Empire,Which country has the fewest number of citizens?,Which country has the fewest number of citizens?,$o [SEP] $s,First French Empire [SEP] Napoleon,Which country has the largest number of citizens?,Which country has the largest number of citizens?,$o [SEP] $s,First French Empire [SEP] Napoleon,,,,,,,, 4 | $s is a citizen of $o,Napoleon is a citizen of First French Empire,Does $s have the nationality of $o?,Does Napoleon have the nationality of First French Empire?,TRUE,TRUE,Who are the nationals of $o?,Who are the nationals of First French Empire?,$s,Napoleon,How many nationals does $o have?,How many nationals does First French Empire have?,$s,Napoleon,Which country has the lowest number of nationals?,Which country has the lowest number of nationals?,$o [SEP] $s,First French Empire [SEP] Napoleon,Which country has the highest number of nationals?,Which country has the highest number of nationals?,$o [SEP] $s,First French Empire [SEP] Napoleon,,,,,,,, 5 | $s has the citizenship of $o,Napoleon has the citizenship of First French Empire,,,,,Who are the citizens of $o?,Who are the citizens of First French Empire?,$s,Napoleon,How many citizens does $o have?,How many citizens does First French Empire have?,$s,Napoleon,,,,,,,,,,,,,,,, 6 | $s has the nationality of $o,Napoleon has the nationality of First French Empire,,,,,Who has citizenship of $o?,Who has citizenship of First French Empire?,$s,Napoleon,How many inhabitants are there in $o?,How many inhabitants are there in First French Empire?,$s,Napoleon,,,,,,,,,,,,,,,, 7 | ,,,,,,What is the nationality of $s?,What is the nationality of Napoleon?,$o,First French Empire,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P35.csv: -------------------------------------------------------------------------------- 1 | 35,https://www.wikidata.org/wiki/Property:P35,$o is the head of state of $s,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $o is head of state of $s.,Queen Elizabeth II is head of state of United Kingdom.,Is $o head of state of $s?,Is Queen Elizabeth II head of state of United Kingdom?,TRUE,TRUE,List the heads of state of $s,List the heads of state of United Kingdom,$o,Queen Elizabeth II,How many countries is $o head of state of?,How many countries is Queen Elizabeth II head of state of?,$s,United Kingdom,Which country has the fewest leaders?,Which country has the fewest leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,Which country has the most the leaders?,Which country has the most the leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,,,,,,,, 4 | $o is the official with the highest formal authority in $s.,Queen Elizabeth II is the official with the highest formal authority in United Kingdom.,Is $s's head of state $o?,Is United Kingdom's head of state Queen Elizabeth II?,TRUE,TRUE,Who's in charge of $s?,Who's in charge of United Kingdom?,$o,Queen Elizabeth II,How many countries does $o rule?,How many countries does Queen Elizabeth II rule?,$s,United Kingdom,Which country has had the fewest leaders?,Which country has had the fewest leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,Which country has had the most leaders?,Which country has had the most leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,,,,,,,, 5 | ,,Is $o the official with the highest formal authority in $s?,Is Queen Elizabeth II the official with the highest formal authority in United Kingdom?,TRUE,TRUE,Who's the leader of $s?,Who's the leader of United Kingdom?,$o,Queen Elizabeth II,How many countries is $o the ruler of?,How many countries is Queen Elizabeth II the ruler of?,$s,United Kingdom,Who rules over the fewest countries?,Who rules over the fewest countries?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,Who rules over the most countries?,Who rules over the most countries?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,,,,,,,, 6 | ,,Is the official with the highest formal authority in $s $o?,Is the official with the highest formal authority in United Kingdom Queen Elizabeth II?,TRUE,TRUE,List the leaders of $s?,List the leaders of United Kingdom?,$o,Queen Elizabeth II,How many rulers does $s have?,How many rulers does United Kingdom have?,$o,Queen Elizabeth II,Who is head of state of the fewest places?,Who is head of state of the fewest places?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,Who is head of state of the most places?,Who is head of state of the most places?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,,,,,,,, 7 | ,,,,,,Where is $o the ruler?,Where is Queen Elizabeth II the ruler?,$s,United Kingdom,How many heads of states are in charge of $s?,How many heads of states are in charge of United Kingdom?,$o,Queen Elizabeth II,,,,,,,,,,,,,,,, 8 | ,,,,,,Who rules over $s?,Who rules over United Kingdom?,$o,Queen Elizabeth II,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,Where is $o the head of state?,Where is Queen Elizabeth II the head of state?,$s,United Kingdom,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P38.csv: -------------------------------------------------------------------------------- 1 | 38,https://www.wikidata.org/wiki/Property:P38,$s accepts currency $o,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s accepts $o,People's Republic of China accepts renminbi,Does $s accept $o?,Does People's Republic of China accept renminbi?,TRUE,TRUE,Which places accept $o?,Which places accept renminbi?,$s,People's Republic of China,How many currencies does $s accept?,How many currencies does People's Republic of China accept?,$o,renminbi,What is the least used currency?,What is the least used currency?,$o [SEP] $s,renminbi [SEP] People's Republic of China,What is the most used currency?,What is the most used currency?,$o [SEP] $s,renminbi [SEP] People's Republic of China,,,,,,,, 4 | $s uses the $o,People's Republic of China uses the renminbi,Is $o accepted in $s?,Is renminbi accepted in People's Republic of China?,TRUE,TRUE,What currencies are accepted by $s?,What currencies are accepted by People's Republic of China?,$o,renminbi,How many places accept $o?,How many places accept renminbi?,$s,People's Republic of China,What country uses the fewest currencies?,What country uses the fewest currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,What country has the most currencies?,What country has the most currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,,,,,,,, 5 | The currency of $s is $o,The currency of People's Republic of China is renminbi,Is $o used in $s?,Is renminbi used in People's Republic of China?,TRUE,TRUE,Which currencies are used by $s?,Which currencies are used by People's Republic of China?,$o,renminbi,How many countries accept $o?,How many countries accept renminbi?,$s,People's Republic of China,What places accept the fewest different currencies?,What places accept the fewest different currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,What places accept the most currencies?,What places accept the most currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,,,,,,,, 6 | The official currency of $s is $o,The official currency of People's Republic of China is renminbi,Is $o used by $s?,Is renminbi used by People's Republic of China?,TRUE,TRUE,What currency is accepted by $s?,What currency is accepted by People's Republic of China?,$o,renminbi,$o is a currency of how many places?,renminbi is a currency of how many places?,$s,People's Republic of China,,,,,,,,,,,,,,,, 7 | The accepted currency of $s is $o,The accepted currency of People's Republic of China is renminbi,Is the $o currency accepted by $s?,Is the renminbi currency accepted by People's Republic of China?,TRUE,TRUE,What currency is used in $s?,What currency is used in People's Republic of China?,$o,renminbi,,,,,,,,,,,,,,,,,,,, 8 | ,,,,,,What currency is used by $s?,What currency is used by People's Republic of China?,$o,renminbi,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,Where is $o used?,Where is renminbi used?,$s,People's Republic of China,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P47.csv: -------------------------------------------------------------------------------- 1 | 47,https://www.wikidata.org/wiki/Property:P47,$s shares border with $o.,NOTES: This is a symmetric relation. Only generate relations with one direction! The training data generation script will generate the reverse relation,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s is neighbour with $o.,France is neighbour with Spain.,Are $s and $o neighbours?,Are France and Spain neighbours?,TRUE,TRUE,Which places neighbour $s?,Which places neighbour France?,$o,Spain,How many neighbours does $s have?,How many neighbours does France have?,$o,Spain,Which place has the lowest number of neighbours?,Which place has the lowest number of neighbours?,$s [SEP] $o,France [SEP] Spain,Which place has the highest number of neighbours?,Which place has the highest number of neighbours?,$s [SEP] $o,France [SEP] Spain,,,,,,,, 4 | $s is a neighbour of $o.,France is a neighbour of Spain.,Does $s share a border with $o?,Does France share a border with Spain?,TRUE,TRUE,Which places share a border with $s?,Which places share a border with France?,$o,Spain,How many places share border with $s?,How many places share border with France?,$o,Spain,What is the place with the minimum number of borders?,What is the place with the minimum number of borders?,$s [SEP] $o,France [SEP] Spain,What is the place with the maximum number of borders?,What is the place with the maximum number of borders?,$s [SEP] $o,France [SEP] Spain,,,,,,,, 5 | $s and $o are neighbours.,France and Spain are neighbours.,Do $s and $o share borders?,Do France and Spain share borders?,TRUE,TRUE,What places are next to $s?,What places are next to France?,$o,Spain,How many places border $s?,How many places border France?,$o,Spain,,,,,,,,,,,,,,,, 6 | $s and $o share borders with each other.,France and Spain share borders with each other.,Do $o and $s have common borders?,Do Spain and France have common borders?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,, 7 | $s shares a border with $o.,France shares a border with Spain.,,,,,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P50.csv: -------------------------------------------------------------------------------- 1 | 50,https://www.wikidata.org/wiki/Property:P50,$o author of $s,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $o is the author of $s,Jane Austen is the author of Pride and Prejudice,Did $o write $s?,Did Jane Austen write Pride and Prejudice?,TRUE,TRUE,What are the books written by $o?,What are the books written by Jane Austen?,$s,Pride and Prejudice,How many authors does $s have?,How many authors does Pride and Prejudice have?,$o,Jane Austen,Which book has the fewest number of authors?,Which book has the fewest number of authors?,$s [SEP] $o,Pride and Prejudice [SEP] Jane Austen,Which book has the highest number of authors?,Which book has the highest number of authors?,$s [SEP] $o,Pride and Prejudice [SEP] Jane Austen,,,,,,,, 4 | $o wrote $s,Jane Austen wrote Pride and Prejudice,Was $s written by $o?,Was Pride and Prejudice written by Jane Austen?,TRUE,TRUE,List all the books that $o is the author of.,List all the books that Jane Austen is the author of.,$s,Pride and Prejudice,How many books did $s write?,How many books did Pride and Prejudice write?,$s,Pride and Prejudice,Which author has written the least number of books?,Which author has written the least number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,Which author has written the largest number of books?,Which author has written the largest number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,,,,,,,, 5 | $s was written by $o,Pride and Prejudice was written by Jane Austen,Is $o the author of $s?,Is Jane Austen the author of Pride and Prejudice?,TRUE,TRUE,List all the authors of $s.,List all the authors of Pride and Prejudice.,$o.,Jane Austen.,How many people wrote $s?,How many people wrote Pride and Prejudice?,$o,Jane Austen,Who is author with the fewest number of books?,Who is author with the fewest number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,Who is author with the maximum number of books?,Who is author with the maximum number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,,,,,,,, 6 | $s's author is $o,Pride and Prejudice's author is Jane Austen,,,,,Who are writers of $s?,Who are writers of Pride and Prejudice?,$o.,Jane Austen.,How many books is $o the author of?,How many books is Jane Austen the author of?,$s,Pride and Prejudice,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P54.csv: -------------------------------------------------------------------------------- 1 | 54,https://www.wikidata.org/wiki/Property:P54,$s is a member of team $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s is a member of $o,Diego Maradona is a member of FC Barcelona,Does $s play for $o?,Does Diego Maradona play for FC Barcelona?,TRUE,TRUE,Who plays for $o?,Who plays for FC Barcelona?,$s,Diego Maradona,How many people play for $o?,How many people play for FC Barcelona?,$s,Diego Maradona,Which team has the least players?,Which team has the least players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Which team has the most players?,Which team has the most players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,, 4 | $s is a part of $o,Diego Maradona is a part of FC Barcelona,Is $s a member of $o?,Is Diego Maradona a member of FC Barcelona?,TRUE,TRUE,Who is a member of $o?,Who is a member of FC Barcelona?,$s,Diego Maradona,How many teams has $s played for?,How many teams has Diego Maradona played for?,$o,FC Barcelona,Which team has the fewest players?,Which team has the fewest players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Which team has the highest number of players?,Which team has the highest number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,, 5 | $s plays for $o,Diego Maradona plays for FC Barcelona,Is $s part of $o?,Is Diego Maradona part of FC Barcelona?,TRUE,TRUE,$s is a member of which teams?,Diego Maradona is a member of which teams?,$o,FC Barcelona,,,,,What is the team with the fewest players?,What is the team with the fewest players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,What is the largest team?,What is the largest team?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,, 6 | $o's membership includes $s,FC Barcelona's membership includes Diego Maradona,,,,,List teams $s plays for,List teams Diego Maradona plays for,$o,FC Barcelona,,,,,What is the smallest team?,What is the smallest team?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,What is the team with the most players?,What is the team with the most players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,, 7 | ,,,,,,,,,,,,,,Who has played for the fewest teams?,Who has played for the fewest teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,What is the team with the most number of players?,What is the team with the most number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,, 8 | ,,,,,,,,,,,,,,Who has played for the least number of different teams?,Who has played for the least number of different teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,Who has played for the most teams?,Who has played for the most teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,, 9 | ,,,,,,,,,,,,,,Who has played for the least teams?,Who has played for the least teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,Who has played for the highest number of teams?,Who has played for the highest number of teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,, 10 | ,,,,,,,,,,,,,,Which team has the lowest number of players?,Which team has the lowest number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Who has played for the most number of teams?,Who has played for the most number of teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P57.csv: -------------------------------------------------------------------------------- 1 | 57,https://www.wikidata.org/wiki/Property:P57,$o is the director of $s,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s is directed by $o,Titanic is directed by James Cameron,Was $o the director of $s?,Was James Cameron the director of Titanic?,TRUE,TRUE,Which movies has $o directed?,Which movies has James Cameron directed?,$s,Titanic,How many movies has $o directed?,How many movies has James Cameron directed?,$s,Titanic,Which director has directed minimum number of movies?,Which director has directed minimum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,Which director has directed maximum number of movies?,Which director has directed maximum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,, 4 | $s's director is $o,Titanic's director is James Cameron,Was $s directed by $o?,Was Titanic directed by James Cameron?,TRUE,TRUE,Which movies have been directed by $o?,Which movies have been directed by James Cameron?,$s,Titanic,How many directors does $s have?,How many directors does Titanic have?,$o,James Cameron,Who has directed minimum number of movies?,Who has directed minimum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has directed maximum number of movies?,Who has directed maximum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,, 5 | $o directed $s,James Cameron directed Titanic,Is $o director of $s?,Is James Cameron director of Titanic?,TRUE,TRUE,Who are the directors of $s?,Who are the directors of Titanic?,$o,James Cameron,,,,,,,,,,,,,,,,,,,, 6 | $o is the director of $s,James Cameron is the director of Titanic,Did $o direct $s?,Did James Cameron direct Titanic?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P58.csv: -------------------------------------------------------------------------------- 1 | 58,https://www.wikidata.org/wiki/Property:P58,$s screenwriter was $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s was written by $o,Titanic was written by James Cameron,Did $o write the script for $s?,Did James Cameron write the script for Titanic?,TRUE,TRUE,Who wrote the script for $s?,Who wrote the script for Titanic?,$o,James Cameron,How many movies has $o written?,How many movies has James Cameron written?,$s,Titanic,Who has the written the fewest movie scripts?,Who has the written the fewest movie scripts?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has written the most movie scripts?,Who has written the most movie scripts?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,, 4 | $o wrote the script of $s,James Cameron wrote the script of Titanic,Did $o write $s?,Did James Cameron write Titanic?,TRUE,TRUE,The script for $s is written by who?,The script for Titanic is written by who?,$o,James Cameron,How many movies was $o involved in?,How many movies was James Cameron involved in?,$s,Titanic,Who has written the fewest screenplays?,Who has written the fewest screenplays?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has written the most screenplays?,Who has written the most screenplays?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,, 5 | The screenwriter for $s was $o,The screenwriter for Titanic was James Cameron,Did $o write the screenplay for $s?,Did James Cameron write the screenplay for Titanic?,TRUE,TRUE,Who is the screenwriter for $s?,Who is the screenwriter for Titanic?,$o,James Cameron,How many movies has $o written the screenplay for?,How many movies has James Cameron written the screenplay for?,$s,Titanic,Which movie has had the fewest writers,Which movie has had the fewest writers,$s [SEP] $o,Titanic [SEP] James Cameron,Which movie has had the most writers?,Which movie has had the most writers?,$s [SEP] $o,Titanic [SEP] James Cameron,,,,,,,, 6 | ,,Was $o involved in the making of $s?,Was James Cameron involved in the making of Titanic?,TRUE,TRUE,Who wrote the screenplay for $s?,Who wrote the screenplay for Titanic?,$o,James Cameron,How many writers were involved in $s?,How many writers were involved in Titanic?,$o,James Cameron,,,,,,,,,,,,,,,, 7 | ,,,,,,What films has $o written?,What films has James Cameron written?,$s,Titanic,,,,,,,,,,,,,,,,,,,, 8 | ,,,,,,$o has written which screenplays?,James Cameron has written which screenplays?,$s,Titanic,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,Which movies has $o written?,Which movies has James Cameron written?,$s,Titanic,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P6.csv: -------------------------------------------------------------------------------- 1 | 6,https://www.wikidata.org/wiki/Property:P6,"$s , head of government, $o ",,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | The head of government of $s is $o,The head of government of United States of America is Donald Trump,Is the leader of $s $o?,Is the leader of United States of America Donald Trump?,TRUE,TRUE,$o is the leader of which places?,Donald Trump is the leader of which places?,$s,United States of America,How many leaders does $s have?,How many leaders does United States of America have?,$o,Donald Trump,Which place has the fewest leaders?,Which place has the fewest leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,Which place has the most leaders?,Which place has the most leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,,,,,,,, 4 | The leader of $s is $o,The leader of United States of America is Donald Trump,Is $o the leader of $s?,Is Donald Trump the leader of United States of America?,TRUE,TRUE,$o is the leader of which governments?,Donald Trump is the leader of which governments?,$s,United States of America,$o is the leader of how many places?,Donald Trump is the leader of how many places?,$s,United States of America,What is the country with the most number of leaders?,What is the country with the most number of leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,What country has the most number of leaders?,What country has the most number of leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,,,,,,,, 5 | $o is the leader of $s,Donald Trump is the leader of United States of America,Is $s governed by $o?,Is United States of America governed by Donald Trump?,TRUE,TRUE,Where is $o the head of government?,Where is Donald Trump the head of government?,$s,United States of America,How many heads of government preside in $s?,How many heads of government preside in United States of America?,$o,Donald Trump,,,,,,,,,,,,,,,, 6 | $o is the head of government of $s,Donald Trump is the head of government of United States of America,Is $o a head of government of $s?,Is Donald Trump a head of government of United States of America?,TRUE,TRUE,Who governs $s?,Who governs United States of America?,$o,Donald Trump,How many country leaders are there?,How many country leaders are there?,$o,Donald Trump,,,,,,,,,,,,,,,, 7 | $s's government is led by $o.,United States of America's government is led by Donald Trump.,Is $s's government led by $o?,Is United States of America's government led by Donald Trump?,TRUE,TRUE,Who is a governer of $s?,Who is a governer of United States of America?,$o,Donald Trump,How many heads of government are there?,How many heads of government are there?,$o,Donald Trump,,,,,,,,,,,,,,,, 8 | ,,,,,,Who is a head of government of $s?,Who is a head of government of United States of America?,$o,Donald Trump,,,,,,,,,,,,,,,,,,,, 9 | ,,,,,,Which country is $o the leader of?,Which country is Donald Trump the leader of?,$s,United States of America,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/configs/for_v1.5/NDB Relation Templates - P61.csv: -------------------------------------------------------------------------------- 1 | 61,https://www.wikidata.org/wiki/Property:P61,$s was discovered or invented by $o ,,,,,,,,,,,,,,,,,,,,,,,,,,, 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output 3 | $s was discovered by $o,Uranus was discovered by William Herschel,Did $o discover $s?,Did William Herschel discover Uranus?,TRUE,TRUE,Who discovered $s?,Who discovered Uranus?,$o,William Herschel,How many things did $o discover?,How many things did William Herschel discover?,$s,Uranus,Who discovered the fewest things?,Who discovered the fewest things?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has discovered the most things?,Who has discovered the most things?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,, 4 | $s was invented by $o,Uranus was invented by William Herschel,Did $o invent $s?,Did William Herschel invent Uranus?,TRUE,TRUE,Who invented $s?,Who invented Uranus?,$o,William Herschel,How many people discovered $s?,How many people discovered Uranus?,$o,William Herschel,What has had the fewest inventors?,What has had the fewest inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,What has had the most inventors?,What has had the most inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,, 5 | $o invented $s,William Herschel invented Uranus,Was $s discovered by $o?,Was Uranus discovered by William Herschel?,TRUE,TRUE,Who was the discoverer of $s?,Who was the discoverer of Uranus?,$o,William Herschel,How many people invented $s?,How many people invented Uranus?,$o,William Herschel,What item has had the fewest inventors?,What item has had the fewest inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,What item has had the most inventors?,What item has had the most inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,, 6 | $o discovered $s,William Herschel discovered Uranus,Was $s invented by $o?,Was Uranus invented by William Herschel?,TRUE,TRUE,Who was the inventor of $s?,Who was the inventor of Uranus?,$o,William Herschel,How many things did $o invent?,How many things did William Herschel invent?,$s,Uranus,What has had the least discoverers?,What has had the least discoverers?,$s [SEP] $o,Uranus [SEP] William Herschel,What has had the most discoverers?,What has had the most discoverers?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,, 7 | $o was the inventor of $s,William Herschel was the inventor of Uranus,Was $o the inventor of $s?,Was William Herschel the inventor of Uranus?,TRUE,TRUE,What did $o discover?,What did William Herschel discover?,$s,Uranus,,,,,Who has the fewest inventions?,Who has the fewest inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has the most inventions?,Who has the most inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,, 8 | $o was the discoverer of $s,William Herschel was the discoverer of Uranus,Was $o the discoverer of $s?,Was William Herschel the discoverer of Uranus?,TRUE,TRUE,What did $o invent?,What did William Herschel invent?,$s,Uranus,,,,,Who has the least number of inventions?,Who has the least number of inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has had the most number of inventions?,Who has had the most number of inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,, 9 | ,,,,,,List things $o discovered,List things William Herschel discovered,$s,Uranus,,,,,Who has had the fewest discoveries?,Who has had the fewest discoveries?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has had the most discoveries?,Who has had the most discoveries?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,, 10 | ,,,,,,List things $o invented,List things William Herschel invented,$s,Uranus,,,,,,,,,,,,,,,,,,,, 11 | ,,,,,,List all things discovered,List all things discovered,$s,Uranus,,,,,,,,,,,,,,,,,,,, 12 | ,,,,,,List all discoveries,List all discoveries,$s,Uranus,,,,,,,,,,,,,,,,,,,, 13 | ,,,,,,List all inventions,List all inventions,$s,Uranus,,,,,,,,,,,,,,,,,,,, 14 | ,,,,,,What has been invented?,What has been invented?,$s,Uranus,,,,,,,,,,,,,,,,,,,, 15 | ,,,,,,What has been discovered?,What has been discovered?,$s,Uranus,,,,,,,,,,,,,,,,,,,, -------------------------------------------------------------------------------- /dataset-construction/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | pymongo 3 | numpy -------------------------------------------------------------------------------- /dataset-construction/scripts/initial_sample.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/25 --num_dbs_to_make 10000 --sample_rels 1 --sample_per_rel 16 --sample_extra 1 20 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/50 --num_dbs_to_make 5000 --sample_rels 1 --sample_per_rel 16 --sample_extra 2 21 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/100 --num_dbs_to_make 2500 --sample_rels 2 --sample_per_rel 16 --sample_extra 2 22 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/250 --num_dbs_to_make 1000 --sample_rels 2 --sample_per_rel 32 --sample_extra 3 23 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/500 --num_dbs_to_make 500 --sample_rels 2 --sample_per_rel 64 --sample_extra 4 24 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/1000 --num_dbs_to_make 250 --sample_rels 2 --sample_per_rel 128 --sample_extra 4 25 | -------------------------------------------------------------------------------- /dataset-construction/scripts/make_databases.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | size=$1 20 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_train work/newdbs/intermediate_train_${size}.jsonl --target-size ${size} 21 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_dev work/newdbs/intermediate_dev_${size}.jsonl --target-size ${size} 22 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_test work/newdbs/intermediate_test_${size}.jsonl --target-size ${size} -------------------------------------------------------------------------------- /dataset-construction/scripts/make_questions.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | size=$1 20 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_train_${size}.jsonl work/newdbs/final_train_${size}.jsonl 21 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_dev_${size}.jsonl work/newdbs/final_dev_${size}.jsonl 22 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_test_${size}.jsonl work/newdbs/final_test_${size}.jsonl 23 | -------------------------------------------------------------------------------- /dataset-construction/scripts/make_v2.4.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | mkdir -pv work/newdbs 20 | 21 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/25 --num_dbs_to_make 1000 --sample_rels 1 --sample_per_rel 16 --sample_extra 2 22 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_train work/newdbs/intermediate_train_25.jsonl --target-size 25 23 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_dev work/newdbs/intermediate_dev_25.jsonl --target-size 25 24 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_test work/newdbs/intermediate_test_25.jsonl --target-size 25 25 | 26 | 27 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/50 --num_dbs_to_make 500 --sample_rels 1 --sample_per_rel 16 --sample_extra 2 28 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_50.jsonl --target-size 50 29 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_50.jsonl --target-size 50 30 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_50.jsonl --target-size 50 31 | 32 | 33 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/100 --num_dbs_to_make 250 --sample_rels 2 --sample_per_rel 16 --sample_extra 2 34 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_100.jsonl --target-size 100 35 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_100.jsonl --target-size 100 36 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_100.jsonl --target-size 100 37 | 38 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/250 --num_dbs_to_make 100 --sample_rels 2 --sample_per_rel 32 --sample_extra 3 39 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 250 40 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 250 41 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 250 42 | 43 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/500 --num_dbs_to_make 50 --sample_rels 3 --sample_per_rel 40 --sample_extra 4 44 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 500 45 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 500 46 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 500 47 | 48 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/1000 --num_dbs_to_make 25 --sample_rels 3 --sample_per_rel 50 --sample_extra 4 49 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 1000 50 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 1000 51 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 1000 52 | 53 | 54 | 55 | bash scripts/make_databases.sh 50; 56 | bash scripts/make_databases.sh 100; 57 | bash scripts/make_databases.sh 250; 58 | bash scripts/make_databases.sh 500; 59 | bash scripts/make_databases.sh 1000; 60 | 61 | bash scripts/make_questions.sh 50; 62 | bash scripts/make_questions.sh 100; 63 | bash scripts/make_questions.sh 250; 64 | bash scripts/make_questions.sh 500; 65 | bash scripts/make_questions.sh 1000; -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/construction/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/construction/make_database_initial_cache.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | from collections import defaultdict 21 | 22 | import numpy as np 23 | from argparse import ArgumentParser 24 | from nltk import ngrams 25 | from nltk import word_tokenize 26 | from nltk.tokenize.treebank import TreebankWordDetokenizer 27 | from similarity.normalized_levenshtein import NormalizedLevenshtein 28 | from tqdm import tqdm 29 | 30 | from ndb_data.wikidata_common.wikidata import Wikidata 31 | 32 | detok = TreebankWordDetokenizer() 33 | 34 | 35 | def generate_hypotheses(instance): 36 | for s, r, o in instance["valid_hypotheses"]: 37 | if r not in final_templates: 38 | continue 39 | yield (s, r, o) 40 | 41 | 42 | def normalize_subject(subject_name, fact): 43 | if subject_name is None: 44 | return None 45 | 46 | skip = {"is", "a", "of", "between", "on", "in"} 47 | 48 | n = NormalizedLevenshtein() 49 | mixed_case_subject = not subject_name.islower() 50 | if mixed_case_subject and subject_name not in fact: 51 | toks = word_tokenize(fact) 52 | all_grams = [] 53 | for i in range(1, len(toks)): 54 | all_grams.extend(" ".join(a) for a in ngrams(toks, i) if a[0] not in skip) 55 | 56 | scores = [n.similarity(gram, subject_name) for gram in all_grams] 57 | best_post = int(np.argmax(scores)) 58 | 59 | original_subject_name = all_grams[best_post] 60 | if scores[best_post] < 0.5 or all_grams[best_post] == "name": 61 | return None 62 | 63 | fact = " ".join(toks) 64 | fact = fact.replace(original_subject_name, subject_name) 65 | fact = detok.detokenize(fact.split()).replace(" 's", "'s").replace(" ,", ",") 66 | 67 | if subject_name not in fact: 68 | return None 69 | 70 | assert subject_name in fact, f"Subject {subject_name} was not in {fact}" 71 | return fact 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = ArgumentParser() 76 | parser.add_argument("in_file") 77 | parser.add_argument("out_file") 78 | parser.add_argument("--estimated_size", type=int) 79 | args = parser.parse_args() 80 | 81 | wiki = Wikidata() 82 | loaded = [] 83 | by_subject = defaultdict(list) 84 | by_relation = defaultdict(list) 85 | by_object = defaultdict(list) 86 | 87 | with open("configs/generate_v1.5.json") as f: 88 | final_templates = json.load(f) 89 | 90 | # print(final_templates.keys()) 91 | with open(args.in_file) as f: 92 | for ix, line in enumerate(tqdm(f, total=args.estimated_size)): 93 | instance = json.loads(line) 94 | added_id = None 95 | 96 | # Check if it contains a relation we care about 97 | for s, r, o in generate_hypotheses(instance): 98 | if added_id is None: 99 | added_id = len(loaded) 100 | loaded.append(instance) 101 | 102 | # If it doesn't skip this claim 103 | if added_id is None: 104 | continue 105 | 106 | # Correct the claim 107 | fact = instance["candidate"].strip() 108 | for s, r, o in generate_hypotheses(instance): 109 | name = wiki.get_by_id_or_uri(s)["english_name"] 110 | 111 | if name is None: 112 | fact = None 113 | break 114 | 115 | fact = normalize_subject(name, fact) 116 | 117 | if fact is None: 118 | break 119 | 120 | if o.startswith("Q"): 121 | name = wiki.get_by_id_or_uri(o)["english_name"] 122 | if name is None: 123 | fact = None 124 | break 125 | 126 | fact = normalize_subject(name, fact) 127 | 128 | if fact is None: 129 | break 130 | 131 | instance["fact"] = fact 132 | if fact is None or "⁇" in fact: 133 | continue 134 | 135 | # Add the filtered relations to the dictionaries 136 | for s, r, o in generate_hypotheses(instance): 137 | by_subject[s].append(added_id) 138 | by_relation[r].append(added_id) 139 | by_object[o].append(added_id) 140 | 141 | with (open(args.out_file, "w+")) as f: 142 | json.dump( 143 | { 144 | "loaded": loaded, 145 | "by_subject": by_subject, 146 | "by_object": by_object, 147 | "by_relation": by_relation, 148 | }, 149 | f, 150 | ) 151 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/data_import/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/data_import/fix_sitelinks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from pymongo import UpdateOne 20 | from tqdm import tqdm 21 | 22 | from ndb_data.wikidata_common.wikidata import Wikidata 23 | 24 | 25 | def write_updates(batch_update): 26 | bulks = [] 27 | for k, v in batch_update: 28 | bulks.append(UpdateOne(k, v)) 29 | 30 | collection.bulk_write(bulks) 31 | 32 | 33 | if __name__ == "__main__": 34 | client = Wikidata() 35 | collection = client.collection 36 | 37 | batch_update = [] 38 | 39 | num_ops = 0 40 | tqdm_iter = tqdm( 41 | collection.find({}, {"_id": 1, "sitelinks": 1}), 42 | total=collection.estimated_document_count(), 43 | ) 44 | for i in tqdm_iter: 45 | if type(i["sitelinks"]) == dict: 46 | batch_update.append( 47 | ( 48 | {"_id": i["_id"]}, 49 | {"$set": {"sitelinks": list(i["sitelinks"].values())}}, 50 | ) 51 | ) 52 | 53 | if len(batch_update) > 10000: 54 | write_updates(batch_update) 55 | batch_update = [] 56 | num_ops += 1 57 | tqdm_iter.desc = f"Performed update {num_ops}" 58 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/data_import/kelm_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | from argparse import ArgumentParser 21 | from tqdm import tqdm 22 | 23 | from ndb_data.wikidata_common.kelm import KELMMongo 24 | 25 | if __name__ == "__main__": 26 | parser = ArgumentParser() 27 | parser.add_argument("kelm_file") 28 | args = parser.parse_args() 29 | 30 | client = KELMMongo() 31 | collection = client.collection 32 | 33 | batch = [] 34 | insert_count = 0 35 | with open(args.kelm_file) as f: 36 | _tqdm_iter = tqdm(enumerate(f)) 37 | 38 | for idx, line in _tqdm_iter: 39 | instance = json.loads(line) 40 | 41 | subjects = set() 42 | relations = set() 43 | for hypothesis in instance["valid_hypotheses"]: 44 | s, r, o = hypothesis 45 | if s.startswith("Q"): 46 | subjects.add(s) 47 | 48 | if o is not None and not isinstance(o, dict) and o.startswith("Q"): 49 | subjects.add(o) 50 | 51 | relations.add(r) 52 | 53 | instance["entities"] = list(subjects) 54 | instance["relations"] = list(relations) 55 | 56 | batch.append(instance) 57 | if len(batch) >= 5000: 58 | collection.insert_many(batch) 59 | batch = [] 60 | insert_count += 1 61 | 62 | _tqdm_iter.desc = f"Insert batch {insert_count}" 63 | 64 | collection.insert_many(batch) 65 | client.close() 66 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/data_import/wikidata_index.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import bz2 20 | import json 21 | from collections import defaultdict 22 | from json import JSONDecodeError 23 | 24 | import pydash 25 | from argparse import ArgumentParser 26 | 27 | from tqdm import tqdm 28 | 29 | from ndb_data.wikidata_common.wikidata import Wikidata 30 | 31 | 32 | def read_dump(wikidata_file): 33 | with bz2.open(wikidata_file, mode="rt") as f: 34 | f.read(2) 35 | for line in f: 36 | yield line.rstrip(",\n") 37 | 38 | 39 | def get_indexable(instance): 40 | wikidata_id = pydash.get(instance, "id") 41 | english_name = pydash.get(instance, "labels.en.value") 42 | 43 | claims = pydash.get(instance, "claims") 44 | 45 | properties = set() 46 | property_entity = defaultdict(list) 47 | for property, claims in claims.items(): 48 | properties.add(property) 49 | for claim in claims: 50 | property_entity[property].append( 51 | ( 52 | pydash.get(claim, "mainsnak.datavalue.value"), 53 | list(pydash.get(claim, "qualifiers").values()) 54 | if pydash.get(claim, "qualifiers") is not None 55 | else None, 56 | ) 57 | ) 58 | sitelinks = pydash.get(instance, "sitelinks") 59 | enwiki = pydash.get(instance, "sitelinks.enwiki.title") 60 | yield wikidata_id, english_name, sitelinks, enwiki, list(properties), dict( 61 | property_entity 62 | ) 63 | 64 | 65 | def index_dump(dump): 66 | for idx, line in enumerate(dump): 67 | try: 68 | yield from get_indexable(json.loads(line)) 69 | except JSONDecodeError as e: 70 | print(e) 71 | pass 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = ArgumentParser() 76 | parser.add_argument("wikidata_file") 77 | args = parser.parse_args() 78 | 79 | wiki = Wikidata() 80 | collection = wiki.collection 81 | 82 | insert_count = 0 83 | dump = read_dump(args.wikidata_file) 84 | batch = [] 85 | 86 | _tqdm_iter = tqdm(index_dump(dump), total=90e6) 87 | for w_id, e_name, sitelinks, enwiki, props, prop_dict in _tqdm_iter: 88 | batch.append( 89 | { 90 | "wikidata_id": w_id, 91 | "english_name": e_name, 92 | "english_wiki": enwiki, 93 | "property_types": props, 94 | "properties": prop_dict, 95 | "sitelinks": list(sitelinks.values()), 96 | } 97 | ) 98 | 99 | if len(batch) >= 5000: 100 | collection.insert_many(batch) 101 | batch = [] 102 | insert_count += 1 103 | 104 | _tqdm_iter.desc = f"Insert batch {insert_count}" 105 | 106 | print("last") 107 | collection.insert_many(batch) 108 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/dataset_statistics.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | from argparse import ArgumentParser 21 | from collections import Counter 22 | 23 | 24 | def merge_type(query_type): 25 | return query_type 26 | # .replace("arg", "") 27 | 28 | 29 | def get_bool_ans(answers): 30 | return "NULL" if not len(answers) else ("TRUE" if "TRUE" in answers else "FALSE") 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = ArgumentParser() 35 | parser.add_argument("in_file") 36 | args = parser.parse_args() 37 | 38 | type_counter = Counter() 39 | support_set_size_counter = Counter() 40 | bool_ans_counter = Counter() 41 | total_queries = 0 42 | total_dbs = 0 43 | with open(args.in_file) as f: 44 | 45 | for line in f: 46 | database = json.loads(line) 47 | for query in database["queries"]: 48 | 49 | support_set_size_counter[len(query["facts"])] += 1 50 | type_counter[merge_type(query["type"])] += 1 51 | 52 | if query["type"] == "bool": 53 | bool_ans_counter[get_bool_ans(query["answer"])] += 1 54 | total_queries += len(database["queries"]) 55 | total_dbs += 1 56 | 57 | for k, v in type_counter.items(): 58 | print(k, v) 59 | 60 | print() 61 | for i in range(0, 20): 62 | print(i, support_set_size_counter[i]) 63 | 64 | print(total_queries, total_dbs) 65 | print(bool_ans_counter) 66 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/generation/describe_db_facts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import glob 20 | import json 21 | import os 22 | 23 | from tqdm import tqdm 24 | from transformers import AutoTokenizer 25 | 26 | if __name__ == "__main__": 27 | tokenizer = AutoTokenizer.from_pretrained("t5-base") 28 | if os.path.exists("db_sizes.jsonl"): 29 | os.unlink("db_sizes.jsonl") 30 | 31 | for file in tqdm(glob.glob("dbs/*.jsonl")): 32 | with open(file) as f: 33 | sizes = [] 34 | for line in f: 35 | db = json.loads(line) 36 | sizes.append(sum(len(tokenizer.tokenize(fact)) for fact in db["facts"])) 37 | 38 | with open("db_sizes.jsonl", "a+") as f: 39 | f.write(json.dumps({"file": file, "sizes": sizes}) + "\n") 40 | print(sizes) 41 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/generation/describe_dbs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import numpy as np 21 | from collections import Counter 22 | 23 | 24 | def read_databases(file): 25 | with open(file) as f: 26 | for line in f: 27 | instance = json.loads(line) 28 | yield instance 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | null_answers = 0 34 | true_answers = 0 35 | zero_answers = 0 36 | other_answers = 0 37 | 38 | num_fact_used = [] 39 | answer_sizes = [] 40 | type_counter = Counter() 41 | for db in read_databases("generated_dbs.jsonl"): 42 | for query in db["queries"]: 43 | num_fact_used.append(len(query["facts"])) 44 | type_counter[query["type"]] += 1 45 | answer_sizes.append(len(query["answer"])) 46 | 47 | if None in query["answer"]: 48 | null_answers += 1 49 | elif len(query["answer"]) == 0: 50 | zero_answers += 1 51 | elif True in query["answer"]: 52 | true_answers += 1 53 | else: 54 | other_answers += 1 55 | print(np.mean(num_fact_used), np.mean(answer_sizes)) 56 | print(type_counter) 57 | print(true_answers, null_answers, zero_answers, other_answers) 58 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/generation/filter_db_facts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import glob 20 | import json 21 | import os 22 | from argparse import ArgumentParser 23 | 24 | from tqdm import tqdm 25 | from transformers import AutoTokenizer 26 | 27 | if __name__ == "__main__": 28 | tokenizer = AutoTokenizer.from_pretrained("t5-base") 29 | parser = ArgumentParser() 30 | parser.add_argument("in_dir") 31 | parser.add_argument("out_dir") 32 | args = parser.parse_args() 33 | 34 | os.makedirs(args.out_dir, exist_ok=True) 35 | 36 | for file in glob.glob(args.in_dir + "/*"): 37 | with open(file) as f, open( 38 | args.out_dir + "/" + os.path.basename(file), "w+" 39 | ) as of: 40 | sizes = [] 41 | for line in tqdm(f, desc=file): 42 | db = json.loads(line) 43 | tt = sum(len(tokenizer.tokenize(fact)) for fact in db["facts"]) 44 | if tt < 900: 45 | of.write(line) 46 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/generation/plot_db_sizes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | from operator import itemgetter 21 | 22 | import numpy as np 23 | import matplotlib.pyplot as plt 24 | 25 | if __name__ == "__main__": 26 | 27 | with open("db_sizes.jsonl") as f: 28 | plot = [] 29 | 30 | for line in f: 31 | db = json.loads(line) 32 | size = int(db["file"].split(".")[0].rsplit("_", maxsplit=1)[1]) 33 | l5, lq, med, uq, u95, highest = np.percentile( 34 | db["sizes"], (1, 25, 50, 75, 99, 100) 35 | ) 36 | 37 | plot.append((size, (l5, lq, med, uq, u95, highest))) 38 | 39 | plot.sort(key=itemgetter(0)) 40 | lowers5 = [p[0] for q, p in plot] 41 | lowers = [p[1] for q, p in plot] 42 | median = [p[2] for q, p in plot] 43 | uppers = [p[3] for q, p in plot] 44 | upper5 = [p[4] for q, p in plot] 45 | limit = [p[5] for q, p in plot] 46 | nums = [q for q, p in plot] 47 | 48 | plt.fill_between(nums, lowers, uppers, alpha=0.3, color="purple") 49 | plt.fill_between(nums, lowers5, upper5, alpha=0.3, color="blue") 50 | plt.plot(nums, median, color="blue") 51 | plt.plot(nums, limit, color="blue") 52 | plt.title("KELM Database size") 53 | plt.ylabel("Number of subword tokens (T5 tokenizer)") 54 | plt.xlabel("Number of facts") 55 | plt.hlines(1024, 0, 50) 56 | 57 | plt.legend(["Median", "Max", "25th Percentile", "99th percentile", "Budget"]) 58 | 59 | plt.show() 60 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/util/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/util/build_json.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import glob 20 | import csv 21 | import json 22 | from argparse import ArgumentParser 23 | from collections import defaultdict 24 | 25 | import re 26 | 27 | 28 | def read_csv(f): 29 | next(f) 30 | reader = csv.DictReader(f) 31 | 32 | templates = defaultdict(set) 33 | 34 | for template in reader: 35 | 36 | if len(template["fact"]): 37 | templates["fact"].add(template["fact"]) 38 | 39 | if len(template["bool"]): 40 | if template["bool_answer"].lower() in ["true", "t", "1", "yes", "y"]: 41 | templates["bool"].add((template["bool"], template["bool_answer"])) 42 | 43 | if len(template["set"]): 44 | templates["set"].add((template["set"], template["set_projection"])) 45 | 46 | if len(template["count"]): 47 | templates["count"].add((template["count"], template["count_projection"])) 48 | 49 | if len(template["min"]): 50 | templates["min"].add((template["min"], template["min_projection"])) 51 | 52 | if len(template["max"]): 53 | templates["max"].add((template["max"], template["max_projection"])) 54 | 55 | if len(template["argmin"]): 56 | templates["argmin"].add((template["argmin"], template["argmin_projection"])) 57 | 58 | if len(template["argmax"]): 59 | templates["argmax"].add((template["argmax"], template["argmax_projection"])) 60 | 61 | templates["_subject"] = "$s" 62 | templates["_object"] = "$o" 63 | 64 | return {k: list(v) if isinstance(v, set) else v for k, v in templates.items()} 65 | 66 | 67 | def swap_so(statement): 68 | return statement.replace("$s", "$tmp_s").replace("$o", "$s").replace("$tmp_s", "$o") 69 | 70 | 71 | def make_symmetric(k, templates): 72 | 73 | if not k.startswith("_"): 74 | out = [] 75 | out.extend(templates) 76 | out.extend([(swap_so(t[0]), swap_so(t[1])) for t in templates if len(t) == 2]) 77 | out.extend([swap_so(t) for t in templates if isinstance(t, str)]) 78 | return out 79 | else: 80 | return templates 81 | 82 | 83 | if __name__ == "__main__": 84 | print("Generate") 85 | parser = ArgumentParser() 86 | parser.add_argument("version") 87 | args = parser.parse_args() 88 | # Read all CSV files in dir 89 | files = glob.glob("configs/for_{}/*.csv".format(args.version)) 90 | print(files) 91 | 92 | all_templates = {} 93 | for file in files: 94 | match = re.match(r".*(P[0-9]+).*", file) 95 | 96 | if match is not None: 97 | name = match.group(1) 98 | 99 | with open(file) as f: 100 | template = read_csv(f) 101 | 102 | if name in {"P47", "P26"}: 103 | all_templates[name] = { 104 | prop: make_symmetric(prop, rules) 105 | for prop, rules in template.items() 106 | } 107 | else: 108 | all_templates[name] = template 109 | 110 | with open("configs/generate_{}.json".format(args.version), "w+") as of: 111 | json.dump(all_templates, of, indent=4) 112 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/wikidata_common/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/wikidata_common/common_mongo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import os 20 | from abc import ABC 21 | 22 | import pymongo 23 | 24 | 25 | class MongoDataSource(ABC): 26 | def __init__(self): 27 | user = os.getenv("MONGO_USER", "") 28 | password = os.getenv("MONGO_PASSWORD", "") 29 | host = os.getenv("MONGO_HOST", "localhost") 30 | port = os.getenv("MONGO_PORT", "27017") 31 | db = os.getenv("MONGO_DB", "wikidata") 32 | 33 | client = pymongo.MongoClient(f"mongodb://{user}:{password}@{host}:{port}") 34 | 35 | self.db = client[db] 36 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/wikidata_common/kelm.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource 20 | 21 | 22 | class KELMMongo(MongoDataSource): 23 | def __init__(self): 24 | super().__init__() 25 | self.collection = self.db["kelm"] 26 | 27 | def find_entity(self, entity): 28 | results = self.collection.find({"entities": entity}) 29 | return results 30 | 31 | def find_entity_rel(self, entity, rels): 32 | results = self.collection.find( 33 | {"entities": entity, "relations": {"$in": list(rels)}} 34 | ) 35 | return results 36 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/wikidata_common/wikidata.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource 20 | 21 | 22 | class Wikidata(MongoDataSource): 23 | def __init__(self): 24 | super().__init__() 25 | self.collection = self.db["wiki_graph"] 26 | 27 | def get_by_id_or_uri(self, unit_uri): 28 | return self.collection.find_one( 29 | {"wikidata_id": unit_uri.replace("http://www.wikidata.org/entity/", "")} 30 | ) 31 | 32 | def find_custom(self, search_key, search_toks): 33 | return self.collection.find({search_key: {"$in": search_toks}}) 34 | 35 | def find_matching_relation(self, relation): 36 | return self.collection.find({"propery_types": relation}) 37 | -------------------------------------------------------------------------------- /dataset-construction/src/ndb_data/wikidata_common/wikpedia.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource 20 | 21 | 22 | class Wikipedia(MongoDataSource): 23 | def __init__(self): 24 | super().__init__() 25 | self.collection = self.db["wiki_redirects"] 26 | 27 | def resolve_redirect(self, names): 28 | results = self.collection.find({"title": {"$in": names}}) 29 | new_search = [] 30 | for res in results: 31 | new_search.append(res["target"]) 32 | return new_search 33 | -------------------------------------------------------------------------------- /modelling/.gitignore: -------------------------------------------------------------------------------- 1 | private/ 2 | -------------------------------------------------------------------------------- /modelling/README.md: -------------------------------------------------------------------------------- 1 | # Neural database implementation for Database Reasoning over Text ACL paper 2 | 3 | ## Package Structure 4 | 5 | ``` 6 | resources/ # DB files go here 7 | 8 | src/ 9 | neuraldb/ # Python package containing NeuralDB project 10 | 11 | dataset/ # Components pertaining to reading and loading the DB files 12 | instance_generator/ # Generates model inputs from different DB formats 13 | 14 | evaluation/ # Helper functions that allow in-line evaluation of the models from the training script 15 | modelling/ # Extra models/trainers etc 16 | retriever/ # TF-IDF and DPR baselines 17 | util/ # Other utils 18 | 19 | tests/ # Unit tests for scorer 20 | ``` 21 | 22 | ## TF-IDF and DPR retrieval 23 | 24 | Baseline retrieval methods can be run first, collecting the data to be used by the downstream models 25 | 26 | ``` 27 | bash scripts/baselines/retrieve.sh dpr 28 | bash scripts/baselines/retrieve.sh tfidf 29 | ``` 30 | 31 | ## Replicating ACL experiments 32 | 33 | These will train and generate predictions for the v2.4 databases containing up to 25 facts. 34 | The scripts use task spooler to manager a queue of jobs. If you do not have this, remove `tsp` from the scripts. 35 | ``` 36 | export SEED=1 37 | bash scripts/experiments_ours.sh v2.4_25 38 | bash scripts/experiments_baseline.sh v2.4_25 39 | ``` 40 | 41 | The final scoring script would take the predictions generated by these scripts and evaluate them against the reference predictions. 42 | 43 | ``` 44 | python -m neuraldb.final_scoring 45 | ``` 46 | 47 | Graphs which plot the answer accuracy by DB size are generated from 48 | ``` 49 | python -m neuraldb.final_scoring_with_db_size 50 | ``` 51 | 52 | ### Larger databases 53 | There are a couple of variants of this scoring script to evaluate for larger databases (v2.4_50, v2.4_100, v2.4_250, v2.4_500 and v2.4_1000): 54 | This would involve running the models trained on 25 facts with larger databases. 55 | 56 | ``` 57 | bash scripts/ours/predict_spj_rand_sweep.sh 58 | python -m neuraldb.final_scoring_with_db_size_sweep 59 | ``` 60 | 61 | ### Fusion in decoder baseline 62 | 63 | Was performed using a modified version of the FiD code adapted from https://github.com/facebookresearch/FiD, the outputs of this can be converted to the NeuralDB format with 64 | 65 | ``` 66 | python -m neuraldb.convert_legacy_predictions 67 | ``` -------------------------------------------------------------------------------- /modelling/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.6.0 2 | torch 3 | datasets 4 | fever-drqa 5 | -------------------------------------------------------------------------------- /modelling/scripts/baselines/retrieve.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | dataset=$1 20 | retriever=$2 21 | 22 | export PYTHONPATH=src 23 | mkdir -pv resources/${dataset}_${retriever} 24 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/train.jsonl resources/${dataset}_${retriever}/train.jsonl 25 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/dev.jsonl resources/${dataset}_${retriever}/dev.jsonl 26 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/test.jsonl resources/${dataset}_${retriever}/test.jsonl -------------------------------------------------------------------------------- /modelling/scripts/baselines/train_longformer.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | export PYTHONPATH=src 20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache 21 | 22 | data=$1 23 | generator=$2 24 | lr=$3 25 | steps=${4:-1} 26 | seed=${SEED:-1} 27 | 28 | work_dir=work/${data}/model=longformer,generator=${generator},lr=${lr},steps=${steps}/seed-${seed} 29 | data_dir=resources/${data} 30 | 31 | python src/neuraldb/run.py \ 32 | --model_name_or_path allenai/led-base-16384 \ 33 | --learning_rate ${lr} \ 34 | --gradient_accumulation_steps ${steps} \ 35 | --output_dir ${work_dir} \ 36 | --train_file ${data_dir}/train.jsonl \ 37 | --validation_file ${data_dir}/dev.jsonl \ 38 | --instance_generator ${generator} \ 39 | --do_train \ 40 | --do_eval \ 41 | --num_train_epochs 3 \ 42 | --evaluation_strategy epoch \ 43 | --per_device_train_batch_size 8 \ 44 | --per_device_eval_batch_size 8 \ 45 | --predict_with_generate \ 46 | --save_total_limit 2 \ 47 | --seed ${seed} \ 48 | --save_steps 10000 49 | #--overwrite_output_dir \ 50 | 51 | rm -rfv ${work_dir}/checkpoint-* -------------------------------------------------------------------------------- /modelling/scripts/baselines/train_t5.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | export PYTHONPATH=src 20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache 21 | 22 | data=$1 23 | generator=$2 24 | lr=$3 25 | steps=${4:-1} 26 | seed=${SEED:-1} 27 | 28 | work_dir=work/${data}/model=t5,generator=${generator},lr=${lr},steps=${steps}/seed-${seed} 29 | data_dir=resources/${data} 30 | 31 | python src/neuraldb/run.py \ 32 | --model_name_or_path t5-base \ 33 | --learning_rate ${lr} \ 34 | --gradient_accumulation_steps ${steps} \ 35 | --output_dir ${work_dir} \ 36 | --train_file ${data_dir}/train.jsonl \ 37 | --validation_file ${data_dir}/dev.jsonl \ 38 | --instance_generator ${generator} \ 39 | --do_train \ 40 | --do_eval \ 41 | --num_train_epochs 3 \ 42 | --evaluation_strategy epoch \ 43 | --per_device_train_batch_size 32 \ 44 | --per_device_eval_batch_size 32 \ 45 | --predict_with_generate \ 46 | --save_total_limit 2 \ 47 | --seed ${seed} \ 48 | --save_steps 10000 49 | #--overwrite_output_dir \ 50 | 51 | rm -rfv ${work_dir}/checkpoint-* -------------------------------------------------------------------------------- /modelling/scripts/baselines/train_t5_retriever.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | export PYTHONPATH=src 20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache 21 | 22 | data=$1 23 | generator=$2 24 | retriever=$3 25 | lr=$4 26 | steps=${5:-1} 27 | seed=${SEED:-1} 28 | 29 | work_dir=work/${data}/model=t5,generator=${generator},retriever=${retriever},lr=${lr},steps=${steps}/seed-${seed} 30 | data_dir=resources/${data}_${retriever} 31 | 32 | python src/neuraldb/run.py \ 33 | --model_name_or_path t5-base \ 34 | --learning_rate ${lr} \ 35 | --gradient_accumulation_steps ${steps} \ 36 | --output_dir ${work_dir} \ 37 | --train_file ${data_dir}/train.jsonl \ 38 | --validation_file ${data_dir}/dev.jsonl \ 39 | --instance_generator ${generator} \ 40 | --do_train \ 41 | --do_eval \ 42 | --num_train_epochs 3 \ 43 | --evaluation_strategy epoch \ 44 | --per_device_train_batch_size 8 \ 45 | --per_device_eval_batch_size 8 \ 46 | --predict_with_generate \ 47 | --save_total_limit 2 \ 48 | --seed ${seed} \ 49 | --save_steps 10000 50 | #--overwrite_output_dir \ 51 | 52 | rm -rfv ${work_dir}/checkpoint-* -------------------------------------------------------------------------------- /modelling/scripts/convert_ssg_predictions.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | 20 | dataset=${1:-v2.4} 21 | 22 | function convert(){ 23 | size=$1 24 | 25 | echo "Convert ${dataset} ${size}" 26 | mkdir -pv resources/${dataset}_${size}_ssg 27 | python src/neuraldb/convert_ssg_predictions.py resources/ssg_predictions/${dataset}_${size}/test_0.8_st_ssg_sup.json resources/${dataset}_${size}_ssg/test.jsonl --master_file resources/${dataset}_${size}/test.jsonl 28 | } 29 | 30 | convert 25 31 | convert 50 32 | convert 100 33 | convert 250 34 | convert 500 35 | convert 1000 -------------------------------------------------------------------------------- /modelling/scripts/experiments_baselines.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | function do_predictions() { 20 | model_path=$1 21 | generator=$2 22 | predictions_path=$3 23 | tsp python src/neuraldb/run.py \ 24 | --model_name_or_path $model_path \ 25 | --output_dir $model_path \ 26 | --predictions_file $model_path/predictions.jsonl \ 27 | --do_predict --test_file resources/${predictions_path}/test.jsonl \ 28 | --instance_generator $generator \ 29 | --per_device_eval_batch_size 4 \ 30 | --predict_with_generate 31 | 32 | } 33 | 34 | dataset=${1:-v2.4_25} 35 | export seed=${SEED:-1} 36 | 37 | SEED=${seed} tsp bash scripts/baselines/train_t5.sh $dataset perfectir 1e-4 38 | SEED=${seed} tsp bash scripts/baselines/train_t5.sh $dataset wholedb 1e-4 39 | 40 | do_predictions work/${dataset}/model=t5,generator=perfectir,lr=1e-4,steps=1/seed-${seed} perfectir ${dataset} 41 | do_predictions work/${dataset}/model=t5,generator=wholedb,lr=1e-4,steps=1/seed-${seed} wholedb ${dataset} 42 | 43 | SEED=${seed} tsp bash scripts/baselines/train_longformer.sh $dataset perfectir 1e-4 44 | SEED=${seed} tsp bash scripts/baselines/train_longformer.sh $dataset wholedb 1e-4 45 | 46 | do_predictions work/${dataset}/model=longformer,generator=perfectir,lr=1e-4,steps=1/seed-${seed} perfectir ${dataset} 47 | do_predictions work/${dataset}/model=longformer,generator=wholedb,lr=1e-4,steps=1/seed-${seed} wholedb ${dataset} 48 | 49 | SEED=${seed} tsp bash scripts/baselines/train_t5_retriever.sh $dataset externalir dpr 1e-4 50 | SEED=${seed} tsp bash scripts/baselines/train_t5_retriever.sh $dataset externalir tfidf 1e-4 51 | 52 | do_predictions work/${dataset}/model=t5,generator=externalir,retriever=dpr,lr=1e-4,steps=1/seed-${seed} externalir ${dataset}_dpr 53 | do_predictions work/${dataset}/model=t5,generator=externalir,retriever=tfidf,lr=1e-4,steps=1/seed-${seed} externalir ${dataset}_tfidf 54 | -------------------------------------------------------------------------------- /modelling/scripts/experiments_ours.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | dataset=${1:-v2.4_25} 20 | export seed=${SEED:-1} 21 | 22 | SEED=${seed} tsp bash scripts/ours/train_spj.sh $dataset spj_rand 1e-4 23 | SEED=${seed} tsp bash scripts/ours/train_spj.sh $dataset spj 1e-4 24 | -------------------------------------------------------------------------------- /modelling/scripts/ours/predict_spj.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | dataset=v2.4_25 20 | 21 | function do_predictions_spj() { 22 | model_path=$1 23 | generator=spj 24 | tsp python src/neuraldb/run.py \ 25 | --model_name_or_path $model_path \ 26 | --output_dir $model_path \ 27 | --predictions_file $model_path/intermediate_predictions.jsonl \ 28 | --do_predict --test_file resources/${dataset}/test.jsonl \ 29 | --instance_generator $generator \ 30 | --per_device_eval_batch_size 64 \ 31 | --predict_with_generate 32 | 33 | tsp python src/neuraldb/convert_spj_to_predictions.py $model_path/intermediate_predictions.jsonl $model_path/predictions.jsonl 34 | } 35 | 36 | function do_predictions_ssg_spj() { 37 | model_path=$1 38 | out_path=$2 39 | generator=spj 40 | 41 | mkdir -pv $out_path 42 | tsp python src/neuraldb/run.py \ 43 | --model_name_or_path $model_path \ 44 | --output_dir $model_path \ 45 | --predictions_file $out_path/intermediate_predictions.jsonl \ 46 | --do_predict \ 47 | --test_file resources/${dataset}_ssg/test.jsonl \ 48 | --instance_generator $generator \ 49 | --per_device_eval_batch_size 64 \ 50 | --predict_with_generate 51 | 52 | tsp python src/neuraldb/convert_spj_to_predictions.py $out_path/intermediate_predictions.jsonl $out_path/predictions.jsonl --actual_file resources/${dataset}/test.jsonl 53 | } 54 | 55 | seed=${SEED:-1} 56 | do_predictions_spj work/${dataset}/model=t5,generator=spj,lr=1e-4,steps=1/seed-${seed} 57 | -------------------------------------------------------------------------------- /modelling/scripts/ours/predict_spj_rand_sweep.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | dataset=v2.4_25 20 | 21 | function do_predictions_ssg_spj() { 22 | dataset1=$1 23 | dataset2=$2 24 | model_path=$3 25 | generator=spj 26 | mkdir -pv work/${dataset2}/$model_path/ 27 | tsp python src/neuraldb/run.py \ 28 | --model_name_or_path work/${dataset1}/$model_path \ 29 | --output_dir work/${dataset1}/$model_path \ 30 | --predictions_file work/${dataset2}/$model_path/intermediate_predictions.jsonl \ 31 | --do_predict --test_file resources/${dataset2}_ssg/test.jsonl \ 32 | --instance_generator $generator \ 33 | --per_device_eval_batch_size 64 \ 34 | --predict_with_generate 35 | 36 | tsp python src/neuraldb/convert_spj_to_predictions.py work/${dataset2}/$model_path/intermediate_predictions.jsonl work/${dataset2}/$model_path/predictions.jsonl --actual_file resources/${dataset2}/test.jsonl 37 | } 38 | 39 | 40 | seed=${SEED:-1} 41 | do_predictions_ssg_spj v2.4_25 v2.4_25 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} 42 | do_predictions_ssg_spj v2.4_25 v2.4_50 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} 43 | do_predictions_ssg_spj v2.4_25 v2.4_100 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} 44 | do_predictions_ssg_spj v2.4_25 v2.4_250 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} 45 | do_predictions_ssg_spj v2.4_25 v2.4_500 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} 46 | do_predictions_ssg_spj v2.4_25 v2.4_1000 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed} -------------------------------------------------------------------------------- /modelling/scripts/ours/train_spj.sh: -------------------------------------------------------------------------------- 1 | ## 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | ## 4 | ## This file is part of NeuralDB. 5 | ## See https://github.com/facebookresearch/NeuralDB for further info. 6 | ## 7 | ## Licensed under the Apache License, Version 2.0 (the "License"); 8 | ## you may not use this file except in compliance with the License. 9 | ## You may obtain a copy of the License at 10 | ## 11 | ## http://www.apache.org/licenses/LICENSE-2.0 12 | ## 13 | ## Unless required by applicable law or agreed to in writing, software 14 | ## distributed under the License is distributed on an "AS IS" BASIS, 15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ## See the License for the specific language governing permissions and 17 | ## limitations under the License. 18 | ## 19 | export PYTHONPATH=src 20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache 21 | 22 | dataset=$1 23 | generator=$2 24 | lr=$3 25 | steps=${4:-1} 26 | seed=${SEED:-1} 27 | 28 | work_dir=work/${dataset}/model=t5,generator=${generator},lr=${lr},steps=${steps}/seed-${seed} 29 | data_dir=resources/${dataset} 30 | 31 | python src/neuraldb/run.py \ 32 | --model_name_or_path t5-base \ 33 | --learning_rate ${lr} \ 34 | --gradient_accumulation_steps ${steps} \ 35 | --output_dir ${work_dir} \ 36 | --train_file ${data_dir}/train.jsonl \ 37 | --validation_file ${data_dir}/dev.jsonl \ 38 | --instance_generator ${generator} \ 39 | --do_train \ 40 | --do_eval \ 41 | --num_train_epochs 3 \ 42 | --evaluation_strategy epoch \ 43 | --per_device_train_batch_size 8 \ 44 | --per_device_eval_batch_size 8 \ 45 | --predict_with_generate \ 46 | --save_total_limit 2 \ 47 | --seed ${seed} \ 48 | --save_steps 10000 49 | 50 | rm -rfv ${work_dir}/checkpoint-* -------------------------------------------------------------------------------- /modelling/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import shutil 20 | from pathlib import Path 21 | from setuptools import find_packages, setup 22 | 23 | stale_egg_info = Path(__file__).parent / "neuraldb.egg-info" 24 | if stale_egg_info.exists(): 25 | shutil.rmtree(stale_egg_info) 26 | 27 | setup( 28 | name="neuraldb", 29 | version="0.0.0", 30 | author="", 31 | author_email="jt719@cam.ac.uk", 32 | description="NeuralDB Baseline implementation", 33 | long_description=open("README.md", "r", encoding="utf-8").read(), 34 | long_description_content_type="text/markdown", 35 | keywords="NLP neuraldb neural database deep learning transformer", 36 | license="Apache", 37 | url="", 38 | package_dir={"": "src"}, 39 | packages=find_packages("src"), 40 | extras_require=[], 41 | entry_points={"console_scripts": [""]}, 42 | python_requires=">=3.6.0", 43 | install_requires=["transformers==4.6.0","torch","datasets","fever-drqa"], 44 | classifiers=[ 45 | "Programming Language :: Python :: 3.8", 46 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 47 | ], 48 | ) 49 | -------------------------------------------------------------------------------- /modelling/src/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/__pycache__/run.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/__pycache__/run.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/convert_legacy_predictions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | from argparse import ArgumentParser 21 | 22 | from functools import reduce 23 | 24 | if __name__ == "__main__": 25 | 26 | parser = ArgumentParser() 27 | parser.add_argument("in_file") 28 | parser.add_argument("out_file") 29 | args = parser.parse_args() 30 | with open(args.in_file) as f, open( 31 | args.out_file, 32 | "w+", 33 | ) as of: 34 | for line in f: 35 | results = json.loads(line) 36 | for prediction in results["test"]["raw"]: 37 | predicted, actual, ems, eml, meta = prediction 38 | 39 | print(meta) 40 | 41 | instance = { 42 | "prediction": predicted.split("[LIST]"), 43 | "actual": actual.split("[LIST]"), 44 | "metadata": { 45 | "dbsize": len( 46 | set(reduce(lambda a, b: a + b, meta["query"]["gold_facts"])) 47 | ) 48 | if len(meta["query"]["gold_facts"]) 49 | else 0, 50 | "type": meta["query"]["metadata"]["query_type"], 51 | }, 52 | } 53 | 54 | of.write(json.dumps(instance) + "\n") 55 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/convert_ssg_predictions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import logging 21 | from argparse import ArgumentParser 22 | from collections import defaultdict 23 | 24 | from neuraldb.util.log_helper import setup_logging 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | if __name__ == "__main__": 30 | setup_logging() 31 | 32 | parser = ArgumentParser() 33 | parser.add_argument("predictions_file") 34 | parser.add_argument("output_file") 35 | parser.add_argument("--master_file", required=True) 36 | args = parser.parse_args() 37 | questions_answers = defaultdict(list) 38 | 39 | use_predicted_type = True 40 | master_file = args.master_file # "resources/v2.1_25/test.jsonl" 41 | out_file = args.output_file # "resources/v2.1_25/dev_ssg_predictions.jsonl" 42 | predictions_file = ( 43 | args.predictions_file 44 | ) # "resources/v2.1_25/dev_0.76_st_ssg_sup.json" 45 | 46 | predicted_instances = {} 47 | with open(predictions_file) as f: 48 | predictions = json.load(f) 49 | for inst in predictions: 50 | predicted_instances[(inst["db_id"], inst["question_id"])] = inst 51 | 52 | with open(master_file) as f, open(out_file, "w+") as of: 53 | for db_idx, line in enumerate(f): 54 | database = json.loads(line) 55 | 56 | for q_idx, query in enumerate(database["queries"]): 57 | query["predicted_facts"] = [ 58 | [a[0] for a in b] 59 | for b in predicted_instances[(db_idx, q_idx)]["ssg_output"] 60 | ] 61 | 62 | of.write(json.dumps(database) + "\n") 63 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__pycache__/data_collator_seq2seq.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/data_collator_seq2seq.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__pycache__/neuraldb_file_reader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/neuraldb_file_reader.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__pycache__/neuraldb_parser.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/neuraldb_parser.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/__pycache__/seq2seq_dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/seq2seq_dataset.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__pycache__/instance_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/instance_generator.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__pycache__/perfectir_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/perfectir_generator.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__pycache__/spj_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/spj_generator.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/__pycache__/wholedb_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/wholedb_generator.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/externalir_generator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import itertools 20 | import logging 21 | 22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class ExternalIRGenerator(InstanceGenerator): 28 | def _process_query(self, query_obj, update_tokens): 29 | query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"]) 30 | answer_tokens = [ 31 | self.maybe_tokenize_answer(answer) for answer in query_obj["answer"] 32 | ] 33 | 34 | facts = set(itertools.chain(*query_obj["predicted_facts"])) 35 | context_tokens = [update_tokens[fact] for fact in facts][:10] 36 | 37 | yield self.maybe_decorate_with_metadata( 38 | { 39 | "query": query_tokens, 40 | "context": context_tokens, 41 | "output": self.concatenate_answer(answer_tokens), 42 | }, 43 | query_obj, 44 | ) 45 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/externalir_generator_maxtok.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import itertools 20 | import logging 21 | 22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class ExternalIRGeneratorMaxTok(InstanceGenerator): 28 | def _process_query(self, query_obj, update_tokens): 29 | query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"]) 30 | answer_tokens = [ 31 | self.maybe_tokenize_answer(answer) for answer in query_obj["answer"] 32 | ] 33 | 34 | facts = set(itertools.chain(*query_obj["predicted_facts"])) 35 | context_tokens = [] 36 | for fact in facts: 37 | context_tokens.append(update_tokens[fact]) 38 | if len(list(itertools.chain(*context_tokens))) > 900: 39 | break 40 | 41 | yield self.maybe_decorate_with_metadata( 42 | { 43 | "query": query_tokens, 44 | "context": context_tokens, 45 | "output": self.concatenate_answer(answer_tokens), 46 | }, 47 | query_obj, 48 | ) 49 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/perfectir_generator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import itertools 20 | import logging 21 | 22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class PerfectIRGenerator(InstanceGenerator): 28 | def _process_query(self, query_obj, update_tokens): 29 | query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"]) 30 | answer_tokens = [ 31 | self.maybe_tokenize_answer(answer) for answer in query_obj["answer"] 32 | ] 33 | 34 | facts = set(itertools.chain(*query_obj["facts"])) 35 | context_tokens = [update_tokens[fact] for fact in facts] 36 | 37 | yield self.maybe_decorate_with_metadata( 38 | { 39 | "query": query_tokens, 40 | "context": context_tokens, 41 | "output": self.concatenate_answer(answer_tokens), 42 | }, 43 | query_obj, 44 | ) 45 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/subsampler.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import random 20 | 21 | 22 | class Subsampler: 23 | 24 | # Take a list of sample types and probability of keeping 25 | def __init__(self, sample_types): 26 | self.sample_types = sample_types 27 | 28 | # 29 | def maybe_drop_sample(self, query): 30 | if query["type"] in self.sample_types: 31 | sample_rate = self.sample_types[query["type"]] 32 | rand = random.random() 33 | 34 | if isinstance(sample_rate, list): 35 | if not len(query["answer"]): 36 | sample_rate = sample_rate[2] 37 | else: 38 | if "TRUE" in query["answer"]: 39 | sample_rate = sample_rate[0] 40 | else: 41 | sample_rate = sample_rate[1] 42 | 43 | # Drop sample if needed 44 | return rand < sample_rate 45 | 46 | return False 47 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/instance_generator/wholedb_generator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import logging 20 | 21 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class WholeDBGenerator(InstanceGenerator): 27 | def _process_query(self, query_obj, update_tokens): 28 | query_tokens = self.tokenizer.tokenize(query_obj["query"]) 29 | answer_tokens = [ 30 | self.maybe_tokenize_answer(answer) for answer in query_obj["answer"] 31 | ] 32 | 33 | context_tokens = update_tokens[: query_obj["height"] + 1] 34 | 35 | yield self.maybe_decorate_with_metadata( 36 | { 37 | "query": query_tokens, 38 | "context": context_tokens, 39 | "output": self.concatenate_answer(answer_tokens), 40 | }, 41 | query_obj, 42 | ) 43 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/neuraldb_file_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import logging 21 | import os 22 | 23 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator 24 | from neuraldb.dataset.neuraldb_parser import NeuralDBParser 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class NeuralDBFileReader: 30 | def __init__(self, instance_generator: InstanceGenerator): 31 | self.database_reader = NeuralDBParser() 32 | self.instance_generator = instance_generator 33 | 34 | def read(self, file_path): 35 | logger.info("Reading instances from {}".format(file_path)) 36 | 37 | database_count = 0 38 | with open(file_path) as f: 39 | for idx, line in enumerate(f): 40 | database_count += 1 41 | database = json.loads(line) 42 | 43 | loaded_database = self.database_reader.load_instances(database) 44 | yield from self.instance_generator.generate( 45 | loaded_database, database_idx=idx 46 | ) 47 | 48 | if os.getenv("DEBUG", None) is not None and idx > 3: 49 | break 50 | 51 | logger.info("Dataset file contains {} databases".format(database_count)) 52 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/neuraldb_parser.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import random 20 | import logging 21 | from typing import List, Any, Dict 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class NeuralDBParser: 27 | def __init__(self, max_queries=None): 28 | self._max_queries = max_queries 29 | 30 | def load_instances(self, database: Dict[str, List[Any]]): 31 | return self._load_instances(database) 32 | 33 | def _load_instances(self, database: Dict[str, List[Any]]): 34 | logger.debug("Loading updates") 35 | updates = map(self._read_update, database["facts"]) 36 | 37 | logger.debug("Loading queries") 38 | queries = filter( 39 | lambda query: query is not None, 40 | map(self._read_query, self._maybe_sample(database["queries"])), 41 | ) 42 | 43 | return {"updates": updates, "queries": queries, "metadata": {}} 44 | 45 | def _maybe_sample(self, queries: List[Any]): 46 | if self._max_queries is not None: 47 | queries = random.sample(queries, min(len(queries), self._max_queries)) 48 | return queries 49 | 50 | def _read_update(self, update): 51 | return update 52 | 53 | def _read_query(self, query): 54 | answer, answer_type = self._process_answer(query["answer"]) 55 | query["answer"] = answer 56 | query["answer_type"] = answer_type 57 | return query 58 | 59 | # return { 60 | # "id": query["id"], 61 | # "height": query["height"], 62 | # "input": query["query"], 63 | # "output": answer, 64 | # "metadata": { 65 | # "answer_type": answer_type, 66 | # "original_instance": query 67 | # } 68 | # } 69 | 70 | def _process_answer(self, answer): 71 | return answer, None 72 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset/seq2seq_dataset.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from operator import itemgetter 20 | from torch.utils.data import Dataset 21 | from tqdm import tqdm 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | class Seq2SeqDataset(Dataset): 28 | def __init__(self, generator, auto_pad=None): 29 | self.generator = generator 30 | self.auto_pad = auto_pad 31 | 32 | if self.auto_pad: 33 | self.features = list( 34 | map( 35 | self.auto_pad, 36 | tqdm(generator, desc="Reading and padding instances"), 37 | ) 38 | ) 39 | else: 40 | self.features = list(tqdm(generator, desc="Reading instances")) 41 | 42 | def __len__(self): 43 | return len(self.features) 44 | 45 | def __getitem__(self, item): 46 | return self.features[item] 47 | 48 | def to_dict(self): 49 | assert len(self.features) 50 | keys = self.features[0].keys() 51 | return {key: list(map(itemgetter(key), self.features)) for key in keys} 52 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/dataset_statistics.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import logging 20 | import json 21 | from collections import defaultdict 22 | import random 23 | 24 | from datasets import tqdm 25 | 26 | from neuraldb.dataset.neuraldb_parser import NeuralDBParser 27 | from neuraldb.util.log_helper import setup_logging 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | def get_instances_from_file(file): 33 | parser = NeuralDBParser() 34 | 35 | with open(file) as f: 36 | for line in f: 37 | database = json.loads(line) 38 | yield from parser.load_instances(database)["queries"] 39 | 40 | 41 | def get_bool_breakdown(answers): 42 | if len(answers) == 0: 43 | return "NULL" 44 | 45 | answer_str = " ".join(answers) 46 | 47 | if "TRUE" in answer_str: 48 | return "TRUE" 49 | 50 | elif "FALSE" in answer_str: 51 | return "FALSE" 52 | 53 | assert False, "malformed" 54 | 55 | 56 | def get_file_stats(file, drop_argmax_chance=None): 57 | stats = defaultdict(lambda: defaultdict(int)) 58 | for instance in tqdm(get_instances_from_file(file)): 59 | if drop_argmax_chance and instance["type"] in ["argmax", "argmin"]: 60 | if random.randint(0, 100) < drop_argmax_chance * 100: 61 | continue 62 | 63 | stats["type"][instance["type"]] += 1 64 | stats["relation"][instance["relation"]] += 1 65 | stats["num_support_sets"][len(instance["facts"])] += 1 66 | 67 | if instance["type"] == "bool": 68 | stats["bool_breakdown"][get_bool_breakdown(instance["answer"])] += 1 69 | 70 | return stats 71 | 72 | 73 | if __name__ == "__main__": 74 | setup_logging() 75 | print() 76 | 77 | file = "resources/v2.1_25_big/train.jsonl" 78 | 79 | stats = get_file_stats(file, 0.8) 80 | 81 | print(stats) 82 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/__pycache__/postprocess_baselines.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/postprocess_baselines.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/__pycache__/scoring_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/scoring_functions.cpython-38.pyc -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/postprocess_spj.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import logging 21 | import random 22 | 23 | import numpy as np 24 | 25 | from neuraldb.evaluation.scoring_functions import ( 26 | average_score, 27 | exact_match, 28 | exact_match_case_insensitive, 29 | breakdown_score, 30 | ) 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | def get_spj_evaluation(data_args, tokenizer, generator): 36 | def postprocess_text(preds, labels): 37 | preds = [ 38 | [ 39 | answer.strip() if len(answer.strip()) else generator.null_answer_special 40 | for answer in pred.replace( 41 | tokenizer.bos_token if tokenizer.bos_token is not None else "", "" 42 | ) 43 | .replace( 44 | tokenizer.eos_token if tokenizer.eos_token is not None else "", "" 45 | ) 46 | .replace( 47 | tokenizer.pad_token if tokenizer.pad_token is not None else "", "" 48 | ) 49 | .strip() 50 | .split(generator.answer_delimiter) 51 | ] 52 | for pred in preds 53 | ] 54 | labels = [ 55 | [ 56 | answer.strip() if len(answer.strip()) else generator.null_answer_special 57 | for answer in label.replace( 58 | tokenizer.bos_token if tokenizer.bos_token is not None else "", "" 59 | ) 60 | .replace( 61 | tokenizer.eos_token if tokenizer.eos_token is not None else "", "" 62 | ) 63 | .replace( 64 | tokenizer.pad_token if tokenizer.pad_token is not None else "", "" 65 | ) 66 | .strip() 67 | .split(generator.answer_delimiter) 68 | ] 69 | for label in labels 70 | ] 71 | 72 | return preds, labels 73 | 74 | def compute_metrics(eval_preds): 75 | preds, labels, metadata = eval_preds 76 | 77 | if isinstance(preds, tuple): 78 | preds = preds[0] 79 | 80 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=False) 81 | if data_args.ignore_pad_token_for_loss: 82 | # Replace -100 in the labels as we can't decode them. 83 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 84 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False) 85 | 86 | # Some simple post-processing 87 | decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) 88 | if data_args.predictions_file is not None: 89 | with open(data_args.predictions_file, "w+") as f: 90 | for pred, label, meta in zip(decoded_preds, decoded_labels, metadata): 91 | f.write( 92 | json.dumps( 93 | {"prediction": pred, "actual": label, "metadata": meta} 94 | ) 95 | + "\n" 96 | ) 97 | 98 | sampled_ids = random.sample(list(range(len(decoded_preds))), 10) 99 | for id in sampled_ids: 100 | logger.info( 101 | f"Example prediction \n" 102 | f"Q: {metadata[id]['question']}\n" 103 | f"P: {decoded_preds[id]}\n" 104 | f"A: {decoded_labels[id]}\n" 105 | f"\n" 106 | ) 107 | 108 | em = average_score(decoded_labels, decoded_preds, exact_match) 109 | em_lower = average_score( 110 | decoded_labels, decoded_preds, exact_match_case_insensitive 111 | ) 112 | 113 | result = { 114 | "em": em, 115 | "emi": em_lower, 116 | "em_breakdown_type": breakdown_score( 117 | "type", decoded_labels, decoded_preds, metadata, exact_match 118 | ), 119 | "em_breakdown_relation": breakdown_score( 120 | "relation", decoded_labels, decoded_preds, metadata, exact_match 121 | ), 122 | } 123 | 124 | prediction_lens = [ 125 | np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds 126 | ] 127 | result["gen_len"] = np.mean(prediction_lens) 128 | result = { 129 | k: {k2: round(v2, 4) for k2, v2 in v.items()} 130 | if isinstance(v, dict) 131 | else round(v, 4) 132 | for k, v in result.items() 133 | } 134 | return result 135 | 136 | return compute_metrics 137 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/evaluation/scoring_functions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | from collections import defaultdict 20 | 21 | 22 | def precision(actual, predicted): 23 | return ( 24 | sum(1.0 for p in predicted if p in actual) / float(len(predicted)) 25 | if len(predicted) 26 | else 1.0 27 | ) 28 | 29 | 30 | def recall(actual, predicted): 31 | return ( 32 | sum(1.0 for p in predicted if p in actual) / float(len(actual)) 33 | if len(actual) 34 | else 1.0 35 | ) 36 | 37 | 38 | def f1(actual, predicted): 39 | actual = set(actual) 40 | predicted = set(predicted) 41 | 42 | pr = precision(actual, predicted) 43 | rec = recall(actual, predicted) 44 | 45 | return compute_f1(pr, rec) 46 | 47 | 48 | def join_decoded(decoded_labels): 49 | return " ".join(decoded_labels) 50 | 51 | 52 | def exact_match(actual, predicted): 53 | return 1.0 if join_decoded(actual) == join_decoded(predicted) else 0.0 54 | 55 | 56 | def exact_match_case_insensitive(actual, predicted): 57 | return ( 58 | 1.0 if join_decoded(actual).lower() == join_decoded(predicted).lower() else 0.0 59 | ) 60 | 61 | 62 | def compute_f1(pr, rec): 63 | return 2.0 * pr * rec / (pr + rec) if (pr + rec > 0.0) else 0.0 64 | 65 | 66 | def average_score(all_actual, all_predicted, scoring_function): 67 | running_score = 0 68 | num_instances = 0 69 | 70 | for actual, predicted in zip(all_actual, all_predicted): 71 | num_instances += 1 72 | local_score = scoring_function(actual, predicted) 73 | assert local_score <= 1 74 | 75 | running_score += local_score 76 | assert running_score <= num_instances 77 | 78 | return running_score / num_instances if num_instances > 0 else 0.0 79 | 80 | 81 | def breakdown_score(key, all_actual, all_predicted, metadata, scoring_function): 82 | running_score = defaultdict(int) 83 | num_instances = defaultdict(int) 84 | 85 | for actual, predicted, metadatum in zip(all_actual, all_predicted, metadata): 86 | num_instances[metadatum[key]] += 1 87 | local_score = scoring_function(actual, predicted) 88 | assert local_score <= 1 89 | 90 | running_score[metadatum[key]] += local_score 91 | assert running_score[metadatum[key]] <= num_instances[metadatum[key]] 92 | 93 | return { 94 | key: running_score[key] / num_instances[key] if num_instances[key] > 0 else 0.0 95 | for key in num_instances.keys() 96 | } 97 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/final_scoring.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import glob 20 | import json 21 | from collections import OrderedDict, defaultdict 22 | 23 | import numpy as np 24 | import pandas as pd 25 | 26 | from neuraldb.evaluation.scoring_functions import f1 27 | 28 | 29 | def load_experiment(path): 30 | 31 | running_score = defaultdict(lambda: defaultdict(int)) 32 | running_count = defaultdict(lambda: defaultdict(int)) 33 | 34 | print(path) 35 | with open(path) as f: 36 | for line in f: 37 | instance = json.loads(line) 38 | actual = instance["actual"] 39 | prediction = instance["prediction"] 40 | 41 | local_score = f1(set(actual), set(prediction)) 42 | 43 | # relation = instance["metadata"]["relation"] 44 | # running_score["relation"][relation] += local_score 45 | # running_count["relation"][relation] += 1 46 | 47 | qtype = instance["metadata"]["type"] 48 | if qtype in {"argmin", "argmax", "min", "max"}: 49 | qtype = "minmax" 50 | running_score["type"][qtype] += local_score 51 | running_count["type"][qtype] += 1 52 | 53 | running_score["all"][""] += local_score 54 | running_count["all"][""] += 1 55 | 56 | scores = {} 57 | for k, v in running_score.items(): 58 | for attr, val in v.items(): 59 | score = ( 60 | running_score[k][attr] / running_count[k][attr] 61 | if running_count[k][attr] 62 | else 0 63 | ) 64 | print(f"Running score: {k}\t{attr}\t\t{score}") 65 | scores["_".join([k, attr])] = ( 66 | running_score[k][attr] / running_count[k][attr] 67 | if running_count[k][attr] 68 | else 0 69 | ) 70 | 71 | return scores 72 | 73 | 74 | if __name__ == "__main__": 75 | ndb_predictions = glob.glob( 76 | "consolidated/work/v2.4_25/**/predictions.jsonl", recursive=True 77 | ) 78 | all_experiments = [] 79 | for prediction in ndb_predictions: 80 | print(prediction) 81 | 82 | experiment = OrderedDict() 83 | 84 | for element in prediction.split("/"): 85 | if "," in element: 86 | for kvp in element.split(","): 87 | k, v = kvp.split("=", maxsplit=1) 88 | experiment[k] = v 89 | elif "-" in element: 90 | for kvp in element.split(","): 91 | k, v = kvp.split("-", maxsplit=1) 92 | experiment[k] = v 93 | 94 | # experiment["ssg"] = prediction.replace(".jsonl", "").rsplit("_", maxsplit=1)[1] 95 | experiment["dataset"] = prediction.split("/")[2] 96 | if "retriever" not in experiment: 97 | experiment["retriever"] = "" 98 | experiment["path"] = prediction 99 | all_experiments.append(experiment) 100 | 101 | print("Reading by experiment: \n\n\n") 102 | for expt in all_experiments: 103 | expt.update(load_experiment(expt["path"])) 104 | del expt["path"] 105 | 106 | frame = pd.DataFrame(all_experiments) 107 | frame[frame.select_dtypes(include=["number"]).columns] *= 100 108 | pd.set_option("display.width", 1000) 109 | pd.set_option("display.max_columns", None) 110 | 111 | aggr = {"all_": [np.mean, np.std]} 112 | aggr.update({k: [np.mean] for k in frame.columns if "type" in k}) 113 | pt = pd.pivot_table( 114 | frame, index=["model", "generator", "retriever", "lr", "steps"], aggfunc=aggr 115 | ) 116 | print(pt) 117 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/modelling/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/retriever/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/retriever/dpr.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import logging 21 | 22 | import torch 23 | from argparse import ArgumentParser 24 | from tqdm import tqdm 25 | from transformers import ( 26 | DPRContextEncoderTokenizer, 27 | DPRContextEncoder, 28 | ) 29 | from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer 30 | 31 | from neuraldb.util.log_helper import setup_logging 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | 36 | class DPRRetriever: 37 | def __init__(self): 38 | self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( 39 | "facebook/dpr-question_encoder-single-nq-base" 40 | ) 41 | self.question_model = DPRQuestionEncoder.from_pretrained( 42 | "facebook/dpr-question_encoder-single-nq-base" 43 | ).to("cuda") 44 | self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained( 45 | "facebook/dpr-ctx_encoder-single-nq-base" 46 | ) 47 | self.context_model = DPRContextEncoder.from_pretrained( 48 | "facebook/dpr-ctx_encoder-single-nq-base" 49 | ).to("cuda") 50 | 51 | def lookup(self, queries, facts): 52 | encoded_questions = self.question_tokenizer(queries, padding=True) 53 | device_inputs = { 54 | k: torch.LongTensor(v).to("cuda") for k, v in encoded_questions.items() 55 | } 56 | question_outputs = self.question_model(**device_inputs) 57 | 58 | encoded_context = self.context_tokenizer(facts, padding=True) 59 | device_inputs = { 60 | k: torch.LongTensor(v).to("cuda") for k, v in encoded_context.items() 61 | } 62 | context_outputs = self.context_model(**device_inputs) 63 | 64 | yield from torch.matmul( 65 | question_outputs.pooler_output, context_outputs.pooler_output.T 66 | ).cpu().detach().numpy().argsort(axis=1).tolist() 67 | 68 | 69 | if __name__ == "__main__": 70 | 71 | parser = ArgumentParser() 72 | parser.add_argument("in_file") 73 | parser.add_argument("out_file") 74 | args = parser.parse_args() 75 | 76 | setup_logging() 77 | dpr = DPRRetriever() 78 | dpr.context_model.eval() 79 | dpr.question_model.eval() 80 | with open(args.in_file) as f, open(args.out_file, "w+") as of: 81 | for line in tqdm(f): 82 | database = json.loads(line) 83 | 84 | facts = database["facts"] 85 | queries = [q["query"] for q in database["queries"]] 86 | 87 | for query, ids in zip(database["queries"], dpr.lookup(queries, facts)): 88 | filtered_ids = list( 89 | filter(lambda idx: idx <= query["height"], ids[::-1]) 90 | ) 91 | query["predicted_facts"] = [filtered_ids] 92 | 93 | of.write(json.dumps(database) + "\n") 94 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/retriever/tfidf.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | import logging 21 | from argparse import ArgumentParser 22 | 23 | import math 24 | from drqascripts.retriever.build_tfidf_lines import OnlineTfidfDocRanker 25 | 26 | from neuraldb.util.log_helper import setup_logging 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | class TFIDFRetriever: 32 | class RankArgs: 33 | def __init__(self): 34 | self.ngram = 2 35 | self.hash_size = int(math.pow(2, 24)) 36 | self.tokenizer = "simple" 37 | self.num_workers = None 38 | self.max_sent = 50 39 | 40 | def __init__(self): 41 | self.args = self.RankArgs() 42 | 43 | def lookup(self, queries, facts): 44 | tfidf = OnlineTfidfDocRanker(self.args, facts) 45 | 46 | for query in queries: 47 | ids, scores = tfidf.closest_docs(query, self.args.max_sent) 48 | yield ids 49 | 50 | 51 | if __name__ == "__main__": 52 | 53 | parser = ArgumentParser() 54 | parser.add_argument("in_file") 55 | parser.add_argument("out_file") 56 | args = parser.parse_args() 57 | 58 | setup_logging() 59 | tfidf = TFIDFRetriever() 60 | with open(args.in_file) as f, open(args.out_file, "w+") as of: 61 | for line in f: 62 | database = json.loads(line) 63 | 64 | facts = database["facts"] 65 | queries = [q["query"] for q in database["queries"]] 66 | 67 | for query, ids in zip(database["queries"], tfidf.lookup(queries, facts)): 68 | query["predicted_facts"] = [ 69 | list(filter(lambda idx: idx <= query["height"], ids)) 70 | ] 71 | 72 | of.write(json.dumps(database) + "\n") 73 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/util/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | -------------------------------------------------------------------------------- /modelling/src/neuraldb/util/log_helper.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | import logging 21 | import os 22 | 23 | def setup_logging(): 24 | h = logging.StreamHandler(None) 25 | logging.root.addHandler(h) 26 | logging.root.setLevel(os.environ.get("LOGLEVEL", "INFO")) 27 | 28 | 29 | if __name__ == "__main__": 30 | setup_logging() 31 | logger = logging.getLogger(__name__) 32 | 33 | logger.debug("This is a debug message") 34 | logger.info("This is an info message") 35 | logger.warning("This is a warning message") 36 | logger.error("This is an error message") 37 | logger.critical("This is a critical message") 38 | -------------------------------------------------------------------------------- /modelling/tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import unittest 20 | from neuraldb.evaluation.scoring_functions import ( 21 | precision, 22 | recall, 23 | compute_f1, 24 | average_score, 25 | ) 26 | 27 | 28 | class PrecisionTestCase(unittest.TestCase): 29 | def setUp(self) -> None: 30 | self.gold_reference = ["a", "b", "c", "d", "e"] 31 | 32 | self.precision_fail1 = ["a", "b", "c", "d", "e", "f"] 33 | self.precision_fail2 = ["f"] 34 | self.precision_fail3 = ["a", "f"] 35 | 36 | def testPrecisionExact(self): 37 | self.assertEqual(precision(self.gold_reference, self.gold_reference), 1) 38 | 39 | def testPrecisionFailOneTooMany(self): 40 | self.assertEqual(precision(self.gold_reference, self.precision_fail1), 5 / 6) 41 | 42 | def testPrecisionFailOnlyWrong(self): 43 | self.assertEqual(precision(self.gold_reference, self.precision_fail2), 0) 44 | 45 | def testPrecisionFailHalfWrong(self): 46 | self.assertEqual(precision(self.gold_reference, self.precision_fail3), 0.5) 47 | 48 | def testPrecisionNoPredictions(self): 49 | self.assertEqual(precision(self.gold_reference, []), 1) 50 | 51 | def testPrecisionNoSourceNoPredictions(self): 52 | self.assertEqual(precision([], []), 1) 53 | 54 | def testPrecisionNoSourceBadPredictions(self): 55 | self.assertEqual(precision([], self.precision_fail2), 0) 56 | 57 | 58 | class RecallTestCase(unittest.TestCase): 59 | def setUp(self) -> None: 60 | self.gold_reference = ["a", "b", "c", "d", "e"] 61 | 62 | self.recall_ok1 = ["a", "b", "c", "d", "e", "f"] 63 | self.recall_fail1 = ["f"] 64 | self.recall_fail2 = ["a", "f"] 65 | self.recall_fail3 = ["a"] 66 | 67 | def testRecallExact(self): 68 | self.assertEqual(recall(self.gold_reference, self.gold_reference), 1) 69 | 70 | def testRecallTooMany(self): 71 | self.assertEqual(recall(self.gold_reference, self.recall_ok1), 1) 72 | 73 | def testRecallNoPredictions(self): 74 | self.assertEqual(recall(self.gold_reference, []), 0) 75 | 76 | def testRecallNoSourceNoPredictions(self): 77 | self.assertEqual(recall([], []), 1) 78 | 79 | def testRecallOnlyOne(self): 80 | self.assertEqual(recall(self.gold_reference, self.recall_fail3), 1 / 5) 81 | 82 | def testRecallOnlyWithFalsePositive(self): 83 | self.assertEqual(recall(self.gold_reference, self.recall_fail2), 1 / 5) 84 | 85 | def testRecallOnlyFalsePositive(self): 86 | self.assertEqual(recall(self.gold_reference, self.recall_fail1), 0) 87 | 88 | 89 | class F1Test(unittest.TestCase): 90 | def testBothOne(self): 91 | self.assertEqual(compute_f1(1, 1), 1.0) 92 | 93 | def testBothZero(self): 94 | self.assertEqual(compute_f1(0, 0), 0.0) 95 | 96 | def testOneZero(self): 97 | self.assertEqual(compute_f1(1, 0), 0.0) 98 | 99 | def testBothHalf(self): 100 | self.assertEqual(compute_f1(0.5, 0.5), 0.5) 101 | 102 | def testHalfAndOne(self): 103 | self.assertEqual(compute_f1(0.5, 1), 2 / 3) 104 | 105 | 106 | class AverageScoreTest(unittest.TestCase): 107 | @staticmethod 108 | def passThroughA(a, b): 109 | return a 110 | 111 | def setUp(self): 112 | self.scores0 = [0, 0, 0, 0] 113 | self.scores1Quarter = [0, 0, 0, 1] 114 | self.scores1 = [1, 1, 1, 1] 115 | 116 | def testScoresZero(self): 117 | self.assertEqual( 118 | average_score(self.scores0, self.scores0, self.passThroughA), 0 119 | ) 120 | 121 | def testScores1Quarter(self): 122 | self.assertEqual( 123 | average_score(self.scores1Quarter, self.scores1Quarter, self.passThroughA), 124 | 1 / 4, 125 | ) 126 | 127 | def testScores1(self): 128 | self.assertEqual( 129 | average_score(self.scores1, self.scores1, self.passThroughA), 1 130 | ) 131 | 132 | def testScoresNoInstances(self): 133 | self.assertEqual(average_score([], [], self.passThroughA), 0) 134 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/overview.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | black 3 | flake8 4 | licenseheaders -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /ssg/README.md: -------------------------------------------------------------------------------- 1 | To Train the ssg: 2 | ``` 3 | python train_ssg.py -i "data_folder" -b "batch_size" -e "number of epochs" -o "output folder" 4 | ``` 5 | 6 | To run the prediction: 7 | 8 | ``` 9 | python ssg_prediction.py -i "data_folder" -m "model_address" -th list_of_thresholds 10 | ``` 11 | 12 | To evaluate the predictions: 13 | 14 | ``` 15 | python evaluate_set_ssg.py -i "prediction file" 16 | ``` -------------------------------------------------------------------------------- /ssg/requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers==0.4.1.2 2 | transformers==4.5.1 3 | torch -------------------------------------------------------------------------------- /ssg/ssg_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import json 20 | 21 | 22 | def read_NDB(data_file): 23 | with open(data_file) as file: 24 | dataset = [] 25 | 26 | for line in file: 27 | db = json.loads(line) 28 | 29 | facts = db["facts"] 30 | queries = db["queries"] 31 | dataset.append([facts, queries]) 32 | return dataset 33 | 34 | 35 | def create_dataset(db): 36 | dataset = [] 37 | eos = "" 38 | for d in db: 39 | 40 | questions = d[1] 41 | ctx = d[0] 42 | 43 | for q in questions: 44 | 45 | t = q["height"] 46 | gold_facts = q["facts"] 47 | context = ctx[: t + 1] 48 | flat_facts = [item for sublist in gold_facts for item in sublist] 49 | 50 | # all facts in flat facts can be positive 51 | state = [q["query"]] 52 | pos_act = [context[g] for g in flat_facts] 53 | # everything else is negative 54 | neg_act = [x for i, x in enumerate(context) if i not in flat_facts] 55 | 56 | dataset.append([state, eos, 0]) 57 | dataset.extend([[state, n, 0] for n in neg_act]) 58 | pos_set = [[state, p, 1] for p in pos_act] 59 | 60 | dataset.extend(pos_set) 61 | 62 | for g in gold_facts: 63 | if len(g) <= 1: 64 | state = [q["query"], context[g[0]]] 65 | 66 | pos_act = eos 67 | neg_act = context 68 | item = [state, pos_act, 1] 69 | dataset.append(item) 70 | dataset.extend([[state, n, 0] for n in neg_act]) 71 | else: 72 | g_0 = g[0] 73 | g_1 = g[1] 74 | 75 | state = [q["query"], context[g_0]] 76 | pos_act = context[g_1] 77 | neg_act = [x for i, x in enumerate(context) if i != g_1] 78 | item = [state, pos_act, 1] 79 | dataset.append(item) 80 | dataset.extend([[state, n, 0] for n in neg_act]) 81 | 82 | state = [q["query"], context[g_1]] 83 | pos_act = context[g_0] 84 | neg_act = [x for i, x in enumerate(context) if i != g_0] 85 | item = [state, pos_act, 1] 86 | dataset.append(item) 87 | dataset.extend([[state, n, 0] for n in neg_act]) 88 | 89 | state = [q["query"], context[g_0], context[g_1]] 90 | pos_act = eos 91 | neg_act = context 92 | item = [state, pos_act, 1] 93 | dataset.append(item) 94 | dataset.extend([[state, n, 0] for n in neg_act]) 95 | 96 | return dataset 97 | 98 | 99 | def prepare_tokenizer(tokenizer): 100 | special_tokens = [] 101 | special_tokens.extend(["", "", "", "[SEP]"]) 102 | tokenizer.add_special_tokens({"additional_special_tokens": special_tokens}) 103 | -------------------------------------------------------------------------------- /ssg/train_ssg.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates. 3 | # 4 | # This file is part of NeuralDB. 5 | # See https://github.com/facebookresearch/NeuralDB for further info. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | import argparse 20 | import os 21 | 22 | from sentence_transformers import SentencesDataset, InputExample, SentenceTransformer 23 | from sentence_transformers.evaluation import BinaryClassificationEvaluator 24 | from sentence_transformers.losses import ContrastiveLoss 25 | from torch.utils.data import DataLoader 26 | from torch.utils.data.sampler import WeightedRandomSampler 27 | 28 | from ssg_utils import read_NDB, create_dataset 29 | 30 | 31 | def is_valid_folder(parser, arg): 32 | if not os.path.exists(arg): 33 | parser.error("The file %s does not exist!" % arg) 34 | else: 35 | return arg 36 | 37 | 38 | if __name__ == "__main__": 39 | 40 | parser = argparse.ArgumentParser(description="training ssg") 41 | parser.add_argument( 42 | "-i", 43 | dest="folder", 44 | required=True, 45 | help="input data folder", 46 | type=lambda x: is_valid_folder(parser, x), 47 | ) 48 | parser.add_argument("-b", dest="batch_size", type=int, help="batch size", default=100) 49 | 50 | parser.add_argument("-e", dest="epochs", type=int, help="number of epochs", default=10) 51 | 52 | parser.add_argument("-o", dest="output", required=True, help="output address") 53 | parser.add_argument("-d", dest="device", default="cuda:0", help="output address") 54 | 55 | args = parser.parse_args() 56 | 57 | folder = args.folder 58 | batch_size = args.batch_size 59 | epochs = args.epochs 60 | output = args.output 61 | device = args.device 62 | 63 | # Define the model. Either from scratch of by loading a pre-trained model 64 | model = SentenceTransformer("distilbert-base-nli-mean-tokens", device=device) 65 | 66 | # read the train data 67 | name = "train" 68 | data_file = folder + "/" + name + ".jsonl" 69 | db = read_NDB(data_file) 70 | dataset = create_dataset(db) 71 | 72 | train_examples = [] 73 | weights = [] 74 | for d in dataset: 75 | texts = ["[SEP]".join(d[0]), "".join(d[1])] 76 | label = d[2] 77 | if label == 1: 78 | weights.append(10) 79 | else: 80 | weights.append(1) 81 | train_examples.append(InputExample(texts=texts, label=label)) 82 | 83 | # read the dev data 84 | name = "dev" 85 | data_file = folder + "/" + name + ".jsonl" 86 | db = read_NDB(data_file) 87 | dataset = create_dataset(db) 88 | 89 | dev_examples = [] 90 | for d in dataset: 91 | texts = ["[SEP]".join(d[0]), "".join(d[1])] 92 | label = d[2] 93 | dev_examples.append(InputExample(texts=texts, label=label)) 94 | 95 | train_loss = ContrastiveLoss(model) 96 | 97 | # Define your train dataset, the dataloader and the train loss 98 | train_dataset = SentencesDataset(train_examples, model) 99 | sampler = WeightedRandomSampler(weights=weights, num_samples=len(train_examples)) 100 | train_dataloader = DataLoader( 101 | train_dataset, sampler=sampler, shuffle=False, batch_size=batch_size 102 | ) 103 | 104 | evaluator = BinaryClassificationEvaluator.from_input_examples( 105 | dev_examples, batch_size=batch_size 106 | ) 107 | 108 | model.fit( 109 | train_objectives=[(train_dataloader, train_loss)], 110 | epochs=epochs, 111 | warmup_steps=100, 112 | evaluator=evaluator, 113 | output_path=output, 114 | evaluation_steps=100, 115 | ) 116 | --------------------------------------------------------------------------------