├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── dataset-construction
    ├── .gitignore
    ├── README.md
    ├── configs
    │   ├── expand_objects.json
    │   ├── expand_subject.json
    │   ├── filter_objects.json
    │   ├── filter_subjects.json
    │   ├── for_v1.5
    │   │   ├── NDB Relation Templates - P106.csv
    │   │   ├── NDB Relation Templates - P108.csv
    │   │   ├── NDB Relation Templates - P1082.csv
    │   │   ├── NDB Relation Templates - P1092.csv
    │   │   ├── NDB Relation Templates - P1110.csv
    │   │   ├── NDB Relation Templates - P1174.csv
    │   │   ├── NDB Relation Templates - P118.csv
    │   │   ├── NDB Relation Templates - P1198.csv
    │   │   ├── NDB Relation Templates - P1867.csv
    │   │   ├── NDB Relation Templates - P19.csv
    │   │   ├── NDB Relation Templates - P20.csv
    │   │   ├── NDB Relation Templates - P21.csv
    │   │   ├── NDB Relation Templates - P22.csv
    │   │   ├── NDB Relation Templates - P23.csv
    │   │   ├── NDB Relation Templates - P26.csv
    │   │   ├── NDB Relation Templates - P27.csv
    │   │   ├── NDB Relation Templates - P35.csv
    │   │   ├── NDB Relation Templates - P38.csv
    │   │   ├── NDB Relation Templates - P47.csv
    │   │   ├── NDB Relation Templates - P50.csv
    │   │   ├── NDB Relation Templates - P54.csv
    │   │   ├── NDB Relation Templates - P57.csv
    │   │   ├── NDB Relation Templates - P58.csv
    │   │   ├── NDB Relation Templates - P6.csv
    │   │   ├── NDB Relation Templates - P61.csv
    │   │   └── NDB Relation Templates - P69.csv
    │   └── generate_v1.5.json
    ├── requirements.txt
    ├── scripts
    │   ├── initial_sample.sh
    │   ├── make_databases.sh
    │   ├── make_questions.sh
    │   └── make_v2.4.sh
    └── src
    │   └── ndb_data
    │       ├── __init__.py
    │       ├── construction
    │           ├── __init__.py
    │           ├── make_database_finalize.py
    │           ├── make_database_initial.py
    │           ├── make_database_initial_cache.py
    │           └── make_questions.py
    │       ├── data_import
    │           ├── __init__.py
    │           ├── fix_sitelinks.py
    │           ├── kelm_data.py
    │           └── wikidata_index.py
    │       ├── dataset_statistics.py
    │       ├── generation
    │           ├── __init__.py
    │           ├── describe_db_facts.py
    │           ├── describe_dbs.py
    │           ├── filter_db_facts.py
    │           ├── finalize_hypothesis.py
    │           ├── map_kelm.py
    │           ├── plot_db_sizes.py
    │           ├── question_to_db.py
    │           └── template_first_db.py
    │       ├── sample_questions.py
    │       ├── sample_questions_100.py
    │       ├── sample_questions_1000.py
    │       ├── sample_questions_250.py
    │       ├── sample_questions_50.py
    │       ├── sample_questions_500.py
    │       ├── util
    │           ├── __init__.py
    │           └── build_json.py
    │       └── wikidata_common
    │           ├── __init__.py
    │           ├── common_mongo.py
    │           ├── kelm.py
    │           ├── wikidata.py
    │           └── wikpedia.py
├── modelling
    ├── .gitignore
    ├── README.md
    ├── requirements.txt
    ├── scripts
    │   ├── baselines
    │   │   ├── retrieve.sh
    │   │   ├── train_longformer.sh
    │   │   ├── train_t5.sh
    │   │   └── train_t5_retriever.sh
    │   ├── convert_ssg_predictions.sh
    │   ├── experiments_baselines.sh
    │   ├── experiments_ours.sh
    │   └── ours
    │   │   ├── predict_spj.sh
    │   │   ├── predict_spj_rand_sweep.sh
    │   │   └── train_spj.sh
    ├── setup.py
    ├── src
    │   ├── __init__.py
    │   └── neuraldb
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │       ├── __init__.cpython-38.pyc
    │   │       └── run.cpython-38.pyc
    │   │   ├── convert_legacy_predictions.py
    │   │   ├── convert_spj_to_predictions.py
    │   │   ├── convert_ssg_predictions.py
    │   │   ├── dataset
    │   │       ├── __init__.py
    │   │       ├── __pycache__
    │   │       │   ├── __init__.cpython-38.pyc
    │   │       │   ├── data_collator_seq2seq.cpython-38.pyc
    │   │       │   ├── neuraldb_file_reader.cpython-38.pyc
    │   │       │   ├── neuraldb_parser.cpython-38.pyc
    │   │       │   └── seq2seq_dataset.cpython-38.pyc
    │   │       ├── data_collator_seq2seq.py
    │   │       ├── instance_generator
    │   │       │   ├── __init__.py
    │   │       │   ├── __pycache__
    │   │       │   │   ├── __init__.cpython-38.pyc
    │   │       │   │   ├── instance_generator.cpython-38.pyc
    │   │       │   │   ├── perfectir_generator.cpython-38.pyc
    │   │       │   │   ├── spj_generator.cpython-38.pyc
    │   │       │   │   └── wholedb_generator.cpython-38.pyc
    │   │       │   ├── externalir_generator.py
    │   │       │   ├── externalir_generator_maxtok.py
    │   │       │   ├── instance_generator.py
    │   │       │   ├── perfectir_generator.py
    │   │       │   ├── spj_generator.py
    │   │       │   ├── subsampler.py
    │   │       │   └── wholedb_generator.py
    │   │       ├── neuraldb_file_reader.py
    │   │       ├── neuraldb_parser.py
    │   │       └── seq2seq_dataset.py
    │   │   ├── dataset_statistics.py
    │   │   ├── evaluation
    │   │       ├── __init__.py
    │   │       ├── __pycache__
    │   │       │   ├── __init__.cpython-38.pyc
    │   │       │   ├── postprocess_baselines.cpython-38.pyc
    │   │       │   └── scoring_functions.cpython-38.pyc
    │   │       ├── postprocess_baselines.py
    │   │       ├── postprocess_spj.py
    │   │       └── scoring_functions.py
    │   │   ├── final_scoring.py
    │   │   ├── final_scoring_with_dbsize.py
    │   │   ├── final_scoring_with_dbsize_sweep.py
    │   │   ├── modelling
    │   │       ├── __init__.py
    │   │       └── neuraldb_trainer.py
    │   │   ├── retriever
    │   │       ├── __init__.py
    │   │       ├── dpr.py
    │   │       └── tfidf.py
    │   │   ├── run.py
    │   │   └── util
    │   │       ├── __init__.py
    │   │       └── log_helper.py
    └── tests
    │   └── test_evaluation.py
├── overview.png
├── requirements-dev.txt
├── setup.cfg
└── ssg
    ├── README.md
    ├── evaluate_set_ssg.py
    ├── requirements.txt
    ├── ssg_prediction.py
    ├── ssg_utils.py
    └── train_ssg.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@fb.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 
72 | ## Attribution
73 | 
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 | 
77 | [homepage]: https://www.contributor-covenant.org
78 | 
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to NeuralDB
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Pull Requests
 6 | We actively welcome your pull requests.
 7 | 
 8 | 1. Fork the repo and create your branch from `master`.
 9 | 2. If you've added code that should be tested, add tests.
10 | 3. If you've changed APIs, update the documentation.
11 | 4. Ensure the test suite passes.
12 | 5. Make sure your code lints.
13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14 | 
15 | ## Contributor License Agreement ("CLA")
16 | In order to accept your pull request, we need you to submit a CLA. You only need
17 | to do this once to work on any of Facebook's open source projects.
18 | 
19 | Complete your CLA here: <https://code.facebook.com/cla>
20 | 
21 | ## Issues
22 | We use GitHub issues to track public bugs. Please ensure your description is
23 | clear and has sufficient instructions to be able to reproduce the issue.
24 | 
25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26 | disclosure of security bugs. In those cases, please go through the process
27 | outlined on that page and do not file a public issue.
28 | 
29 | ## License
30 | By contributing to NeuralDB, you agree that your contributions will be licensed
31 | under the LICENSE file in the root directory of this source tree.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Database Reasoning over Text
 2 | 
 3 | This repository contains the code for the [Database Reasoning Over Text](https://arxiv.org/pdf/2106.01074.pdf) paper, 
 4 | to appear at ACL2021. Work is performed in collaboration with James Thorne, Majid Yazdani, Marzieh Saeidi, Fabrizio Silvestri, Sebastian Riedel, and Alon Halevy.
 5 | 
 6 | 
 7 | ![Overview Image](overview.png)
 8 | 
 9 | 
10 | ## Data
11 | The completed NeuralDB datasets can be downloaded [here](https://dl.fbaipublicfiles.com/neuraldb/fb-data-WikiNLDB.zip) and are released under a [CC BY-SA 3.0 license](https://creativecommons.org/licenses/by-sa/2.0/). 
12 | 
13 | The dataset includes entity names from Wikidata which are released under a [CC BY-SA 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/).
14 | This dataset includes sentences from the KELM corpus. KELM is released under the [CC BY-SA 2.0 license](https://creativecommons.org/licenses/by-sa/2.0/)
15 | 
16 | 
17 | ## Repository Structure
18 | The repository is structured in 3 sub-folders:
19 | 
20 | * Tools for mapping the KELM data to Wikidata identifiers are provided in the [dataset construction](dataset-construction/) folder ,
21 | * The information retrieval system for the support set generator are provided in the [ssg](ssg/) folder
22 | * The models for Neural SPJ, the baseline retrieval (TF-IDF and DPR), and evaluation scripts are provided in the [modelling folder](modelling/).
23 | 
24 | Instructions for running each component are provided in the README files in the respective sub-folders.
25 | 
26 | ## Setup
27 | 
28 | All sub-folders were set up with one Python environment per folder. Requirements for each environment can be installed by
29 | running a pip install:
30 | 
31 | ```
32 | pip install -r requirements.txt
33 | ```
34 | 
35 | In the `dataset-construction` and `modelling` folders, the `src` folder should be included in the python path.
36 | 
37 | ```
38 | export PYTHONPATH=src
39 | ```
40 | 
41 | ## License
42 | 
43 | The code in this repository is released under the [Apache 2.0 license](LICENSE)
44 | 


--------------------------------------------------------------------------------
/dataset-construction/.gitignore:
--------------------------------------------------------------------------------
1 | resources/
2 | work/
3 | 


--------------------------------------------------------------------------------
/dataset-construction/configs/expand_objects.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "P54": {
 3 |       "P17": [
 4 |         "What is the country of the player who"
 5 |       ],
 6 |       "P118": [
 7 |         "What is the league of the player who"
 8 |       ]
 9 |     },
10 |   "P50": {
11 |     "P27": [
12 |       "What is the country of the person that"
13 |     ],
14 |     "P21": [
15 |       "What is the gender of the person that"
16 |     ],
17 |     "P108": [
18 |       "What is the employer of the person that"
19 |     ],
20 |     "P106": [
21 |       "What is the occupation of the person that"
22 |     ],
23 |     "P39": [
24 |       "What is the position of the person that"
25 |     ],
26 |     "P166": [
27 |       "What prize did the person that $X win?"
28 |     ],
29 |     "P463": [
30 |       "What is the affiliation of the person that"
31 |     ],
32 |     "P937": [
33 |       "What is the field that $X works in?"
34 |     ]
35 |   },
36 |   "P61": {
37 |     "P27": [
38 |       "What is the country of the person that"
39 |     ],
40 |     "P21": [
41 |       "What is the gender of the person that"
42 |     ],
43 |     "P108": [
44 |       "What is the employer of the person that"
45 |     ],
46 |     "P106": [
47 |       "What is the occupation of the person that"
48 |     ],
49 |     "P39": [
50 |       "What is the position of the person that"
51 |     ],
52 |     "P166": [
53 |       "What prize did the person that $X win?"
54 |     ],
55 |     "P463": [
56 |       "What is the affiliation of the person that"
57 |     ],
58 |     "P937": [
59 |       "What is the field that $X works in?"
60 |     ]
61 |   },
62 |   "P69": {
63 |     "P27": [
64 |       "What country is the institution of the person who has"
65 |     ],
66 |     "P463": [
67 |       "What affiliation is the institution of the person that has"
68 |     ],
69 |     "P937": [
70 |       "What is the field of the institution of the person that has"
71 |     ]
72 |   }
73 | }


--------------------------------------------------------------------------------
/dataset-construction/configs/expand_subject.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "P54": {
 3 |         "P27": ["What is the citizenship of the player that"],
 4 |         "P2067": ["What is the weight of the player that"],
 5 |         "P2048": ["What is the height of the player that"],
 6 |         "P569": ["What is the date of birth of the player that"]
 7 |     },
 8 |     "P50": {
 9 |         "P136": ["What is the genre of the work of the"],
10 |         "P495": ["Where did the work that $X originate from?"],
11 |         "P577": ["When was the work that $X published"]
12 |     },
13 |     "P58": {
14 |         "P136": ["What is the genre of the work that"],
15 |         "P495": ["Where did the work that $X originate from?"],
16 |         "P577": ["When was the work that $X published"]
17 |     },
18 |     "P69": {
19 |         "P27": ["What is the country of the person that went to"],
20 |         "P21": ["What is the gender of the person that went to"],
21 |         "P108": ["What is the employer of the person that went to"],
22 |         "P106": ["What is the occupation of the person that went to"],
23 |         "P39": ["What is the position of the person that went to"],
24 |         "P166": ["What prize did the person that went to $X win?"],
25 |         "P463": ["What is the affiliation of the person that went to"],
26 |         "P937": ["What field does the person that went to $X work in?"]
27 |     }
28 | 
29 | }


--------------------------------------------------------------------------------
/dataset-construction/configs/filter_objects.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "P54": {
 3 |         "P17": ["a club located in $AO", "a team located in $AO"],
 4 |         "P118": ["a team in $AO"]
 5 |     },
 6 |     "P50": {
 7 | 
 8 |         "P27": ["someone from $AO"],
 9 |         "P21": ["someone who is a $AO"],
10 |         "P108": ["someone who is employed by $AO"],
11 |         "P106": ["someone who works as a $AO"],
12 |         "P39": ["someone who is a $AO"],
13 |         "P166": ["someone who won a $AO"],
14 |         "P463": ["someone who is a member of $AO"],
15 |         "P937": ["someone who works in $AO"]
16 |     },
17 |     "P58": {
18 | 
19 |         "P27": ["someone from $AO"],
20 |         "P21": ["someone who is a $AO"],
21 |         "P108": ["someone who is employed by $AO"],
22 |         "P106": ["someone who works as a $AO"],
23 |         "P39": ["someone who is a $AO"],
24 |         "P166": ["someone who won a $AO"],
25 |         "P463": ["someone who is a member of $AO"],
26 |         "P937": ["someone who works in $AO"]
27 |     },
28 |     "P61": {
29 |         "P27": ["someone from $AO"],
30 |         "P21": ["someone who is a $AO"],
31 |         "P108": ["someone who is employed by $AO"],
32 |         "P106": ["someone who works as a $AO"],
33 |         "P39": ["someone who is a $AO"],
34 |         "P166": ["someone who won a $AO"],
35 |         "P463": ["someone who is a member of $AO"],
36 |         "P937": ["someone who works in $AO"]
37 |     },
38 |     "P69": {
39 |         "P17": ["an institution in $AO"]
40 |     }
41 | 
42 | }


--------------------------------------------------------------------------------
/dataset-construction/configs/filter_subjects.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "P54": {
 3 |         "P2067": ["someone who weighs $AO"],
 4 |         "P2048": ["someone $AO tall"],
 5 |         "P21": ["someone who is a $AO"],
 6 |         "P413": ["someone who plays $AO"],
 7 |         "P569": ["someone born on $AO"],
 8 |         "P27": ["someone from $AO"]
 9 |     },
10 |     "P50": {
11 |         "P136": ["a work with the genre $AO"],
12 |         "P495": ["a work originating from $AO"],
13 |         "P577": ["a work published on $AO"]
14 |     },
15 |     "P58": {
16 |         "P136": ["a work with the genre $AO"],
17 |         "P495": ["a work originating from $AO"],
18 |         "P577": ["a work published on $AO"]
19 |     },
20 |     "P69": {
21 |         "P27": ["someone from $AO"],
22 |         "P21": ["someone who is a $AO"],
23 |         "P108": ["someone who is employed by $AO"],
24 |         "P106": ["someone who works as a $AO"],
25 |         "P39": ["someone who is a $AO"],
26 |         "P166": ["someone who won a $AO"],
27 |         "P463": ["someone who is a member of $AO"],
28 |         "P937": ["someone who works in $AO"]
29 |     }
30 | }


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P106.csv:
--------------------------------------------------------------------------------
1 | 106,https://www.wikidata.org/wiki/Property:P106,$s person has job $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s works as a $o,Yuri Gagarin works as a astronaut,Is $s an $o?,Is Yuri Gagarin an astronaut?,TRUE,TRUE,What jobs does $s have?,What jobs does Yuri Gagarin have?,$o,astronaut,How many jobs has $s had?,How many jobs has Yuri Gagarin had?,$o,astronaut,What is the least popular job?,What is the least popular job?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,What is the most popular job?,What is the most popular job?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,,,,,,,,
4 | $s is a $o,Yuri Gagarin is a astronaut,Does $s work as an $o?,Does Yuri Gagarin work as an astronaut?,TRUE,TRUE,Who works as a $o?,Who works as a astronaut?,$s,Yuri Gagarin,How many people are $o?,How many people are astronaut?,$s,Yuri Gagarin,What job has the fewest number of people working there?,What job has the fewest number of people working there?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,What job has had the highest number of people working there?,What job has had the highest number of people working there?,$o [SEP] $s,astronaut [SEP] Yuri Gagarin,,,,,,,,
5 | $s's job is a $o,Yuri Gagarin's job is a astronaut,Is $s's job a $o?,Is Yuri Gagarin's job a astronaut?,TRUE,TRUE,What does $s do?,What does Yuri Gagarin do?,$o,astronaut,,,,,Who has had the fewest jobs?,Who has had the fewest jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,Who has had the most jobs?,Who has had the most jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,,,,,,,,
6 | ,,,,,,What is $s's job?,What is Yuri Gagarin's job?,$o,astronaut,,,,,Who has had the least number of jobs?,Who has had the least number of jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,Who has had the highest number of jobs?,Who has had the highest number of jobs?,$s [SEP] $o,Yuri Gagarin [SEP] astronaut,,,,,,,,
7 | ,,,,,,What are $s's jobs?,What are Yuri Gagarin's jobs?,$o,astronaut,,,,,,,,,,,,,,,,,,,,
8 | ,,,,,,Who is a $o?,Who is a astronaut?,$s,Yuri Gagarin,,,,,,,,,,,,,,,,,,,,
9 | ,,,,,,Who has a career as a $o?,Who has a career as a astronaut?,$s,Yuri Gagarin,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P108.csv:
--------------------------------------------------------------------------------
1 | 108,https://www.wikidata.org/wiki/Property:P106,$s person has job at $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s is employed by $o,Neil Armstrong is employed by NASA,Is $s employed by $o?,Is Neil Armstrong employed by NASA?,TRUE,TRUE,Who does $s work for?,Who does Neil Armstrong work for?,$o,NASA,How many places has $s worked?,How many places has Neil Armstrong worked?,$o,NASA,,,,,,,,,,,,,,,,
4 | The employer of $s is $o,The employer of Neil Armstrong is NASA,Does $s work for $o?,Does Neil Armstrong work for NASA?,TRUE,TRUE,Who works for $o?,Who works for NASA?,$s,Neil Armstrong,How many people work for $o?,How many people work for NASA?,$s,Neil Armstrong,,,,,,,,,,,,,,,,
5 | $s works for $o,Neil Armstrong works for NASA,Is $o the employer of $s?,Is NASA the employer of Neil Armstrong?,TRUE,TRUE,Who is employed by $o?,Who is employed by NASA?,$s,Neil Armstrong,,,,,,,,,,,,,,,,,,,,
6 | ,,,,,,What company is $s employed by?,What company is Neil Armstrong employed by?,$o,NASA,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1082.csv:
--------------------------------------------------------------------------------
1 | 1082,https://www.wikidata.org/wiki/Property:P1082,"Population of $s <city, territory or country> is $o",,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | There are $o inhabitants of $s,There are 5079791 inhabitants of San Francisco,Is the population of $s $o?,Is the population of San Francisco 5079791?,TRUE,TRUE,,,,,How many people live in $s?,How many people live in San Francisco?,$o,5079791,Which place has the smallest population?,Which place has the smallest population?,$s [SEP] $o,San Francisco [SEP] 5079791,Which place has the biggest population?,Which place has the biggest population?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the smallest population?,What is the smallest population?,$o,5079791,What is the largest population?,What is the largest population?,$o,5079791
4 | The population of $s is $o,The population of San Francisco is 5079791,Are there $o people living in $s?,Are there 5079791 people living in San Francisco?,TRUE,TRUE,,,,,What is the population of $s?,What is the population of San Francisco?,$o,5079791,What is the least inhabited place?,What is the least inhabited place?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the most inhabited place?,What is the most inhabited place?,$s [SEP] $o,San Francisco [SEP] 5079791,What is the lowest population?,What is the lowest population?,$o,5079791,What is the highest population?,What is the highest population?,$o,5079791
5 | There are $o people living in $s,There are 5079791 people living in San Francisco,Is the number of people living in $s $o?,Is the number of people living in San Francisco 5079791?,TRUE,TRUE,,,,,,,,,What place has the fewest people living there?,What place has the fewest people living there?,$s [SEP] $o,San Francisco [SEP] 5079791,What place has the most people living there?,What place has the most people living there?,$s [SEP] $o,San Francisco [SEP] 5079791,,,,,,,,
6 | ,,Are there $o inhabitants of $s?,Are there 5079791 inhabitants of San Francisco?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1092.csv:
--------------------------------------------------------------------------------
1 | 1092,https://www.wikidata.org/wiki/Property:P1092,$ was made $o times,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | There were $o $s made,There were 125 CC-150 Polaris made,,,,,What items are there?,What items are there?,$s,CC-150 Polaris,How many $s are there?,How many CC-150 Polaris are there?,$o,125,What item has been made the least?,What item has been made the least?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What item has been made the most?,What item has been made the most?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What was the smallest production run?,What was the smallest production run?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What was the largest production run?,What was the largest production run?,$s [SEP] $o,CC-150 Polaris [SEP] 125
4 | $s was made $o times,CC-150 Polaris was made 125 times,,,,,List all items,List all items,$s,CC-150 Polaris,How many $s exist?,How many CC-150 Polaris exist?,$o,125,What is the meast manufactured item?,What is the meast manufactured item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most manufactured item?,What is the most manufactured item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,,
5 | There are $o $s in existence ,There are 125 CC-150 Polaris in existence ,,,,,What items exist?,What items exist?,$s,CC-150 Polaris,How many $s were manufactured?,How many CC-150 Polaris were manufactured?,$o,125,What is the least common item?,What is the least common item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most common item?,What is the most common item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,,
6 | ,,,,,,,,,,How many items were made?,How many items were made?,$o,125,What is the rarest item?,What is the rarest item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,What is the most abundant item?,What is the most abundant item?,$s [SEP] $o,CC-150 Polaris [SEP] 125,,,,,,,,
7 | ,,,,,,,,,,How many things were manufactured?,How many things were manufactured?,$o,125,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1110.csv:
--------------------------------------------------------------------------------
1 | 1110,https://www.wikidata.org/wiki/Property:P1110,$s <event> has attendance of $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | The attendance of $s was $o,"The attendance of 1982 FIFA World Cup was 2,109,723",,,,,What were the attendances of all events?,What were the attendances of all events?,$o,"2,109,723",How many people went to $o?,"How many people went to 2,109,723?",$o,"2,109,723",What was the least popular event?,What was the least popular event?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What was the most popular event?,What was the most popular event?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",How many people went to the smallest event?,How many people went to the smallest event?,$o,"2,109,723",How many people went to the largest event?,How many people went to the largest event?,$o,"2,109,723"
4 | The attendance of $s is $o,"The attendance of 1982 FIFA World Cup is 2,109,723",,,,,What was the attendance of $s?,What was the attendance of 1982 FIFA World Cup?,$o,"2,109,723",How many people went to events?,How many people went to events?,$o,"2,109,723",Which event was least well attended?,Which event was least well attended?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event had the best attendance?,What event had the best attendance?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",How many went to the smallest event?,How many went to the smallest event?,$o,"2,109,723",How many went to the largest event?,How many went to the largest event?,$o,"2,109,723"
5 | $o people went to $s,"2,109,723 people went to 1982 FIFA World Cup",,,,,,,,,,,,,What event had the smallest turnout?,What event had the smallest turnout?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event had the largest turnout?,What event had the largest turnout?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What was the smallest crowd?,What was the smallest crowd?,$o,"2,109,723",What was the largest crowd?,What was the largest crowd?,$o,"2,109,723"
6 | The number of people at $s was $o,"The number of people at 1982 FIFA World Cup was 2,109,723",,,,,,,,,,,,,What event had the lowest attendance?,What event had the lowest attendance?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",What event was most well attended?,What event was most well attended?,$s [SEP] $o,"1982 FIFA World Cup [SEP] 2,109,723",,,,,,,,
7 | $o went to $s,"2,109,723 went to 1982 FIFA World Cup",,,,,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1174.csv:
--------------------------------------------------------------------------------
1 | 1174,https://www.wikidata.org/wiki/Property:P1174,$s has $o visitors per year,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s has $o visitors per year,"Eiffel Tower has 7,000,000 visitors per year",,,,,,,,,How many people visit attractions every year?,How many people visit attractions every year?,$o,"7,000,000",What is the least attended attraction?,What is the least attended attraction?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the most attended attraction?,What is the most attended attraction?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest attendance?,What is the smallest attendance?,$o,"7,000,000",What is the largest attendance?,What is the largest attendance?,$o,"7,000,000"
4 | $o people visit $s every year,"7,000,000 people visit Eiffel Tower every year",,,,,,,,,How many people visit $s every year?,How many people visit Eiffel Tower every year?,$o,"7,000,000",What is the attraction with the lowest visitor count per year?,What is the attraction with the lowest visitor count per year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the attraction with the highest visitor count per year?,What is the attraction with the highest visitor count per year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest yearly attendance?,What is the smallest yearly attendance?,$o,"7,000,000",What is the largest yearly attendance?,What is the largest yearly attendance?,$o,"7,000,000"
5 | $o people visit $s annually ,"7,000,000 people visit Eiffel Tower annually ",,,,,,,,,,,,,What has the lowest number of visitors each year?,What has the lowest number of visitors each year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What has the highest number of visitors each year?,What has the highest number of visitors each year?,$s [SEP] $o,"Eiffel Tower [SEP] 7,000,000",What is the smallest annual attendance?,What is the smallest annual attendance?,$o,"7,000,000",What is the largest annual attendance?,What is the largest annual attendance?,$o,"7,000,000"
6 | The yearly visitor count at $s is $o,"The yearly visitor count at Eiffel Tower is 7,000,000",,,,,,,,,,,,,,,,,,,,,,,,,,,,
7 | $o people go to $s each year,"7,000,000 people go to Eiffel Tower each year",,,,,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P118.csv:
--------------------------------------------------------------------------------
1 | 118,https://www.wikidata.org/wiki/Property:P118,$s <person or team> is in league $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s plays in $o,Bastian Schweinsteiger plays in Bundesliga,Does $s play in $o?,Does Bastian Schweinsteiger play in Bundesliga?,TRUE,TRUE,What league does $s play for?,What league does Bastian Schweinsteiger play for?,$o,Bundesliga,How many people play in $o?,How many people play in Bundesliga?,$s,Bastian Schweinsteiger,What is the smallest league?,What is the smallest league?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,What is the largest league?,What is the largest league?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,,,,,,,,
4 | $s is a player in $o,Bastian Schweinsteiger is a player in Bundesliga,Is the league that $s plays for $o?,Is the league that Bastian Schweinsteiger plays for Bundesliga?,TRUE,TRUE,Who plays in $o?,Who plays in Bundesliga?,$s,Bastian Schweinsteiger,How many leagues has $s played for?,How many leagues has Bastian Schweinsteiger played for?,$o,Bundesliga,Who has played in the fewest leagues?,Who has played in the fewest leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,Who has played for the most leagues?,Who has played for the most leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,,,,,,,,
5 | The league that $s plays in is $o,The league that Bastian Schweinsteiger plays in is Bundesliga,Does $s participate in $o?,Does Bastian Schweinsteiger participate in Bundesliga?,TRUE,TRUE,What league is $s participant of?,What league is Bastian Schweinsteiger participant of?,$o,Bundesliga,How many leagues has $s participated in?,How many leagues has Bastian Schweinsteiger participated in?,$o,Bundesliga,Who has participated in the least number of leagues?,Who has participated in the least number of leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,Who has participated in the most number of leagues?,Who has participated in the most number of leagues?,$s [SEP] $o,Bastian Schweinsteiger [SEP] Bundesliga,,,,,,,,
6 | $s participates in $o,Bastian Schweinsteiger participates in Bundesliga,Is $s a participant of $o?,Is Bastian Schweinsteiger a participant of Bundesliga?,TRUE,TRUE,Who participates in $o?,Who participates in Bundesliga?,$s,Bastian Schweinsteiger,How many participants are there in $o?,How many participants are there in Bundesliga?,$s,Bastian Schweinsteiger,What league has the fewest participants?,What league has the fewest participants?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,What league has the most participants?,What league has the most participants?,$o [SEP] $s,Bundesliga [SEP] Bastian Schweinsteiger,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1198.csv:
--------------------------------------------------------------------------------
1 | 1198,https://www.wikidata.org/wiki/Property:P1198,The unemployment rate of $s is $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | The unemployment rate of $s is $o,The unemployment rate of United States of America is 6.7 percent,Is the unpemployment rate of $s $o?,Is the unpemployment rate of United States of America 6.7 percent?,TRUE,TRUE,What is unemployment rate of $s?,What is unemployment rate of United States of America?,$o,6.7 percent,,,,,Where does it have the lowest unemployment rate?,Where does it have the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where does it have the highest unemployment rate?,Where does it have the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,What is the lowest unemployment rate?,What is the lowest unemployment rate?,$o,6.7 percent,What is the highest unemployment rate?,What is the highest unemployment rate?,$o,6.7 percent
4 | There are $o of people without jobs in $s,There are 6.7 percent of people without jobs in United States of America,Are there $o people unemployed in $s?,Are there 6.7 percent people unemployed in United States of America?,TRUE,TRUE,List unemployment rates,List unemployment rates,,,,,,,Where is the lowest unemployment rate?,Where is the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where is the highest unemployment rate?,Where is the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,,
5 | There are $o of people without a job in $s,There are 6.7 percent of people without a job in United States of America,Are $o people out of work in $s?,Are 6.7 percent people out of work in United States of America?,,,,,,,,,,,Where is the lowest unemployment?,Where is the lowest unemployment?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Where is the highest unemployment?,Where is the highest unemployment?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,,
6 | $s's unemployment rate is $o,United States of America's unemployment rate is 6.7 percent,,,,,,,,,,,,,Which places have the lowest unemployment rate?,Which places have the lowest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Which places have the highest unemployment rate?,Which places have the highest unemployment rate?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,,
7 | There are $o people out work in $s,There are 6.7 percent people out work in United States of America,,,,,,,,,,,,,Which places have the least people out of work?,Which places have the least people out of work?,$s [SEP] $o,United States of America [SEP] 6.7 percent,Which places have the most people out of work?,Which places have the most people out of work?,$s [SEP] $o,United States of America [SEP] 6.7 percent,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P1867.csv:
--------------------------------------------------------------------------------
 1 | 1867,https://www.wikidata.org/wiki/Property:P1831,$s <place> has number of qualified/eligible voters $o <unit>,,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | There are $o registered voters in $s,There are 1235532 registered voters in Scotland,Does $s have $o voters?,Does Scotland have 1235532 voters?,TRUE,TRUE,,,,,How many registered voters are there?,How many registered voters are there?,$o,1235532,Which place has the fewest registered voters?,Which place has the fewest registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,Which place has the most registered voters?,Which place has the most registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What is the lowest voter registration?,What is the lowest voter registration?,$o,1235532,What is the highest voter regitration?,What is the highest voter regitration?,$o,1235532
 4 | There were $o registered voters in $s,There were 1235532 registered voters in Scotland,Are there $o voters in $s?,Are there 1235532 voters in Scotland?,TRUE,TRUE,,,,,How many registered voters are there in $s?,How many registered voters are there in Scotland?,$o,1235532,Where is the lowest number of registered voters?,Where is the lowest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,Where is the highest number of registered voters?,Where is the highest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What is the smallest number of registered voters?,What is the smallest number of registered voters?,$o,1235532,What is the largest number of voters?,What is the largest number of voters?,$o,1235532
 5 | The number of eligible voters in $s is $o,The number of eligible voters in Scotland is 1235532,Are there $o registered voters in $s?,Are there 1235532 registered voters in Scotland?,TRUE,TRUE,,,,,,,,,What places have the lowest number of registered voters?,What places have the lowest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,What places have the highest number of registered voters?,What places have the highest number of registered voters?,$s [SEP] $o,Scotland [SEP] 1235532,,,,,,,,
 6 | The number of eligible voters in $s was $o,The number of eligible voters in Scotland was 1235532,Does $s have $o eligible voters?,Does Scotland have 1235532 eligible voters?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,,
 7 | $s has $o eligible voters,Scotland has 1235532 eligible voters,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 8 | The size of the electorate in $s is $o,The size of the electorate in Scotland is 1235532,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 9 | There are $o registered voters in $s,There are 1235532 registered voters in Scotland,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10 | There are $o voters in $s,There are 1235532 voters in Scotland,,,,,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P19.csv:
--------------------------------------------------------------------------------
1 | 19,https://www.wikidata.org/wiki/Property:P19,$s <person> was born in $o <place>,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s was born in $o,Julius Caesar was born in Rome,Was $s born in $o?,Was Julius Caesar born in Rome?,TRUE,TRUE,Who was born in $o?,Who was born in Rome?,$s,Julius Caesar,How many people were born in $o?,How many people were born in Rome?,$s,Julius Caesar,Which place is the birthplace of the fewest people?,Which place is the birthplace of the fewest people?,$o [SEP] $s,Rome [SEP] Julius Caesar,Which place is the birthplace of the most people?,Which place is the birthplace of the most people?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,,
4 | $o is the place of birth of $s,Rome is the place of birth of Julius Caesar,Is $o the place of birth of $s?,Is Rome the place of birth of Julius Caesar?,TRUE,TRUE,Who is from $o?,Who is from Rome?,$s,Julius Caesar,Count the number of people born in $o,Count the number of people born in Rome,$s,Julius Caesar,Which place had the least people born there?,Which place had the least people born there?,$o [SEP] $s,Rome [SEP] Julius Caesar,Which places has had the highest number of births?,Which places has had the highest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,,
5 | $o is where $s was born,Rome is where Julius Caesar was born,Is $o where $s was born?,Is Rome where Julius Caesar was born?,TRUE,TRUE,Where was $s born?,Where was Julius Caesar born?,$o,Rome,,,,,What is the place with the fewest number of births?,What is the place with the fewest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,What is the place with the highest number of births?,What is the place with the highest number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,,
6 | ,,Was $o the birthplace of $s?,Was Rome the birthplace of Julius Caesar?,TRUE,TRUE,Where is $s from?,Where is Julius Caesar from?,$o,Rome,,,,,What is the place with the least number of births?,What is the place with the least number of births?,$o [SEP] $s,Rome [SEP] Julius Caesar,What is the place with the most births?,What is the place with the most births?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,,
7 | ,,,,,,List everyone born in $o,List everyone born in Rome,$s,Julius Caesar,,,,,,,,,What place has the most people born there?,What place has the most people born there?,$o [SEP] $s,Rome [SEP] Julius Caesar,,,,,,,,
8 | ,,,,,,List people born in $o,List people born in Rome,$s,Julius Caesar,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P20.csv:
--------------------------------------------------------------------------------
1 | 20,https://www.wikidata.org/wiki/Property:P20,$s <person> was died in $o <place>,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s died in $o,René Descartes died in Stockholm,Did $s die in $o?,Did René Descartes die in Stockholm?,TRUE,TRUE,Where did $s die?,Where did René Descartes die?,$o,Stockholm,How many people have died in $o?,How many people have died in Stockholm?,$s,René Descartes,Which places have the fewest people die there?,Which places have the fewest people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,Which places have the most people die there?,Which places have the most people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,,
4 | The place of death for $s was $o,The place of death for René Descartes was Stockholm,Is $o where $s died?,Is Stockholm where René Descartes died?,TRUE,TRUE,Who died in $o?,Who died in Stockholm?,$s,René Descartes,How many people died in $o?,How many people died in Stockholm?,$s,René Descartes,Which place has had the fewest people die there?,Which place has had the fewest people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,Which place has had the most people die there?,Which place has had the most people die there?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,,
5 | $o is where $s died,Stockholm is where René Descartes died,Is $o the place of death of $s?,Is Stockholm the place of death of René Descartes?,TRUE,TRUE,Which people have died in $o?,Which people have died in Stockholm?,$s,René Descartes,Count the number of people who have died in $o,Count the number of people who have died in Stockholm,$s,René Descartes,Where is the place where the fewest people have died?,Where is the place where the fewest people have died?,$o [SEP] $s,Stockholm [SEP] René Descartes,Where is the place where the most people have died?,Where is the place where the most people have died?,$o [SEP] $s,Stockholm [SEP] René Descartes,,,,,,,,
6 | ,,,,,,Who has died in $o?,Who has died in Stockholm?,$s,René Descartes,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P21.csv:
--------------------------------------------------------------------------------
1 | 21,https://www.wikidata.org/wiki/Property:P21,"$s <person>, gender, $o <sex or gender identity>",,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s is $o,Tintin is male,Is $s $o?,Is Tintin male?,TRUE,TRUE,Who is $o?,Who is male?,$s,Tintin,How many people are $o?,How many people are male?,$s,Tintin,Which gender has the fewest people?,Which gender has the fewest people?,$o [SEP] $s,male [SEP] Tintin,Which gender has the most people?,Which gender has the most people?,$o [SEP] $s,male [SEP] Tintin,,,,,,,,
4 | The gender of $s is $o,The gender of Tintin is male,,,,,Which people are $o?,Which people are male?,$s,Tintin,,,,,What is the least popular gender?,What is the least popular gender?,$o [SEP] $s,male [SEP] Tintin,What is the most popular gender?,What is the most popular gender?,$o [SEP] $s,male [SEP] Tintin,,,,,,,,
5 | $s is a $o,Tintin is a male,,,,,List everyone who is $o,List everyone who is male,$s,Tintin,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P22.csv:
--------------------------------------------------------------------------------
 1 | 22,https://www.wikidata.org/wiki/Property:P22,$s <person> has father $o <person>,,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | $o is the father of $s,George Bush is the father of Jenna Bush,Is $o a parent of $s?,Is George Bush a parent of Jenna Bush?,TRUE,TRUE,Who are the children of $o?,Who are the children of George Bush?,$s,Jenna Bush,How many children does $o have?,How many children does George Bush have?,$s,Jenna Bush,Which parent has the fewest children?,Which parent has the fewest children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Which parent has the most children?,Which parent has the most children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,
 4 | $s's father is $o,Jenna Bush's father is George Bush,Is $o $s's parent?,Is George Bush Jenna Bush's parent?,TRUE,TRUE,Who are the parents of $s?,Who are the parents of Jenna Bush?,$o,George Bush,How many parents does $s have?,How many parents does Jenna Bush have?,$o,George Bush,Which parent doesn't have many children?,Which parent doesn't have many children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Which parent has the highest number of children?,Which parent has the highest number of children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,
 5 | $o is $s's dad,George Bush is Jenna Bush's dad,Is $o $s's father?,Is George Bush Jenna Bush's father?,TRUE,TRUE,Who is the father of $s?,Who is the father of Jenna Bush?,$o,George Bush,How many people are children of $o?,How many people are children of George Bush?,$s,Jenna Bush,Who has the fewest children?,Who has the fewest children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Who has the most children?,Who has the most children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,
 6 | $o is $s's father,George Bush is Jenna Bush's father,Is $o $s's dad?,Is George Bush Jenna Bush's dad?,TRUE,TRUE,Who are $o's children?,Who are George Bush's children?,$s,Jenna Bush,,,,,Who has the least kids?,Who has the least kids?,$o [SEP] $s,George Bush [SEP] Jenna Bush,Who has the most kids?,Who has the most kids?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,
 7 | $s is a child of $o,Jenna Bush is a child of George Bush,Is $o the father of $s?,Is George Bush the father of Jenna Bush?,TRUE,TRUE,,,,,,,,,Who has the least children?,Who has the least children?,$o [SEP] $s,George Bush [SEP] Jenna Bush,,,,,,,,,,,,
 8 | $o is a parent of $s,George Bush is a parent of Jenna Bush,Is $s a parent of $s?,Is Jenna Bush a parent of Jenna Bush?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
 9 | ,,Is $s $o's parent?,Is Jenna Bush George Bush's parent?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
10 | ,,Is $s $o's father?,Is Jenna Bush George Bush's father?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
11 | ,,Is $s $o's dad?,Is Jenna Bush George Bush's dad?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
12 | ,,Is $s the father of $o?,Is Jenna Bush the father of George Bush?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P23.csv:
--------------------------------------------------------------------------------
 1 | 23,https://www.wikidata.org/wiki/Property:P22,$s <person> has mother $o <person>,,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | $o is the mother of $s,Marge Simpson is the mother of Lisa Simpson,Is $o a parent of $s?,Is Marge Simpson a parent of Lisa Simpson?,TRUE,TRUE,Who are the children of $o?,Who are the children of Marge Simpson?,$s,Lisa Simpson,How many children does $o have?,How many children does Marge Simpson have?,$s,Lisa Simpson,Which parent has the fewest children?,Which parent has the fewest children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Which parent has the most children?,Which parent has the most children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,
 4 | $s's mother is $o,Lisa Simpson's mother is Marge Simpson,Is $o $s's parent?,Is Marge Simpson Lisa Simpson's parent?,TRUE,TRUE,Who are the parents of $s?,Who are the parents of Lisa Simpson?,$o,Marge Simpson,How many parents does $s have?,How many parents does Lisa Simpson have?,$o,Marge Simpson,Which parent doesn't have many children?,Which parent doesn't have many children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Which parent has the highest number of children?,Which parent has the highest number of children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,
 5 | $o is $s's mum,Marge Simpson is Lisa Simpson's mum,Is $o $s's mother?,Is Marge Simpson Lisa Simpson's mother?,TRUE,TRUE,Who is the mother of $s?,Who is the mother of Lisa Simpson?,$o,Marge Simpson,How many people are children of $o?,How many people are children of Marge Simpson?,$s,Lisa Simpson,Who has the fewest children?,Who has the fewest children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Who has the most children?,Who has the most children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,
 6 | $o is $s's mom,Marge Simpson is Lisa Simpson's mom,Is $o $s's mom?,Is Marge Simpson Lisa Simpson's mom?,TRUE,TRUE,Who are $o's children?,Who are Marge Simpson's children?,$s,Lisa Simpson,,,,,Who has the least kids?,Who has the least kids?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,Who has the most kids?,Who has the most kids?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,
 7 | $o is a child of $s,Marge Simpson is a child of Lisa Simpson,Is $o $s's mum?,Is Marge Simpson Lisa Simpson's mum?,TRUE,TRUE,Who is $s's mom?,Who is Lisa Simpson's mom?,,,,,,,Who has the least children?,Who has the least children?,$o [SEP] $s,Marge Simpson [SEP] Lisa Simpson,,,,,,,,,,,,
 8 | $s is a parent of $o,Lisa Simpson is a parent of Marge Simpson,Is $o the mother of $s?,Is Marge Simpson the mother of Lisa Simpson?,TRUE,TRUE,Who is $s's mum?,Who is Lisa Simpson's mum?,,,,,,,,,,,,,,,,,,,,,,
 9 | $o is $s's mother,Marge Simpson is Lisa Simpson's mother,Is $o a parent of $s?,Is Marge Simpson a parent of Lisa Simpson?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
10 | ,,Is $o $s's parent?,Is Marge Simpson Lisa Simpson's parent?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
11 | ,,Is $o $s's mother?,Is Marge Simpson Lisa Simpson's mother?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
12 | ,,Is $o $s's mom?,Is Marge Simpson Lisa Simpson's mom?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
13 | ,,Is $o $s's mum?,Is Marge Simpson Lisa Simpson's mum?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,
14 | ,,Is $o the mother of $s?,Is Marge Simpson the mother of Lisa Simpson?,FALSE,FALSE,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P26.csv:
--------------------------------------------------------------------------------
 1 | 26,https://www.wikidata.org/wiki/Property:P26,$s has spouse $o (symmetric),,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | $s has spouse $o,John has spouse Mary,Is $s partner of $o?,Is John partner of Mary?,TRUE,TRUE,Who are $s's spouses?,Who are John's spouses?,$o,Mary,How many people did $s marry?,How many people did John marry?,$o,Mary,,,,,,,,,,,,,,,,
 4 | $s is $o's spouse,John is Mary's spouse,Is $s $o's partner?,Is John Mary's partner?,TRUE,TRUE,Who is $s's spouse?,Who is John's spouse?,$o,Mary,How many partners has $s had?,How many partners has John had?,$o,Mary,,,,,,,,,,,,,,,,
 5 | $s is married to $o,John is married to Mary,Is $s $o's spouse?,Is John Mary's spouse?,TRUE,TRUE,List the spouses of $s,List the spouses of John,$o,Mary,How many partners does $s have?,How many partners does John have?,$o,Mary,,,,,,,,,,,,,,,,
 6 | $s is $o's partner,John is Mary's partner,,,,,Who is the spouse of $s?,Who is the spouse of John?,$o,Mary,,,,,,,,,,,,,,,,,,,,
 7 | ,,,,,,Who is $s married to?,Who is John married to?,$o,Mary,,,,,,,,,,,,,,,,,,,,
 8 | ,,,,,,Who married $s?,Who married John?,$o,Mary,,,,,,,,,,,,,,,,,,,,
 9 | ,,,,,,List the partners of $s,List the partners of John,$o,Mary,,,,,,,,,,,,,,,,,,,,
10 | ,,,,,,List who $s is married to,List who John is married to,$o,Mary,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P27.csv:
--------------------------------------------------------------------------------
1 | 27,https://www.wikidata.org/wiki/Property:P27,$s is a national of $o.,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $o recognizes $s as its citizen.,First French Empire recognizes Napoleon as its citizen.,Is $s a citizen of $o?,Is Napoleon a citizen of First French Empire?,TRUE,TRUE,What are the nationalities of $s?,What are the nationalities of Napoleon?,$o,First French Empire,How many nationalities does $s have?,How many nationalities does Napoleon have?,$o,First French Empire,Which country has the fewest number of citizens?,Which country has the fewest number of citizens?,$o [SEP] $s,First French Empire [SEP] Napoleon,Which country has the largest number of citizens?,Which country has the largest number of citizens?,$o [SEP] $s,First French Empire [SEP] Napoleon,,,,,,,,
4 | $s is a citizen of $o,Napoleon is a citizen of First French Empire,Does $s have the nationality of $o?,Does Napoleon have the nationality of First French Empire?,TRUE,TRUE,Who are the nationals of $o?,Who are the nationals of First French Empire?,$s,Napoleon,How many nationals does $o have?,How many nationals does First French Empire have?,$s,Napoleon,Which country has the lowest number of nationals?,Which country has the lowest number of nationals?,$o [SEP] $s,First French Empire [SEP] Napoleon,Which country has the highest number of nationals?,Which country has the highest number of nationals?,$o [SEP] $s,First French Empire [SEP] Napoleon,,,,,,,,
5 | $s has the citizenship of $o,Napoleon has the citizenship of First French Empire,,,,,Who are the citizens of $o?,Who are the citizens of First French Empire?,$s,Napoleon,How many citizens does $o have?,How many citizens does First French Empire have?,$s,Napoleon,,,,,,,,,,,,,,,,
6 | $s has the nationality of $o,Napoleon has the nationality of First French Empire,,,,,Who has citizenship of $o?,Who has citizenship of First French Empire?,$s,Napoleon,How many inhabitants are there in $o?,How many inhabitants are there in First French Empire?,$s,Napoleon,,,,,,,,,,,,,,,,
7 | ,,,,,,What is the nationality of $s?,What is the nationality of Napoleon?,$o,First French Empire,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P35.csv:
--------------------------------------------------------------------------------
1 | 35,https://www.wikidata.org/wiki/Property:P35,$o is the head of state of $s,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $o is head of state of $s.,Queen Elizabeth II is head of state of United Kingdom.,Is $o head of state of $s?,Is Queen Elizabeth II head of state of United Kingdom?,TRUE,TRUE,List the heads of state of $s,List the heads of state of United Kingdom,$o,Queen Elizabeth II,How many countries is $o head of state of?,How many countries is Queen Elizabeth II head of state of?,$s,United Kingdom,Which country has the fewest leaders?,Which country has the fewest leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,Which country has the most the leaders?,Which country has the most the leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,,,,,,,,
4 | $o is the official with the highest formal authority in $s.,Queen Elizabeth II is the official with the highest formal authority in United Kingdom.,Is $s's head of state $o?,Is United Kingdom's head of state Queen Elizabeth II?,TRUE,TRUE,Who's in charge of $s?,Who's in charge of United Kingdom?,$o,Queen Elizabeth II,How many countries does $o rule?,How many countries does Queen Elizabeth II rule?,$s,United Kingdom,Which country has had the fewest leaders?,Which country has had the fewest leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,Which country has had the most leaders?,Which country has had the most leaders?,$s [SEP] $o,United Kingdom [SEP] Queen Elizabeth II,,,,,,,,
5 | ,,Is $o the official with the highest formal authority in $s?,Is Queen Elizabeth II the official with the highest formal authority in United Kingdom?,TRUE,TRUE,Who's the leader of $s?,Who's the leader of United Kingdom?,$o,Queen Elizabeth II,How many countries is $o the ruler of?,How many countries is Queen Elizabeth II the ruler of?,$s,United Kingdom,Who rules over the fewest countries?,Who rules over the fewest countries?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,Who rules over the most countries?,Who rules over the most countries?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,,,,,,,,
6 | ,,Is the official with the highest formal authority in $s $o?,Is the official with the highest formal authority in United Kingdom Queen Elizabeth II?,TRUE,TRUE,List the leaders of $s?,List the leaders of United Kingdom?,$o,Queen Elizabeth II,How many rulers does $s have?,How many rulers does United Kingdom have?,$o,Queen Elizabeth II,Who is head of state of the fewest places?,Who is head of state of the fewest places?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,Who is head of state of the most places?,Who is head of state of the most places?,$o [SEP] $s,Queen Elizabeth II [SEP] United Kingdom,,,,,,,,
7 | ,,,,,,Where is $o the ruler?,Where is Queen Elizabeth II the ruler?,$s,United Kingdom,How many heads of states are in charge of $s?,How many heads of states are in charge of United Kingdom?,$o,Queen Elizabeth II,,,,,,,,,,,,,,,,
8 | ,,,,,,Who rules over $s?,Who rules over United Kingdom?,$o,Queen Elizabeth II,,,,,,,,,,,,,,,,,,,,
9 | ,,,,,,Where is $o the head of state?,Where is Queen Elizabeth II the head of state?,$s,United Kingdom,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P38.csv:
--------------------------------------------------------------------------------
1 | 38,https://www.wikidata.org/wiki/Property:P38,$s <country or financial institution> accepts currency $o,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s accepts $o,People's Republic of China accepts renminbi,Does $s accept $o?,Does People's Republic of China accept renminbi?,TRUE,TRUE,Which places accept $o?,Which places accept renminbi?,$s,People's Republic of China,How many currencies does $s accept?,How many currencies does People's Republic of China accept?,$o,renminbi,What is the least used currency?,What is the least used currency?,$o [SEP] $s,renminbi [SEP] People's Republic of China,What is the most used currency?,What is the most used currency?,$o [SEP] $s,renminbi [SEP] People's Republic of China,,,,,,,,
4 | $s uses the $o,People's Republic of China uses the renminbi,Is $o accepted in $s?,Is renminbi accepted in People's Republic of China?,TRUE,TRUE,What currencies are accepted by $s?,What currencies are accepted by People's Republic of China?,$o,renminbi,How many places accept $o?,How many places accept renminbi?,$s,People's Republic of China,What country uses the fewest currencies?,What country uses the fewest currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,What country has the most currencies?,What country has the most currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,,,,,,,,
5 | The currency of $s is $o,The currency of People's Republic of China is renminbi,Is $o used in $s?,Is renminbi used in People's Republic of China?,TRUE,TRUE,Which currencies are used by $s?,Which currencies are used by People's Republic of China?,$o,renminbi,How many countries accept $o?,How many countries accept renminbi?,$s,People's Republic of China,What places accept the fewest different currencies?,What places accept the fewest different currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,What places accept the most currencies?,What places accept the most currencies?,$s [SEP] $o,People's Republic of China [SEP] renminbi,,,,,,,,
6 | The official currency of $s is $o,The official currency of People's Republic of China is renminbi,Is $o used by $s?,Is renminbi used by People's Republic of China?,TRUE,TRUE,What currency is accepted by $s?,What currency is accepted by People's Republic of China?,$o,renminbi,$o is a currency of how many places?,renminbi is a currency of how many places?,$s,People's Republic of China,,,,,,,,,,,,,,,,
7 | The accepted currency of $s is $o,The accepted currency of People's Republic of China is renminbi,Is the $o currency accepted by $s?,Is the renminbi currency accepted by People's Republic of China?,TRUE,TRUE,What currency is used in $s?,What currency is used in People's Republic of China?,$o,renminbi,,,,,,,,,,,,,,,,,,,,
8 | ,,,,,,What currency is used by $s?,What currency is used by People's Republic of China?,$o,renminbi,,,,,,,,,,,,,,,,,,,,
9 | ,,,,,,Where is $o used?,Where is renminbi used?,$s,People's Republic of China,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P47.csv:
--------------------------------------------------------------------------------
1 | 47,https://www.wikidata.org/wiki/Property:P47,$s shares border with $o.,NOTES: This is a symmetric relation. Only generate relations with one direction! The training data generation script will generate the reverse relation,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s is neighbour with $o.,France is neighbour with Spain.,Are $s and $o neighbours?,Are France and Spain neighbours?,TRUE,TRUE,Which places neighbour $s?,Which places neighbour France?,$o,Spain,How many neighbours does $s have?,How many neighbours does France have?,$o,Spain,Which place has the lowest number of neighbours?,Which place has the lowest number of neighbours?,$s [SEP] $o,France [SEP] Spain,Which place has the highest number of neighbours?,Which place has the highest number of neighbours?,$s [SEP] $o,France [SEP] Spain,,,,,,,,
4 | $s is a neighbour of $o.,France is a neighbour of Spain.,Does $s share a border with $o?,Does France share a border with Spain?,TRUE,TRUE,Which places share a border with $s?,Which places share a border with France?,$o,Spain,How many places share border with $s?,How many places share border with France?,$o,Spain,What is the place with the minimum number of borders?,What is the place with the minimum number of borders?,$s [SEP] $o,France [SEP] Spain,What is the place with the maximum number of borders?,What is the place with the maximum number of borders?,$s [SEP] $o,France [SEP] Spain,,,,,,,,
5 | $s and $o are neighbours.,France and Spain are neighbours.,Do $s and $o share borders?,Do France and Spain share borders?,TRUE,TRUE,What places are next to $s?,What places are next to France?,$o,Spain,How many places border $s?,How many places border France?,$o,Spain,,,,,,,,,,,,,,,,
6 | $s and $o share borders with each other.,France and Spain share borders with each other.,Do $o and $s have common borders?,Do Spain and France have common borders?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,,
7 | $s shares a border with $o.,France shares a border with Spain.,,,,,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P50.csv:
--------------------------------------------------------------------------------
1 | 50,https://www.wikidata.org/wiki/Property:P50,$o author of $s,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $o is the author of $s,Jane Austen is the author of Pride and Prejudice,Did $o write $s?,Did Jane Austen write Pride and Prejudice?,TRUE,TRUE,What are the books written by $o?,What are the books written by Jane Austen?,$s,Pride and Prejudice,How many authors does $s have?,How many authors does Pride and Prejudice have?,$o,Jane Austen,Which book has the fewest number of authors?,Which book has the fewest number of authors?,$s [SEP] $o,Pride and Prejudice [SEP] Jane Austen,Which book has the highest number of authors?,Which book has the highest number of authors?,$s [SEP] $o,Pride and Prejudice [SEP] Jane Austen,,,,,,,,
4 | $o wrote $s,Jane Austen wrote Pride and Prejudice,Was $s written by $o?,Was Pride and Prejudice written by Jane Austen?,TRUE,TRUE,List all the books that $o is the author of.,List all the books that Jane Austen is the author of.,$s,Pride and Prejudice,How many books did $s write?,How many books did Pride and Prejudice write?,$s,Pride and Prejudice,Which author has written the least number of books?,Which author has written the least number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,Which author has written the largest number of books?,Which author has written the largest number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,,,,,,,,
5 | $s was written by $o,Pride and Prejudice was written by Jane Austen,Is $o the author of $s?,Is Jane Austen the author of Pride and Prejudice?,TRUE,TRUE,List all the authors of $s.,List all the authors of Pride and Prejudice.,$o.,Jane Austen.,How many people wrote $s?,How many people wrote Pride and Prejudice?,$o,Jane Austen,Who is author with the fewest number of books?,Who is author with the fewest number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,Who is author with the maximum number of books?,Who is author with the maximum number of books?,$o [SEP] $s,Jane Austen [SEP] Pride and Prejudice,,,,,,,,
6 | $s's author is $o,Pride and Prejudice's author is Jane Austen,,,,,Who are writers of $s?,Who are writers of Pride and Prejudice?,$o.,Jane Austen.,How many books is $o the author of?,How many books is Jane Austen the author of?,$s,Pride and Prejudice,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P54.csv:
--------------------------------------------------------------------------------
 1 | 54,https://www.wikidata.org/wiki/Property:P54,$s <person> is a member of team $o <team>,,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | $s is a member of $o,Diego Maradona is a member of FC Barcelona,Does $s play for $o?,Does Diego Maradona play for FC Barcelona?,TRUE,TRUE,Who plays for $o?,Who plays for FC Barcelona?,$s,Diego Maradona,How many people play for $o?,How many people play for FC Barcelona?,$s,Diego Maradona,Which team has the least players?,Which team has the least players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Which team has the most players?,Which team has the most players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,,
 4 | $s is a part of $o,Diego Maradona is a part of FC Barcelona,Is $s a member of $o?,Is Diego Maradona a member of FC Barcelona?,TRUE,TRUE,Who is a member of $o?,Who is a member of FC Barcelona?,$s,Diego Maradona,How many teams has $s played for?,How many teams has Diego Maradona played for?,$o,FC Barcelona,Which team has the fewest players?,Which team has the fewest players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Which team has the highest number of players?,Which team has the highest number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,,
 5 | $s plays for $o,Diego Maradona plays for FC Barcelona,Is $s part of $o?,Is Diego Maradona part of FC Barcelona?,TRUE,TRUE,$s is a member of which teams?,Diego Maradona is a member of which teams?,$o,FC Barcelona,,,,,What is the team with the fewest players?,What is the team with the fewest players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,What is the largest team?,What is the largest team?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,,
 6 | $o's membership includes $s,FC Barcelona's membership includes Diego Maradona,,,,,List teams $s plays for,List teams Diego Maradona plays for,$o,FC Barcelona,,,,,What is the smallest team?,What is the smallest team?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,What is the team with the most players?,What is the team with the most players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,,
 7 | ,,,,,,,,,,,,,,Who has played for the fewest teams?,Who has played for the fewest teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,What is the team with the most number of players?,What is the team with the most number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,,,,,,,,
 8 | ,,,,,,,,,,,,,,Who has played for the least number of different teams?,Who has played for the least number of different teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,Who has played for the most teams?,Who has played for the most teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,,
 9 | ,,,,,,,,,,,,,,Who has played for the least teams?,Who has played for the least teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,Who has played for the highest number of teams?,Who has played for the highest number of teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,,
10 | ,,,,,,,,,,,,,,Which team has the lowest number of players?,Which team has the lowest number of players?,$o [SEP] $s,FC Barcelona [SEP] Diego Maradona,Who has played for the most number of teams?,Who has played for the most number of teams?,$s [SEP] $o,Diego Maradona [SEP] FC Barcelona,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P57.csv:
--------------------------------------------------------------------------------
1 | 57,https://www.wikidata.org/wiki/Property:P57,$o is the director of $s,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s is directed by $o,Titanic is directed by James Cameron,Was $o the director of $s?,Was James Cameron the director of Titanic?,TRUE,TRUE,Which movies has $o directed?,Which movies has James Cameron directed?,$s,Titanic,How many movies has $o directed?,How many movies has James Cameron directed?,$s,Titanic,Which director has directed minimum number of movies?,Which director has directed minimum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,Which director has directed maximum number of movies?,Which director has directed maximum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,,
4 | $s's director is $o,Titanic's director is James Cameron,Was $s directed by $o?,Was Titanic directed by James Cameron?,TRUE,TRUE,Which movies have been directed by $o?,Which movies have been directed by James Cameron?,$s,Titanic,How many directors does $s have?,How many directors does Titanic have?,$o,James Cameron,Who has directed minimum number of movies?,Who has directed minimum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has directed maximum number of movies?,Who has directed maximum number of movies?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,,
5 | $o directed $s,James Cameron directed Titanic,Is $o director of $s?,Is James Cameron director of Titanic?,TRUE,TRUE,Who are the directors of $s?,Who are the directors of Titanic?,$o,James Cameron,,,,,,,,,,,,,,,,,,,,
6 | $o is the director of $s,James Cameron is the director of Titanic,Did $o direct $s?,Did James Cameron direct Titanic?,TRUE,TRUE,,,,,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P58.csv:
--------------------------------------------------------------------------------
1 | 58,https://www.wikidata.org/wiki/Property:P58,$s <film> screenwriter was $o <person>,,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | $s was written by $o,Titanic was written by James Cameron,Did $o write the script for $s?,Did James Cameron write the script for Titanic?,TRUE,TRUE,Who wrote the script for $s?,Who wrote the script for Titanic?,$o,James Cameron,How many movies has $o written?,How many movies has James Cameron written?,$s,Titanic,Who has the written the fewest movie scripts?,Who has the written the fewest movie scripts?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has written the most movie scripts?,Who has written the most movie scripts?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,,
4 | $o wrote the script of $s,James Cameron wrote the script of Titanic,Did $o write $s?,Did James Cameron write Titanic?,TRUE,TRUE,The script for $s is written by who?,The script for Titanic is written by who?,$o,James Cameron,How many movies was $o involved in?,How many movies was James Cameron involved in?,$s,Titanic,Who has written the fewest screenplays?,Who has written the fewest screenplays?,$o [SEP] $s,James Cameron [SEP] Titanic,Who has written the most screenplays?,Who has written the most screenplays?,$o [SEP] $s,James Cameron [SEP] Titanic,,,,,,,,
5 | The screenwriter for $s was $o,The screenwriter for Titanic was James Cameron,Did $o write the screenplay for $s?,Did James Cameron write the screenplay for Titanic?,TRUE,TRUE,Who is the screenwriter for $s?,Who is the screenwriter for Titanic?,$o,James Cameron,How many movies has $o written the screenplay for?,How many movies has James Cameron written the screenplay for?,$s,Titanic,Which movie has had the fewest writers,Which movie has had the fewest writers,$s [SEP] $o,Titanic [SEP] James Cameron,Which movie has had the most writers?,Which movie has had the most writers?,$s [SEP] $o,Titanic [SEP] James Cameron,,,,,,,,
6 | ,,Was $o involved in the making of $s?,Was James Cameron involved in the making of Titanic?,TRUE,TRUE,Who wrote the screenplay for $s?,Who wrote the screenplay for Titanic?,$o,James Cameron,How many writers were involved in $s?,How many writers were involved in Titanic?,$o,James Cameron,,,,,,,,,,,,,,,,
7 | ,,,,,,What films has $o written?,What films has James Cameron written?,$s,Titanic,,,,,,,,,,,,,,,,,,,,
8 | ,,,,,,$o has written which screenplays?,James Cameron has written which screenplays?,$s,Titanic,,,,,,,,,,,,,,,,,,,,
9 | ,,,,,,Which movies has $o written?,Which movies has James Cameron written?,$s,Titanic,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P6.csv:
--------------------------------------------------------------------------------
1 | 6,https://www.wikidata.org/wiki/Property:P6,"$s <place>, head of government, $o <person>",,,,,,,,,,,,,,,,,,,,,,,,,,,
2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
3 | The head of government of $s is $o,The head of government of United States of America is Donald Trump,Is the leader of $s $o?,Is the leader of United States of America Donald Trump?,TRUE,TRUE,$o is the leader of which places?,Donald Trump is the leader of which places?,$s,United States of America,How many leaders does $s have?,How many leaders does United States of America have?,$o,Donald Trump,Which place has the fewest leaders?,Which place has the fewest leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,Which place has the most leaders?,Which place has the most leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,,,,,,,,
4 | The leader of $s is $o,The leader of United States of America is Donald Trump,Is $o the leader of $s?,Is Donald Trump the leader of United States of America?,TRUE,TRUE,$o is the leader of which governments?,Donald Trump is the leader of which governments?,$s,United States of America,$o is the leader of how many places?,Donald Trump is the leader of how many places?,$s,United States of America,What is the country with the most number of leaders?,What is the country with the most number of leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,What country has the most number of leaders?,What country has the most number of leaders?,$s [SEP] $o,United States of America [SEP] Donald Trump,,,,,,,,
5 | $o is the leader of $s,Donald Trump is the leader of United States of America,Is $s governed by $o?,Is United States of America governed by Donald Trump?,TRUE,TRUE,Where is $o the head of government?,Where is Donald Trump the head of government?,$s,United States of America,How many heads of government preside in $s?,How many heads of government preside in United States of America?,$o,Donald Trump,,,,,,,,,,,,,,,,
6 | $o is the head of government of $s,Donald Trump is the head of government of United States of America,Is $o a head of government of $s?,Is Donald Trump a head of government of United States of America?,TRUE,TRUE,Who governs $s?,Who governs United States of America?,$o,Donald Trump,How many country leaders are there?,How many country leaders are there?,$o,Donald Trump,,,,,,,,,,,,,,,,
7 | $s's government is led by $o.,United States of America's government is led by Donald Trump.,Is $s's government led by $o?,Is United States of America's government led by Donald Trump?,TRUE,TRUE,Who is a governer of $s?,Who is a governer of United States of America?,$o,Donald Trump,How many heads of government are there?,How many heads of government are there?,$o,Donald Trump,,,,,,,,,,,,,,,,
8 | ,,,,,,Who is a head of government of $s?,Who is a head of government of United States of America?,$o,Donald Trump,,,,,,,,,,,,,,,,,,,,
9 | ,,,,,,Which country is $o the leader of?,Which country is Donald Trump the leader of?,$s,United States of America,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/configs/for_v1.5/NDB Relation Templates - P61.csv:
--------------------------------------------------------------------------------
 1 | 61,https://www.wikidata.org/wiki/Property:P61,$s <thing> was discovered or invented by $o <person>,,,,,,,,,,,,,,,,,,,,,,,,,,,
 2 | fact,example,bool,example,bool_answer,example_output,set,example,set_projection,example_output,count,example,count_projection,example_output,argmin,example,argmin_projection,example_output,argmax,example,argmax_projection,example_output,min,example,min_projection,example_output,max,example,max_projection,example_output
 3 | $s was discovered by $o,Uranus was discovered by William Herschel,Did $o discover $s?,Did William Herschel discover Uranus?,TRUE,TRUE,Who discovered $s?,Who discovered Uranus?,$o,William Herschel,How many things did $o discover?,How many things did William Herschel discover?,$s,Uranus,Who discovered the fewest things?,Who discovered the fewest things?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has discovered the most things?,Who has discovered the most things?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,,
 4 | $s was invented by $o,Uranus was invented by William Herschel,Did $o invent $s?,Did William Herschel invent Uranus?,TRUE,TRUE,Who invented $s?,Who invented Uranus?,$o,William Herschel,How many people discovered $s?,How many people discovered Uranus?,$o,William Herschel,What has had the fewest inventors?,What has had the fewest inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,What has had the most inventors?,What has had the most inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,,
 5 | $o invented $s,William Herschel invented Uranus,Was $s discovered by $o?,Was Uranus discovered by William Herschel?,TRUE,TRUE,Who was the discoverer of $s?,Who was the discoverer of Uranus?,$o,William Herschel,How many people invented $s?,How many people invented Uranus?,$o,William Herschel,What item has had the fewest inventors?,What item has had the fewest inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,What item has had the most inventors?,What item has had the most inventors?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,,
 6 | $o discovered $s,William Herschel discovered Uranus,Was $s invented by $o?,Was Uranus invented by William Herschel?,TRUE,TRUE,Who was the inventor of $s?,Who was the inventor of Uranus?,$o,William Herschel,How many things did $o invent?,How many things did William Herschel invent?,$s,Uranus,What has had the least discoverers?,What has had the least discoverers?,$s [SEP] $o,Uranus [SEP] William Herschel,What has had the most discoverers?,What has had the most discoverers?,$s [SEP] $o,Uranus [SEP] William Herschel,,,,,,,,
 7 | $o was the inventor of $s,William Herschel was the inventor of Uranus,Was $o the inventor of $s?,Was William Herschel the inventor of Uranus?,TRUE,TRUE,What did $o discover?,What did William Herschel discover?,$s,Uranus,,,,,Who has the fewest inventions?,Who has the fewest inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has the most inventions?,Who has the most inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,,
 8 | $o was the discoverer of $s,William Herschel was the discoverer of Uranus,Was $o the discoverer of $s?,Was William Herschel the discoverer of Uranus?,TRUE,TRUE,What did $o invent?,What did William Herschel invent?,$s,Uranus,,,,,Who has the least number of inventions?,Who has the least number of inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has had the most number of inventions?,Who has had the most number of inventions?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,,
 9 | ,,,,,,List things $o discovered,List things William Herschel discovered,$s,Uranus,,,,,Who has had the fewest discoveries?,Who has had the fewest discoveries?,$o [SEP] $s,William Herschel [SEP] Uranus,Who has had the most discoveries?,Who has had the most discoveries?,$o [SEP] $s,William Herschel [SEP] Uranus,,,,,,,,
10 | ,,,,,,List things $o invented,List things William Herschel invented,$s,Uranus,,,,,,,,,,,,,,,,,,,,
11 | ,,,,,,List all things discovered,List all things discovered,$s,Uranus,,,,,,,,,,,,,,,,,,,,
12 | ,,,,,,List all discoveries,List all discoveries,$s,Uranus,,,,,,,,,,,,,,,,,,,,
13 | ,,,,,,List all inventions,List all inventions,$s,Uranus,,,,,,,,,,,,,,,,,,,,
14 | ,,,,,,What has been invented?,What has been invented?,$s,Uranus,,,,,,,,,,,,,,,,,,,,
15 | ,,,,,,What has been discovered?,What has been discovered?,$s,Uranus,,,,,,,,,,,,,,,,,,,,


--------------------------------------------------------------------------------
/dataset-construction/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | pymongo
3 | numpy


--------------------------------------------------------------------------------
/dataset-construction/scripts/initial_sample.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/25 --num_dbs_to_make 10000 --sample_rels 1 --sample_per_rel 16 --sample_extra 1
20 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/50 --num_dbs_to_make 5000 --sample_rels 1 --sample_per_rel 16 --sample_extra 2
21 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/100 --num_dbs_to_make 2500 --sample_rels 2 --sample_per_rel 16 --sample_extra 2
22 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/250 --num_dbs_to_make 1000 --sample_rels 2 --sample_per_rel 32 --sample_extra 3
23 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/500 --num_dbs_to_make 500 --sample_rels 2 --sample_per_rel 64 --sample_extra 4
24 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/1000 --num_dbs_to_make 250 --sample_rels 2 --sample_per_rel 128 --sample_extra 4
25 | 


--------------------------------------------------------------------------------
/dataset-construction/scripts/make_databases.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | size=$1
20 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_train work/newdbs/intermediate_train_${size}.jsonl --target-size ${size}
21 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_dev work/newdbs/intermediate_dev_${size}.jsonl --target-size ${size}
22 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/${size}_test work/newdbs/intermediate_test_${size}.jsonl --target-size ${size}


--------------------------------------------------------------------------------
/dataset-construction/scripts/make_questions.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | size=$1
20 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_train_${size}.jsonl work/newdbs/final_train_${size}.jsonl
21 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_dev_${size}.jsonl work/newdbs/final_dev_${size}.jsonl
22 | python src/ndb_data/construction/make_questions.py work/newdbs/intermediate_test_${size}.jsonl work/newdbs/final_test_${size}.jsonl
23 | 


--------------------------------------------------------------------------------
/dataset-construction/scripts/make_v2.4.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | mkdir -pv work/newdbs
20 | 
21 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/25 --num_dbs_to_make 1000 --sample_rels 1 --sample_per_rel 16 --sample_extra 2
22 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_train work/newdbs/intermediate_train_25.jsonl --target-size 25
23 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_dev work/newdbs/intermediate_dev_25.jsonl --target-size 25
24 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/25_test work/newdbs/intermediate_test_25.jsonl --target-size 25
25 | 
26 | 
27 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/50 --num_dbs_to_make 500 --sample_rels 1 --sample_per_rel 16 --sample_extra 2
28 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_50.jsonl --target-size 50
29 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_50.jsonl --target-size 50
30 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_50.jsonl --target-size 50
31 | 
32 | 
33 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/100 --num_dbs_to_make 250 --sample_rels 2 --sample_per_rel 16 --sample_extra 2
34 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_100.jsonl --target-size 100
35 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_100.jsonl --target-size 100
36 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_100.jsonl --target-size 100
37 | 
38 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/250 --num_dbs_to_make 100 --sample_rels 2 --sample_per_rel 32 --sample_extra 3
39 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 250
40 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 250
41 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 250
42 | 
43 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/500 --num_dbs_to_make 50 --sample_rels 3 --sample_per_rel 40 --sample_extra 4
44 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 500
45 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 500
46 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 500
47 | 
48 | python src/ndb_data/construction/make_database_initial.py work/kelm_cache/ work/newdbs/1000 --num_dbs_to_make 25 --sample_rels 3 --sample_per_rel 50 --sample_extra 4
49 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_train work/newdbs/intermediate_train_250.jsonl --target-size 1000
50 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_dev work/newdbs/intermediate_dev_250.jsonl --target-size 1000
51 | python src/ndb_data/construction/make_database_finalize.py work/newdbs/50_test work/newdbs/intermediate_test_250.jsonl --target-size 1000
52 | 
53 | 
54 | 
55 | bash scripts/make_databases.sh 50;
56 | bash scripts/make_databases.sh 100;
57 | bash scripts/make_databases.sh 250;
58 | bash scripts/make_databases.sh 500;
59 | bash scripts/make_databases.sh 1000;
60 | 
61 | bash scripts/make_questions.sh 50;
62 | bash scripts/make_questions.sh 100;
63 | bash scripts/make_questions.sh 250;
64 | bash scripts/make_questions.sh 500;
65 | bash scripts/make_questions.sh 1000;


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/construction/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/construction/make_database_initial_cache.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import json
 20 | from collections import defaultdict
 21 | 
 22 | import numpy as np
 23 | from argparse import ArgumentParser
 24 | from nltk import ngrams
 25 | from nltk import word_tokenize
 26 | from nltk.tokenize.treebank import TreebankWordDetokenizer
 27 | from similarity.normalized_levenshtein import NormalizedLevenshtein
 28 | from tqdm import tqdm
 29 | 
 30 | from ndb_data.wikidata_common.wikidata import Wikidata
 31 | 
 32 | detok = TreebankWordDetokenizer()
 33 | 
 34 | 
 35 | def generate_hypotheses(instance):
 36 |     for s, r, o in instance["valid_hypotheses"]:
 37 |         if r not in final_templates:
 38 |             continue
 39 |         yield (s, r, o)
 40 | 
 41 | 
 42 | def normalize_subject(subject_name, fact):
 43 |     if subject_name is None:
 44 |         return None
 45 | 
 46 |     skip = {"is", "a", "of", "between", "on", "in"}
 47 | 
 48 |     n = NormalizedLevenshtein()
 49 |     mixed_case_subject = not subject_name.islower()
 50 |     if mixed_case_subject and subject_name not in fact:
 51 |         toks = word_tokenize(fact)
 52 |         all_grams = []
 53 |         for i in range(1, len(toks)):
 54 |             all_grams.extend(" ".join(a) for a in ngrams(toks, i) if a[0] not in skip)
 55 | 
 56 |         scores = [n.similarity(gram, subject_name) for gram in all_grams]
 57 |         best_post = int(np.argmax(scores))
 58 | 
 59 |         original_subject_name = all_grams[best_post]
 60 |         if scores[best_post] < 0.5 or all_grams[best_post] == "name":
 61 |             return None
 62 | 
 63 |         fact = " ".join(toks)
 64 |         fact = fact.replace(original_subject_name, subject_name)
 65 |         fact = detok.detokenize(fact.split()).replace(" 's", "'s").replace(" ,", ",")
 66 | 
 67 |         if subject_name not in fact:
 68 |             return None
 69 | 
 70 |         assert subject_name in fact, f"Subject {subject_name} was not in {fact}"
 71 |     return fact
 72 | 
 73 | 
 74 | if __name__ == "__main__":
 75 |     parser = ArgumentParser()
 76 |     parser.add_argument("in_file")
 77 |     parser.add_argument("out_file")
 78 |     parser.add_argument("--estimated_size", type=int)
 79 |     args = parser.parse_args()
 80 | 
 81 |     wiki = Wikidata()
 82 |     loaded = []
 83 |     by_subject = defaultdict(list)
 84 |     by_relation = defaultdict(list)
 85 |     by_object = defaultdict(list)
 86 | 
 87 |     with open("configs/generate_v1.5.json") as f:
 88 |         final_templates = json.load(f)
 89 | 
 90 |     # print(final_templates.keys())
 91 |     with open(args.in_file) as f:
 92 |         for ix, line in enumerate(tqdm(f, total=args.estimated_size)):
 93 |             instance = json.loads(line)
 94 |             added_id = None
 95 | 
 96 |             # Check if it contains a relation we care about
 97 |             for s, r, o in generate_hypotheses(instance):
 98 |                 if added_id is None:
 99 |                     added_id = len(loaded)
100 |                     loaded.append(instance)
101 | 
102 |             # If it doesn't skip this claim
103 |             if added_id is None:
104 |                 continue
105 | 
106 |             # Correct the claim
107 |             fact = instance["candidate"].strip()
108 |             for s, r, o in generate_hypotheses(instance):
109 |                 name = wiki.get_by_id_or_uri(s)["english_name"]
110 | 
111 |                 if name is None:
112 |                     fact = None
113 |                     break
114 | 
115 |                 fact = normalize_subject(name, fact)
116 | 
117 |                 if fact is None:
118 |                     break
119 | 
120 |                 if o.startswith("Q"):
121 |                     name = wiki.get_by_id_or_uri(o)["english_name"]
122 |                     if name is None:
123 |                         fact = None
124 |                         break
125 | 
126 |                     fact = normalize_subject(name, fact)
127 | 
128 |                 if fact is None:
129 |                     break
130 | 
131 |             instance["fact"] = fact
132 |             if fact is None or "⁇" in fact:
133 |                 continue
134 | 
135 |             # Add the filtered relations to the dictionaries
136 |             for s, r, o in generate_hypotheses(instance):
137 |                 by_subject[s].append(added_id)
138 |                 by_relation[r].append(added_id)
139 |                 by_object[o].append(added_id)
140 | 
141 |     with (open(args.out_file, "w+")) as f:
142 |         json.dump(
143 |             {
144 |                 "loaded": loaded,
145 |                 "by_subject": by_subject,
146 |                 "by_object": by_object,
147 |                 "by_relation": by_relation,
148 |             },
149 |             f,
150 |         )
151 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/data_import/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/data_import/fix_sitelinks.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from pymongo import UpdateOne
20 | from tqdm import tqdm
21 | 
22 | from ndb_data.wikidata_common.wikidata import Wikidata
23 | 
24 | 
25 | def write_updates(batch_update):
26 |     bulks = []
27 |     for k, v in batch_update:
28 |         bulks.append(UpdateOne(k, v))
29 | 
30 |     collection.bulk_write(bulks)
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     client = Wikidata()
35 |     collection = client.collection
36 | 
37 |     batch_update = []
38 | 
39 |     num_ops = 0
40 |     tqdm_iter = tqdm(
41 |         collection.find({}, {"_id": 1, "sitelinks": 1}),
42 |         total=collection.estimated_document_count(),
43 |     )
44 |     for i in tqdm_iter:
45 |         if type(i["sitelinks"]) == dict:
46 |             batch_update.append(
47 |                 (
48 |                     {"_id": i["_id"]},
49 |                     {"$set": {"sitelinks": list(i["sitelinks"].values())}},
50 |                 )
51 |             )
52 | 
53 |         if len(batch_update) > 10000:
54 |             write_updates(batch_update)
55 |             batch_update = []
56 |             num_ops += 1
57 |             tqdm_iter.desc = f"Performed update {num_ops}"
58 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/data_import/kelm_data.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | from argparse import ArgumentParser
21 | from tqdm import tqdm
22 | 
23 | from ndb_data.wikidata_common.kelm import KELMMongo
24 | 
25 | if __name__ == "__main__":
26 |     parser = ArgumentParser()
27 |     parser.add_argument("kelm_file")
28 |     args = parser.parse_args()
29 | 
30 |     client = KELMMongo()
31 |     collection = client.collection
32 | 
33 |     batch = []
34 |     insert_count = 0
35 |     with open(args.kelm_file) as f:
36 |         _tqdm_iter = tqdm(enumerate(f))
37 | 
38 |         for idx, line in _tqdm_iter:
39 |             instance = json.loads(line)
40 | 
41 |             subjects = set()
42 |             relations = set()
43 |             for hypothesis in instance["valid_hypotheses"]:
44 |                 s, r, o = hypothesis
45 |                 if s.startswith("Q"):
46 |                     subjects.add(s)
47 | 
48 |                 if o is not None and not isinstance(o, dict) and o.startswith("Q"):
49 |                     subjects.add(o)
50 | 
51 |                 relations.add(r)
52 | 
53 |             instance["entities"] = list(subjects)
54 |             instance["relations"] = list(relations)
55 | 
56 |             batch.append(instance)
57 |             if len(batch) >= 5000:
58 |                 collection.insert_many(batch)
59 |                 batch = []
60 |                 insert_count += 1
61 | 
62 |                 _tqdm_iter.desc = f"Insert batch {insert_count}"
63 | 
64 |     collection.insert_many(batch)
65 |     client.close()
66 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/data_import/wikidata_index.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import bz2
 20 | import json
 21 | from collections import defaultdict
 22 | from json import JSONDecodeError
 23 | 
 24 | import pydash
 25 | from argparse import ArgumentParser
 26 | 
 27 | from tqdm import tqdm
 28 | 
 29 | from ndb_data.wikidata_common.wikidata import Wikidata
 30 | 
 31 | 
 32 | def read_dump(wikidata_file):
 33 |     with bz2.open(wikidata_file, mode="rt") as f:
 34 |         f.read(2)
 35 |         for line in f:
 36 |             yield line.rstrip(",\n")
 37 | 
 38 | 
 39 | def get_indexable(instance):
 40 |     wikidata_id = pydash.get(instance, "id")
 41 |     english_name = pydash.get(instance, "labels.en.value")
 42 | 
 43 |     claims = pydash.get(instance, "claims")
 44 | 
 45 |     properties = set()
 46 |     property_entity = defaultdict(list)
 47 |     for property, claims in claims.items():
 48 |         properties.add(property)
 49 |         for claim in claims:
 50 |             property_entity[property].append(
 51 |                 (
 52 |                     pydash.get(claim, "mainsnak.datavalue.value"),
 53 |                     list(pydash.get(claim, "qualifiers").values())
 54 |                     if pydash.get(claim, "qualifiers") is not None
 55 |                     else None,
 56 |                 )
 57 |             )
 58 |     sitelinks = pydash.get(instance, "sitelinks")
 59 |     enwiki = pydash.get(instance, "sitelinks.enwiki.title")
 60 |     yield wikidata_id, english_name, sitelinks, enwiki, list(properties), dict(
 61 |         property_entity
 62 |     )
 63 | 
 64 | 
 65 | def index_dump(dump):
 66 |     for idx, line in enumerate(dump):
 67 |         try:
 68 |             yield from get_indexable(json.loads(line))
 69 |         except JSONDecodeError as e:
 70 |             print(e)
 71 |             pass
 72 | 
 73 | 
 74 | if __name__ == "__main__":
 75 |     parser = ArgumentParser()
 76 |     parser.add_argument("wikidata_file")
 77 |     args = parser.parse_args()
 78 | 
 79 |     wiki = Wikidata()
 80 |     collection = wiki.collection
 81 | 
 82 |     insert_count = 0
 83 |     dump = read_dump(args.wikidata_file)
 84 |     batch = []
 85 | 
 86 |     _tqdm_iter = tqdm(index_dump(dump), total=90e6)
 87 |     for w_id, e_name, sitelinks, enwiki, props, prop_dict in _tqdm_iter:
 88 |         batch.append(
 89 |             {
 90 |                 "wikidata_id": w_id,
 91 |                 "english_name": e_name,
 92 |                 "english_wiki": enwiki,
 93 |                 "property_types": props,
 94 |                 "properties": prop_dict,
 95 |                 "sitelinks": list(sitelinks.values()),
 96 |             }
 97 |         )
 98 | 
 99 |         if len(batch) >= 5000:
100 |             collection.insert_many(batch)
101 |             batch = []
102 |             insert_count += 1
103 | 
104 |             _tqdm_iter.desc = f"Insert batch {insert_count}"
105 | 
106 |     print("last")
107 |     collection.insert_many(batch)
108 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/dataset_statistics.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | from argparse import ArgumentParser
21 | from collections import Counter
22 | 
23 | 
24 | def merge_type(query_type):
25 |     return query_type
26 |     # .replace("arg", "")
27 | 
28 | 
29 | def get_bool_ans(answers):
30 |     return "NULL" if not len(answers) else ("TRUE" if "TRUE" in answers else "FALSE")
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     parser = ArgumentParser()
35 |     parser.add_argument("in_file")
36 |     args = parser.parse_args()
37 | 
38 |     type_counter = Counter()
39 |     support_set_size_counter = Counter()
40 |     bool_ans_counter = Counter()
41 |     total_queries = 0
42 |     total_dbs = 0
43 |     with open(args.in_file) as f:
44 | 
45 |         for line in f:
46 |             database = json.loads(line)
47 |             for query in database["queries"]:
48 | 
49 |                 support_set_size_counter[len(query["facts"])] += 1
50 |                 type_counter[merge_type(query["type"])] += 1
51 | 
52 |                 if query["type"] == "bool":
53 |                     bool_ans_counter[get_bool_ans(query["answer"])] += 1
54 |             total_queries += len(database["queries"])
55 |             total_dbs += 1
56 | 
57 |     for k, v in type_counter.items():
58 |         print(k, v)
59 | 
60 |     print()
61 |     for i in range(0, 20):
62 |         print(i, support_set_size_counter[i])
63 | 
64 |     print(total_queries, total_dbs)
65 |     print(bool_ans_counter)
66 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/generation/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/generation/describe_db_facts.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import glob
20 | import json
21 | import os
22 | 
23 | from tqdm import tqdm
24 | from transformers import AutoTokenizer
25 | 
26 | if __name__ == "__main__":
27 |     tokenizer = AutoTokenizer.from_pretrained("t5-base")
28 |     if os.path.exists("db_sizes.jsonl"):
29 |         os.unlink("db_sizes.jsonl")
30 | 
31 |     for file in tqdm(glob.glob("dbs/*.jsonl")):
32 |         with open(file) as f:
33 |             sizes = []
34 |             for line in f:
35 |                 db = json.loads(line)
36 |                 sizes.append(sum(len(tokenizer.tokenize(fact)) for fact in db["facts"]))
37 | 
38 |         with open("db_sizes.jsonl", "a+") as f:
39 |             f.write(json.dumps({"file": file, "sizes": sizes}) + "\n")
40 |             print(sizes)
41 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/generation/describe_dbs.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | import numpy as np
21 | from collections import Counter
22 | 
23 | 
24 | def read_databases(file):
25 |     with open(file) as f:
26 |         for line in f:
27 |             instance = json.loads(line)
28 |             yield instance
29 | 
30 | 
31 | if __name__ == "__main__":
32 | 
33 |     null_answers = 0
34 |     true_answers = 0
35 |     zero_answers = 0
36 |     other_answers = 0
37 | 
38 |     num_fact_used = []
39 |     answer_sizes = []
40 |     type_counter = Counter()
41 |     for db in read_databases("generated_dbs.jsonl"):
42 |         for query in db["queries"]:
43 |             num_fact_used.append(len(query["facts"]))
44 |             type_counter[query["type"]] += 1
45 |             answer_sizes.append(len(query["answer"]))
46 | 
47 |             if None in query["answer"]:
48 |                 null_answers += 1
49 |             elif len(query["answer"]) == 0:
50 |                 zero_answers += 1
51 |             elif True in query["answer"]:
52 |                 true_answers += 1
53 |             else:
54 |                 other_answers += 1
55 |     print(np.mean(num_fact_used), np.mean(answer_sizes))
56 |     print(type_counter)
57 |     print(true_answers, null_answers, zero_answers, other_answers)
58 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/generation/filter_db_facts.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import glob
20 | import json
21 | import os
22 | from argparse import ArgumentParser
23 | 
24 | from tqdm import tqdm
25 | from transformers import AutoTokenizer
26 | 
27 | if __name__ == "__main__":
28 |     tokenizer = AutoTokenizer.from_pretrained("t5-base")
29 |     parser = ArgumentParser()
30 |     parser.add_argument("in_dir")
31 |     parser.add_argument("out_dir")
32 |     args = parser.parse_args()
33 | 
34 |     os.makedirs(args.out_dir, exist_ok=True)
35 | 
36 |     for file in glob.glob(args.in_dir + "/*"):
37 |         with open(file) as f, open(
38 |             args.out_dir + "/" + os.path.basename(file), "w+"
39 |         ) as of:
40 |             sizes = []
41 |             for line in tqdm(f, desc=file):
42 |                 db = json.loads(line)
43 |                 tt = sum(len(tokenizer.tokenize(fact)) for fact in db["facts"])
44 |                 if tt < 900:
45 |                     of.write(line)
46 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/generation/plot_db_sizes.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | from operator import itemgetter
21 | 
22 | import numpy as np
23 | import matplotlib.pyplot as plt
24 | 
25 | if __name__ == "__main__":
26 | 
27 |     with open("db_sizes.jsonl") as f:
28 |         plot = []
29 | 
30 |         for line in f:
31 |             db = json.loads(line)
32 |             size = int(db["file"].split(".")[0].rsplit("_", maxsplit=1)[1])
33 |             l5, lq, med, uq, u95, highest = np.percentile(
34 |                 db["sizes"], (1, 25, 50, 75, 99, 100)
35 |             )
36 | 
37 |             plot.append((size, (l5, lq, med, uq, u95, highest)))
38 | 
39 |         plot.sort(key=itemgetter(0))
40 |     lowers5 = [p[0] for q, p in plot]
41 |     lowers = [p[1] for q, p in plot]
42 |     median = [p[2] for q, p in plot]
43 |     uppers = [p[3] for q, p in plot]
44 |     upper5 = [p[4] for q, p in plot]
45 |     limit = [p[5] for q, p in plot]
46 |     nums = [q for q, p in plot]
47 | 
48 |     plt.fill_between(nums, lowers, uppers, alpha=0.3, color="purple")
49 |     plt.fill_between(nums, lowers5, upper5, alpha=0.3, color="blue")
50 |     plt.plot(nums, median, color="blue")
51 |     plt.plot(nums, limit, color="blue")
52 |     plt.title("KELM Database size")
53 |     plt.ylabel("Number of subword tokens (T5 tokenizer)")
54 |     plt.xlabel("Number of facts")
55 |     plt.hlines(1024, 0, 50)
56 | 
57 |     plt.legend(["Median", "Max", "25th Percentile", "99th percentile", "Budget"])
58 | 
59 |     plt.show()
60 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/util/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/util/build_json.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import glob
 20 | import csv
 21 | import json
 22 | from argparse import ArgumentParser
 23 | from collections import defaultdict
 24 | 
 25 | import re
 26 | 
 27 | 
 28 | def read_csv(f):
 29 |     next(f)
 30 |     reader = csv.DictReader(f)
 31 | 
 32 |     templates = defaultdict(set)
 33 | 
 34 |     for template in reader:
 35 | 
 36 |         if len(template["fact"]):
 37 |             templates["fact"].add(template["fact"])
 38 | 
 39 |         if len(template["bool"]):
 40 |             if template["bool_answer"].lower() in ["true", "t", "1", "yes", "y"]:
 41 |                 templates["bool"].add((template["bool"], template["bool_answer"]))
 42 | 
 43 |         if len(template["set"]):
 44 |             templates["set"].add((template["set"], template["set_projection"]))
 45 | 
 46 |         if len(template["count"]):
 47 |             templates["count"].add((template["count"], template["count_projection"]))
 48 | 
 49 |         if len(template["min"]):
 50 |             templates["min"].add((template["min"], template["min_projection"]))
 51 | 
 52 |         if len(template["max"]):
 53 |             templates["max"].add((template["max"], template["max_projection"]))
 54 | 
 55 |         if len(template["argmin"]):
 56 |             templates["argmin"].add((template["argmin"], template["argmin_projection"]))
 57 | 
 58 |         if len(template["argmax"]):
 59 |             templates["argmax"].add((template["argmax"], template["argmax_projection"]))
 60 | 
 61 |         templates["_subject"] = "$s"
 62 |         templates["_object"] = "$o"
 63 | 
 64 |     return {k: list(v) if isinstance(v, set) else v for k, v in templates.items()}
 65 | 
 66 | 
 67 | def swap_so(statement):
 68 |     return statement.replace("$s", "$tmp_s").replace("$o", "$s").replace("$tmp_s", "$o")
 69 | 
 70 | 
 71 | def make_symmetric(k, templates):
 72 | 
 73 |     if not k.startswith("_"):
 74 |         out = []
 75 |         out.extend(templates)
 76 |         out.extend([(swap_so(t[0]), swap_so(t[1])) for t in templates if len(t) == 2])
 77 |         out.extend([swap_so(t) for t in templates if isinstance(t, str)])
 78 |         return out
 79 |     else:
 80 |         return templates
 81 | 
 82 | 
 83 | if __name__ == "__main__":
 84 |     print("Generate")
 85 |     parser = ArgumentParser()
 86 |     parser.add_argument("version")
 87 |     args = parser.parse_args()
 88 |     # Read all CSV files in dir
 89 |     files = glob.glob("configs/for_{}/*.csv".format(args.version))
 90 |     print(files)
 91 | 
 92 |     all_templates = {}
 93 |     for file in files:
 94 |         match = re.match(r".*(P[0-9]+).*", file)
 95 | 
 96 |         if match is not None:
 97 |             name = match.group(1)
 98 | 
 99 |             with open(file) as f:
100 |                 template = read_csv(f)
101 | 
102 |             if name in {"P47", "P26"}:
103 |                 all_templates[name] = {
104 |                     prop: make_symmetric(prop, rules)
105 |                     for prop, rules in template.items()
106 |                 }
107 |             else:
108 |                 all_templates[name] = template
109 | 
110 |     with open("configs/generate_{}.json".format(args.version), "w+") as of:
111 |         json.dump(all_templates, of, indent=4)
112 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/wikidata_common/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/wikidata_common/common_mongo.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import os
20 | from abc import ABC
21 | 
22 | import pymongo
23 | 
24 | 
25 | class MongoDataSource(ABC):
26 |     def __init__(self):
27 |         user = os.getenv("MONGO_USER", "")
28 |         password = os.getenv("MONGO_PASSWORD", "")
29 |         host = os.getenv("MONGO_HOST", "localhost")
30 |         port = os.getenv("MONGO_PORT", "27017")
31 |         db = os.getenv("MONGO_DB", "wikidata")
32 | 
33 |         client = pymongo.MongoClient(f"mongodb://{user}:{password}@{host}:{port}")
34 | 
35 |         self.db = client[db]
36 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/wikidata_common/kelm.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource
20 | 
21 | 
22 | class KELMMongo(MongoDataSource):
23 |     def __init__(self):
24 |         super().__init__()
25 |         self.collection = self.db["kelm"]
26 | 
27 |     def find_entity(self, entity):
28 |         results = self.collection.find({"entities": entity})
29 |         return results
30 | 
31 |     def find_entity_rel(self, entity, rels):
32 |         results = self.collection.find(
33 |             {"entities": entity, "relations": {"$in": list(rels)}}
34 |         )
35 |         return results
36 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/wikidata_common/wikidata.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource
20 | 
21 | 
22 | class Wikidata(MongoDataSource):
23 |     def __init__(self):
24 |         super().__init__()
25 |         self.collection = self.db["wiki_graph"]
26 | 
27 |     def get_by_id_or_uri(self, unit_uri):
28 |         return self.collection.find_one(
29 |             {"wikidata_id": unit_uri.replace("http://www.wikidata.org/entity/", "")}
30 |         )
31 | 
32 |     def find_custom(self, search_key, search_toks):
33 |         return self.collection.find({search_key: {"$in": search_toks}})
34 | 
35 |     def find_matching_relation(self, relation):
36 |         return self.collection.find({"propery_types": relation})
37 | 


--------------------------------------------------------------------------------
/dataset-construction/src/ndb_data/wikidata_common/wikpedia.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from ndb_data.wikidata_common.common_mongo import MongoDataSource
20 | 
21 | 
22 | class Wikipedia(MongoDataSource):
23 |     def __init__(self):
24 |         super().__init__()
25 |         self.collection = self.db["wiki_redirects"]
26 | 
27 |     def resolve_redirect(self, names):
28 |         results = self.collection.find({"title": {"$in": names}})
29 |         new_search = []
30 |         for res in results:
31 |             new_search.append(res["target"])
32 |         return new_search
33 | 


--------------------------------------------------------------------------------
/modelling/.gitignore:
--------------------------------------------------------------------------------
1 | private/
2 | 


--------------------------------------------------------------------------------
/modelling/README.md:
--------------------------------------------------------------------------------
 1 | # Neural database implementation for Database Reasoning over Text ACL paper
 2 | 
 3 | ## Package Structure
 4 | 
 5 | ```
 6 | resources/                       # DB files go here
 7 | 
 8 | src/
 9 |     neuraldb/                    # Python package containing NeuralDB project
10 |        
11 |         dataset/                 # Components pertaining to reading and loading the DB files
12 |             instance_generator/  # Generates model inputs from different DB formats
13 | 
14 |         evaluation/              # Helper functions that allow in-line evaluation of the models from the training script
15 |         modelling/               # Extra models/trainers etc
16 |         retriever/               # TF-IDF and DPR baselines
17 |         util/                    # Other utils
18 |         
19 | tests/                           # Unit tests for scorer
20 | ```
21 | 
22 | ## TF-IDF and DPR retrieval
23 | 
24 | Baseline retrieval methods can be run first, collecting the data to be used by the downstream models
25 | 
26 | ```
27 | bash scripts/baselines/retrieve.sh dpr
28 | bash scripts/baselines/retrieve.sh tfidf
29 | ```
30 | 
31 | ## Replicating ACL experiments
32 | 
33 | These will train and generate predictions for the v2.4 databases containing up to 25 facts. 
34 | The scripts use task spooler to manager a queue of jobs. If you do not have this, remove `tsp` from the scripts.
35 | ```
36 | export SEED=1
37 | bash scripts/experiments_ours.sh v2.4_25
38 | bash scripts/experiments_baseline.sh v2.4_25
39 | ```
40 | 
41 | The final scoring script would take the predictions generated by these scripts and evaluate them against the reference predictions. 
42 | 
43 | ```
44 | python -m neuraldb.final_scoring
45 | ``` 
46 | 
47 | Graphs which plot the answer accuracy by DB size are generated from 
48 | ```
49 | python -m neuraldb.final_scoring_with_db_size
50 | ``` 
51 | 
52 | ### Larger databases
53 | There are a couple of variants of this scoring script to evaluate for larger databases (v2.4_50, v2.4_100, v2.4_250, v2.4_500 and v2.4_1000):
54 | This would involve running the models trained on 25 facts with larger databases.
55 | 
56 | ```
57 | bash scripts/ours/predict_spj_rand_sweep.sh
58 | python -m neuraldb.final_scoring_with_db_size_sweep
59 | ```
60 | 
61 | ### Fusion in decoder baseline
62 | 
63 | Was performed using a modified version of the FiD code adapted from https://github.com/facebookresearch/FiD, the outputs of this can be converted to the NeuralDB format with 
64 | 
65 | ```
66 | python -m neuraldb.convert_legacy_predictions <fid_output> <output_file>
67 | ```


--------------------------------------------------------------------------------
/modelling/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.6.0
2 | torch
3 | datasets
4 | fever-drqa
5 | 


--------------------------------------------------------------------------------
/modelling/scripts/baselines/retrieve.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | dataset=$1
20 | retriever=$2
21 | 
22 | export PYTHONPATH=src
23 | mkdir -pv resources/${dataset}_${retriever}
24 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/train.jsonl resources/${dataset}_${retriever}/train.jsonl
25 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/dev.jsonl resources/${dataset}_${retriever}/dev.jsonl
26 | python src/neuraldb/retriever/${retriever}.py resources/${dataset}/test.jsonl resources/${dataset}_${retriever}/test.jsonl


--------------------------------------------------------------------------------
/modelling/scripts/baselines/train_longformer.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | export PYTHONPATH=src
20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache
21 | 
22 | data=$1
23 | generator=$2
24 | lr=$3
25 | steps=${4:-1}
26 | seed=${SEED:-1}
27 | 
28 | work_dir=work/${data}/model=longformer,generator=${generator},lr=${lr},steps=${steps}/seed-${seed}
29 | data_dir=resources/${data}
30 | 
31 | python src/neuraldb/run.py \
32 |   --model_name_or_path allenai/led-base-16384 \
33 |   --learning_rate ${lr} \
34 |   --gradient_accumulation_steps ${steps} \
35 |   --output_dir ${work_dir} \
36 |   --train_file ${data_dir}/train.jsonl \
37 |   --validation_file ${data_dir}/dev.jsonl \
38 |   --instance_generator ${generator} \
39 |   --do_train \
40 |   --do_eval \
41 |   --num_train_epochs 3 \
42 |   --evaluation_strategy epoch \
43 |   --per_device_train_batch_size 8 \
44 |   --per_device_eval_batch_size 8 \
45 |   --predict_with_generate \
46 |   --save_total_limit 2 \
47 |   --seed ${seed} \
48 |   --save_steps 10000
49 | #--overwrite_output_dir \
50 | 
51 | rm -rfv ${work_dir}/checkpoint-*


--------------------------------------------------------------------------------
/modelling/scripts/baselines/train_t5.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | export PYTHONPATH=src
20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache
21 | 
22 | data=$1
23 | generator=$2
24 | lr=$3
25 | steps=${4:-1}
26 | seed=${SEED:-1}
27 | 
28 | work_dir=work/${data}/model=t5,generator=${generator},lr=${lr},steps=${steps}/seed-${seed}
29 | data_dir=resources/${data}
30 | 
31 | python src/neuraldb/run.py \
32 |   --model_name_or_path t5-base \
33 |   --learning_rate ${lr} \
34 |   --gradient_accumulation_steps ${steps} \
35 |   --output_dir ${work_dir} \
36 |   --train_file ${data_dir}/train.jsonl \
37 |   --validation_file ${data_dir}/dev.jsonl \
38 |   --instance_generator ${generator} \
39 |   --do_train \
40 |   --do_eval \
41 |   --num_train_epochs 3 \
42 |   --evaluation_strategy epoch \
43 |   --per_device_train_batch_size 32 \
44 |   --per_device_eval_batch_size 32 \
45 |   --predict_with_generate \
46 |   --save_total_limit 2 \
47 |   --seed ${seed} \
48 |   --save_steps 10000
49 | #--overwrite_output_dir \
50 | 
51 | rm -rfv ${work_dir}/checkpoint-*


--------------------------------------------------------------------------------
/modelling/scripts/baselines/train_t5_retriever.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | export PYTHONPATH=src
20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache
21 | 
22 | data=$1
23 | generator=$2
24 | retriever=$3
25 | lr=$4
26 | steps=${5:-1}
27 | seed=${SEED:-1}
28 | 
29 | work_dir=work/${data}/model=t5,generator=${generator},retriever=${retriever},lr=${lr},steps=${steps}/seed-${seed}
30 | data_dir=resources/${data}_${retriever}
31 | 
32 | python src/neuraldb/run.py \
33 |   --model_name_or_path t5-base \
34 |   --learning_rate ${lr} \
35 |   --gradient_accumulation_steps ${steps} \
36 |   --output_dir ${work_dir} \
37 |   --train_file ${data_dir}/train.jsonl \
38 |   --validation_file ${data_dir}/dev.jsonl \
39 |   --instance_generator ${generator} \
40 |   --do_train \
41 |   --do_eval \
42 |   --num_train_epochs 3 \
43 |   --evaluation_strategy epoch \
44 |   --per_device_train_batch_size 8 \
45 |   --per_device_eval_batch_size 8 \
46 |   --predict_with_generate \
47 |   --save_total_limit 2 \
48 |   --seed ${seed} \
49 |   --save_steps 10000
50 | #--overwrite_output_dir \
51 | 
52 | rm -rfv ${work_dir}/checkpoint-*


--------------------------------------------------------------------------------
/modelling/scripts/convert_ssg_predictions.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | 
20 | dataset=${1:-v2.4}
21 | 
22 | function convert(){
23 |   size=$1
24 | 
25 |   echo "Convert ${dataset} ${size}"
26 |   mkdir -pv resources/${dataset}_${size}_ssg
27 |   python src/neuraldb/convert_ssg_predictions.py resources/ssg_predictions/${dataset}_${size}/test_0.8_st_ssg_sup.json resources/${dataset}_${size}_ssg/test.jsonl --master_file resources/${dataset}_${size}/test.jsonl
28 | }
29 | 
30 | convert 25
31 | convert 50
32 | convert 100
33 | convert 250
34 | convert 500
35 | convert 1000


--------------------------------------------------------------------------------
/modelling/scripts/experiments_baselines.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | function do_predictions() {
20 |   model_path=$1
21 |   generator=$2
22 |   predictions_path=$3
23 |   tsp python src/neuraldb/run.py \
24 |     --model_name_or_path $model_path \
25 |     --output_dir $model_path \
26 |     --predictions_file $model_path/predictions.jsonl \
27 |     --do_predict --test_file resources/${predictions_path}/test.jsonl \
28 |     --instance_generator $generator \
29 |     --per_device_eval_batch_size 4 \
30 |     --predict_with_generate
31 | 
32 | }
33 | 
34 | dataset=${1:-v2.4_25}
35 | export seed=${SEED:-1}
36 | 
37 | SEED=${seed} tsp bash scripts/baselines/train_t5.sh $dataset perfectir 1e-4
38 | SEED=${seed} tsp bash scripts/baselines/train_t5.sh $dataset wholedb 1e-4
39 | 
40 | do_predictions work/${dataset}/model=t5,generator=perfectir,lr=1e-4,steps=1/seed-${seed} perfectir ${dataset}
41 | do_predictions work/${dataset}/model=t5,generator=wholedb,lr=1e-4,steps=1/seed-${seed} wholedb ${dataset}
42 | 
43 | SEED=${seed} tsp bash scripts/baselines/train_longformer.sh $dataset perfectir 1e-4
44 | SEED=${seed} tsp bash scripts/baselines/train_longformer.sh $dataset wholedb 1e-4
45 | 
46 | do_predictions work/${dataset}/model=longformer,generator=perfectir,lr=1e-4,steps=1/seed-${seed} perfectir ${dataset}
47 | do_predictions work/${dataset}/model=longformer,generator=wholedb,lr=1e-4,steps=1/seed-${seed} wholedb ${dataset}
48 | 
49 | SEED=${seed} tsp bash scripts/baselines/train_t5_retriever.sh $dataset externalir dpr 1e-4
50 | SEED=${seed} tsp bash scripts/baselines/train_t5_retriever.sh $dataset externalir tfidf 1e-4
51 | 
52 | do_predictions work/${dataset}/model=t5,generator=externalir,retriever=dpr,lr=1e-4,steps=1/seed-${seed} externalir ${dataset}_dpr
53 | do_predictions work/${dataset}/model=t5,generator=externalir,retriever=tfidf,lr=1e-4,steps=1/seed-${seed} externalir ${dataset}_tfidf
54 | 


--------------------------------------------------------------------------------
/modelling/scripts/experiments_ours.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | dataset=${1:-v2.4_25}
20 | export seed=${SEED:-1}
21 | 
22 | SEED=${seed} tsp bash scripts/ours/train_spj.sh $dataset spj_rand 1e-4
23 | SEED=${seed} tsp bash scripts/ours/train_spj.sh $dataset spj 1e-4
24 | 


--------------------------------------------------------------------------------
/modelling/scripts/ours/predict_spj.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | dataset=v2.4_25
20 | 
21 | function do_predictions_spj() {
22 |   model_path=$1
23 |   generator=spj
24 |   tsp python src/neuraldb/run.py \
25 |     --model_name_or_path $model_path \
26 |     --output_dir $model_path \
27 |     --predictions_file $model_path/intermediate_predictions.jsonl \
28 |     --do_predict --test_file resources/${dataset}/test.jsonl \
29 |     --instance_generator $generator \
30 |     --per_device_eval_batch_size 64 \
31 |     --predict_with_generate
32 | 
33 |   tsp python src/neuraldb/convert_spj_to_predictions.py $model_path/intermediate_predictions.jsonl $model_path/predictions.jsonl
34 | }
35 | 
36 | function do_predictions_ssg_spj() {
37 |   model_path=$1
38 |   out_path=$2
39 |   generator=spj
40 | 
41 |   mkdir -pv $out_path
42 |   tsp python src/neuraldb/run.py \
43 |     --model_name_or_path $model_path \
44 |     --output_dir $model_path \
45 |     --predictions_file $out_path/intermediate_predictions.jsonl \
46 |     --do_predict \
47 |     --test_file resources/${dataset}_ssg/test.jsonl \
48 |     --instance_generator $generator \
49 |     --per_device_eval_batch_size 64 \
50 |     --predict_with_generate
51 | 
52 |   tsp python src/neuraldb/convert_spj_to_predictions.py $out_path/intermediate_predictions.jsonl $out_path/predictions.jsonl --actual_file resources/${dataset}/test.jsonl
53 | }
54 | 
55 | seed=${SEED:-1}
56 | do_predictions_spj work/${dataset}/model=t5,generator=spj,lr=1e-4,steps=1/seed-${seed}
57 | 


--------------------------------------------------------------------------------
/modelling/scripts/ours/predict_spj_rand_sweep.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | dataset=v2.4_25
20 | 
21 | function do_predictions_ssg_spj() {
22 |   dataset1=$1
23 |   dataset2=$2
24 |   model_path=$3
25 |   generator=spj
26 |   mkdir -pv work/${dataset2}/$model_path/
27 |   tsp python src/neuraldb/run.py \
28 |     --model_name_or_path work/${dataset1}/$model_path \
29 |     --output_dir work/${dataset1}/$model_path \
30 |     --predictions_file work/${dataset2}/$model_path/intermediate_predictions.jsonl \
31 |     --do_predict --test_file resources/${dataset2}_ssg/test.jsonl \
32 |     --instance_generator $generator \
33 |     --per_device_eval_batch_size 64 \
34 |     --predict_with_generate
35 | 
36 |   tsp python src/neuraldb/convert_spj_to_predictions.py work/${dataset2}/$model_path/intermediate_predictions.jsonl work/${dataset2}/$model_path/predictions.jsonl --actual_file resources/${dataset2}/test.jsonl
37 | }
38 | 
39 | 
40 | seed=${SEED:-1}
41 | do_predictions_ssg_spj v2.4_25 v2.4_25 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}
42 | do_predictions_ssg_spj v2.4_25 v2.4_50 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}
43 | do_predictions_ssg_spj v2.4_25 v2.4_100 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}
44 | do_predictions_ssg_spj v2.4_25 v2.4_250 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}
45 | do_predictions_ssg_spj v2.4_25 v2.4_500 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}
46 | do_predictions_ssg_spj v2.4_25 v2.4_1000 model=t5,generator=spj_rand,lr=1e-4,steps=1/seed-${seed}


--------------------------------------------------------------------------------
/modelling/scripts/ours/train_spj.sh:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | ##
 4 | ## This file is part of NeuralDB.
 5 | ## See https://github.com/facebookresearch/NeuralDB for further info.
 6 | ##
 7 | ## Licensed under the Apache License, Version 2.0 (the "License");
 8 | ## you may not use this file except in compliance with the License.
 9 | ## You may obtain a copy of the License at
10 | ##
11 | ##     http://www.apache.org/licenses/LICENSE-2.0
12 | ##
13 | ## Unless required by applicable law or agreed to in writing, software
14 | ## distributed under the License is distributed on an "AS IS" BASIS,
15 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | ## See the License for the specific language governing permissions and
17 | ## limitations under the License.
18 | ##
19 | export PYTHONPATH=src
20 | export TRANSFORMERS_CACHE=/local/scratch/jt719/.cache
21 | 
22 | dataset=$1
23 | generator=$2
24 | lr=$3
25 | steps=${4:-1}
26 | seed=${SEED:-1}
27 | 
28 | work_dir=work/${dataset}/model=t5,generator=${generator},lr=${lr},steps=${steps}/seed-${seed}
29 | data_dir=resources/${dataset}
30 | 
31 | python src/neuraldb/run.py \
32 |   --model_name_or_path t5-base \
33 |   --learning_rate ${lr} \
34 |   --gradient_accumulation_steps ${steps} \
35 |   --output_dir ${work_dir} \
36 |   --train_file ${data_dir}/train.jsonl \
37 |   --validation_file ${data_dir}/dev.jsonl \
38 |   --instance_generator ${generator} \
39 |   --do_train \
40 |   --do_eval \
41 |   --num_train_epochs 3 \
42 |   --evaluation_strategy epoch \
43 |   --per_device_train_batch_size 8 \
44 |   --per_device_eval_batch_size 8 \
45 |   --predict_with_generate \
46 |   --save_total_limit 2 \
47 |   --seed ${seed} \
48 |   --save_steps 10000
49 | 
50 | rm -rfv ${work_dir}/checkpoint-*


--------------------------------------------------------------------------------
/modelling/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import shutil
20 | from pathlib import Path
21 | from setuptools import find_packages, setup
22 | 
23 | stale_egg_info = Path(__file__).parent / "neuraldb.egg-info"
24 | if stale_egg_info.exists():
25 |     shutil.rmtree(stale_egg_info)
26 | 
27 | setup(
28 |     name="neuraldb",
29 |     version="0.0.0",
30 |     author="",
31 |     author_email="jt719@cam.ac.uk",
32 |     description="NeuralDB Baseline implementation",
33 |     long_description=open("README.md", "r", encoding="utf-8").read(),
34 |     long_description_content_type="text/markdown",
35 |     keywords="NLP neuraldb neural database deep learning transformer",
36 |     license="Apache",
37 |     url="",
38 |     package_dir={"": "src"},
39 |     packages=find_packages("src"),
40 |     extras_require=[],
41 |     entry_points={"console_scripts": [""]},
42 |     python_requires=">=3.6.0",
43 |     install_requires=["transformers==4.6.0","torch","datasets","fever-drqa"],
44 |     classifiers=[
45 |         "Programming Language :: Python :: 3.8",
46 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
47 |     ],
48 | )
49 | 


--------------------------------------------------------------------------------
/modelling/src/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/__pycache__/run.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/__pycache__/run.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/convert_legacy_predictions.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | from argparse import ArgumentParser
21 | 
22 | from functools import reduce
23 | 
24 | if __name__ == "__main__":
25 | 
26 |     parser = ArgumentParser()
27 |     parser.add_argument("in_file")
28 |     parser.add_argument("out_file")
29 |     args = parser.parse_args()
30 |     with open(args.in_file) as f, open(
31 |         args.out_file,
32 |         "w+",
33 |     ) as of:
34 |         for line in f:
35 |             results = json.loads(line)
36 |             for prediction in results["test"]["raw"]:
37 |                 predicted, actual, ems, eml, meta = prediction
38 | 
39 |                 print(meta)
40 | 
41 |                 instance = {
42 |                     "prediction": predicted.split("[LIST]"),
43 |                     "actual": actual.split("[LIST]"),
44 |                     "metadata": {
45 |                         "dbsize": len(
46 |                             set(reduce(lambda a, b: a + b, meta["query"]["gold_facts"]))
47 |                         )
48 |                         if len(meta["query"]["gold_facts"])
49 |                         else 0,
50 |                         "type": meta["query"]["metadata"]["query_type"],
51 |                     },
52 |                 }
53 | 
54 |                 of.write(json.dumps(instance) + "\n")
55 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/convert_ssg_predictions.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | import logging
21 | from argparse import ArgumentParser
22 | from collections import defaultdict
23 | 
24 | from neuraldb.util.log_helper import setup_logging
25 | 
26 | logger = logging.getLogger(__name__)
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     setup_logging()
31 | 
32 |     parser = ArgumentParser()
33 |     parser.add_argument("predictions_file")
34 |     parser.add_argument("output_file")
35 |     parser.add_argument("--master_file", required=True)
36 |     args = parser.parse_args()
37 |     questions_answers = defaultdict(list)
38 | 
39 |     use_predicted_type = True
40 |     master_file = args.master_file  # "resources/v2.1_25/test.jsonl"
41 |     out_file = args.output_file  # "resources/v2.1_25/dev_ssg_predictions.jsonl"
42 |     predictions_file = (
43 |         args.predictions_file
44 |     )  # "resources/v2.1_25/dev_0.76_st_ssg_sup.json"
45 | 
46 |     predicted_instances = {}
47 |     with open(predictions_file) as f:
48 |         predictions = json.load(f)
49 |         for inst in predictions:
50 |             predicted_instances[(inst["db_id"], inst["question_id"])] = inst
51 | 
52 |     with open(master_file) as f, open(out_file, "w+") as of:
53 |         for db_idx, line in enumerate(f):
54 |             database = json.loads(line)
55 | 
56 |             for q_idx, query in enumerate(database["queries"]):
57 |                 query["predicted_facts"] = [
58 |                     [a[0] for a in b]
59 |                     for b in predicted_instances[(db_idx, q_idx)]["ssg_output"]
60 |                 ]
61 | 
62 |             of.write(json.dumps(database) + "\n")
63 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__pycache__/data_collator_seq2seq.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/data_collator_seq2seq.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__pycache__/neuraldb_file_reader.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/neuraldb_file_reader.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__pycache__/neuraldb_parser.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/neuraldb_parser.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/__pycache__/seq2seq_dataset.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/__pycache__/seq2seq_dataset.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__pycache__/instance_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/instance_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__pycache__/perfectir_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/perfectir_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__pycache__/spj_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/spj_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/__pycache__/wholedb_generator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/dataset/instance_generator/__pycache__/wholedb_generator.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/externalir_generator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import itertools
20 | import logging
21 | 
22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
26 | 
27 | class ExternalIRGenerator(InstanceGenerator):
28 |     def _process_query(self, query_obj, update_tokens):
29 |         query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"])
30 |         answer_tokens = [
31 |             self.maybe_tokenize_answer(answer) for answer in query_obj["answer"]
32 |         ]
33 | 
34 |         facts = set(itertools.chain(*query_obj["predicted_facts"]))
35 |         context_tokens = [update_tokens[fact] for fact in facts][:10]
36 | 
37 |         yield self.maybe_decorate_with_metadata(
38 |             {
39 |                 "query": query_tokens,
40 |                 "context": context_tokens,
41 |                 "output": self.concatenate_answer(answer_tokens),
42 |             },
43 |             query_obj,
44 |         )
45 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/externalir_generator_maxtok.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import itertools
20 | import logging
21 | 
22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
26 | 
27 | class ExternalIRGeneratorMaxTok(InstanceGenerator):
28 |     def _process_query(self, query_obj, update_tokens):
29 |         query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"])
30 |         answer_tokens = [
31 |             self.maybe_tokenize_answer(answer) for answer in query_obj["answer"]
32 |         ]
33 | 
34 |         facts = set(itertools.chain(*query_obj["predicted_facts"]))
35 |         context_tokens = []
36 |         for fact in facts:
37 |             context_tokens.append(update_tokens[fact])
38 |             if len(list(itertools.chain(*context_tokens))) > 900:
39 |                 break
40 | 
41 |         yield self.maybe_decorate_with_metadata(
42 |             {
43 |                 "query": query_tokens,
44 |                 "context": context_tokens,
45 |                 "output": self.concatenate_answer(answer_tokens),
46 |             },
47 |             query_obj,
48 |         )
49 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/perfectir_generator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import itertools
20 | import logging
21 | 
22 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
26 | 
27 | class PerfectIRGenerator(InstanceGenerator):
28 |     def _process_query(self, query_obj, update_tokens):
29 |         query_tokens = self.tokenizer.tokenize("Answer question: " + query_obj["query"])
30 |         answer_tokens = [
31 |             self.maybe_tokenize_answer(answer) for answer in query_obj["answer"]
32 |         ]
33 | 
34 |         facts = set(itertools.chain(*query_obj["facts"]))
35 |         context_tokens = [update_tokens[fact] for fact in facts]
36 | 
37 |         yield self.maybe_decorate_with_metadata(
38 |             {
39 |                 "query": query_tokens,
40 |                 "context": context_tokens,
41 |                 "output": self.concatenate_answer(answer_tokens),
42 |             },
43 |             query_obj,
44 |         )
45 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/subsampler.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import random
20 | 
21 | 
22 | class Subsampler:
23 | 
24 |     # Take a list of sample types and probability of keeping
25 |     def __init__(self, sample_types):
26 |         self.sample_types = sample_types
27 | 
28 |     #
29 |     def maybe_drop_sample(self, query):
30 |         if query["type"] in self.sample_types:
31 |             sample_rate = self.sample_types[query["type"]]
32 |             rand = random.random()
33 | 
34 |             if isinstance(sample_rate, list):
35 |                 if not len(query["answer"]):
36 |                     sample_rate = sample_rate[2]
37 |                 else:
38 |                     if "TRUE" in query["answer"]:
39 |                         sample_rate = sample_rate[0]
40 |                     else:
41 |                         sample_rate = sample_rate[1]
42 | 
43 |             # Drop sample if needed
44 |             return rand < sample_rate
45 | 
46 |         return False
47 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/instance_generator/wholedb_generator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import logging
20 | 
21 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | 
26 | class WholeDBGenerator(InstanceGenerator):
27 |     def _process_query(self, query_obj, update_tokens):
28 |         query_tokens = self.tokenizer.tokenize(query_obj["query"])
29 |         answer_tokens = [
30 |             self.maybe_tokenize_answer(answer) for answer in query_obj["answer"]
31 |         ]
32 | 
33 |         context_tokens = update_tokens[: query_obj["height"] + 1]
34 | 
35 |         yield self.maybe_decorate_with_metadata(
36 |             {
37 |                 "query": query_tokens,
38 |                 "context": context_tokens,
39 |                 "output": self.concatenate_answer(answer_tokens),
40 |             },
41 |             query_obj,
42 |         )
43 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/neuraldb_file_reader.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | import logging
21 | import os
22 | 
23 | from neuraldb.dataset.instance_generator.instance_generator import InstanceGenerator
24 | from neuraldb.dataset.neuraldb_parser import NeuralDBParser
25 | 
26 | logger = logging.getLogger(__name__)
27 | 
28 | 
29 | class NeuralDBFileReader:
30 |     def __init__(self, instance_generator: InstanceGenerator):
31 |         self.database_reader = NeuralDBParser()
32 |         self.instance_generator = instance_generator
33 | 
34 |     def read(self, file_path):
35 |         logger.info("Reading instances from {}".format(file_path))
36 | 
37 |         database_count = 0
38 |         with open(file_path) as f:
39 |             for idx, line in enumerate(f):
40 |                 database_count += 1
41 |                 database = json.loads(line)
42 | 
43 |                 loaded_database = self.database_reader.load_instances(database)
44 |                 yield from self.instance_generator.generate(
45 |                     loaded_database, database_idx=idx
46 |                 )
47 | 
48 |                 if os.getenv("DEBUG", None) is not None and idx > 3:
49 |                     break
50 | 
51 |         logger.info("Dataset file contains {} databases".format(database_count))
52 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/neuraldb_parser.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import random
20 | import logging
21 | from typing import List, Any, Dict
22 | 
23 | logger = logging.getLogger(__name__)
24 | 
25 | 
26 | class NeuralDBParser:
27 |     def __init__(self, max_queries=None):
28 |         self._max_queries = max_queries
29 | 
30 |     def load_instances(self, database: Dict[str, List[Any]]):
31 |         return self._load_instances(database)
32 | 
33 |     def _load_instances(self, database: Dict[str, List[Any]]):
34 |         logger.debug("Loading updates")
35 |         updates = map(self._read_update, database["facts"])
36 | 
37 |         logger.debug("Loading queries")
38 |         queries = filter(
39 |             lambda query: query is not None,
40 |             map(self._read_query, self._maybe_sample(database["queries"])),
41 |         )
42 | 
43 |         return {"updates": updates, "queries": queries, "metadata": {}}
44 | 
45 |     def _maybe_sample(self, queries: List[Any]):
46 |         if self._max_queries is not None:
47 |             queries = random.sample(queries, min(len(queries), self._max_queries))
48 |         return queries
49 | 
50 |     def _read_update(self, update):
51 |         return update
52 | 
53 |     def _read_query(self, query):
54 |         answer, answer_type = self._process_answer(query["answer"])
55 |         query["answer"] = answer
56 |         query["answer_type"] = answer_type
57 |         return query
58 | 
59 |         # return {
60 |         #     "id": query["id"],
61 |         #     "height": query["height"],
62 |         #     "input": query["query"],
63 |         #     "output": answer,
64 |         #     "metadata": {
65 |         #         "answer_type": answer_type,
66 |         #         "original_instance": query
67 |         #     }
68 |         # }
69 | 
70 |     def _process_answer(self, answer):
71 |         return answer, None
72 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset/seq2seq_dataset.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from operator import itemgetter
20 | from torch.utils.data import Dataset
21 | from tqdm import tqdm
22 | import logging
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
26 | 
27 | class Seq2SeqDataset(Dataset):
28 |     def __init__(self, generator, auto_pad=None):
29 |         self.generator = generator
30 |         self.auto_pad = auto_pad
31 | 
32 |         if self.auto_pad:
33 |             self.features = list(
34 |                 map(
35 |                     self.auto_pad,
36 |                     tqdm(generator, desc="Reading and padding instances"),
37 |                 )
38 |             )
39 |         else:
40 |             self.features = list(tqdm(generator, desc="Reading instances"))
41 | 
42 |     def __len__(self):
43 |         return len(self.features)
44 | 
45 |     def __getitem__(self, item):
46 |         return self.features[item]
47 | 
48 |     def to_dict(self):
49 |         assert len(self.features)
50 |         keys = self.features[0].keys()
51 |         return {key: list(map(itemgetter(key), self.features)) for key in keys}
52 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/dataset_statistics.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import logging
20 | import json
21 | from collections import defaultdict
22 | import random
23 | 
24 | from datasets import tqdm
25 | 
26 | from neuraldb.dataset.neuraldb_parser import NeuralDBParser
27 | from neuraldb.util.log_helper import setup_logging
28 | 
29 | logger = logging.getLogger(__name__)
30 | 
31 | 
32 | def get_instances_from_file(file):
33 |     parser = NeuralDBParser()
34 | 
35 |     with open(file) as f:
36 |         for line in f:
37 |             database = json.loads(line)
38 |             yield from parser.load_instances(database)["queries"]
39 | 
40 | 
41 | def get_bool_breakdown(answers):
42 |     if len(answers) == 0:
43 |         return "NULL"
44 | 
45 |     answer_str = " ".join(answers)
46 | 
47 |     if "TRUE" in answer_str:
48 |         return "TRUE"
49 | 
50 |     elif "FALSE" in answer_str:
51 |         return "FALSE"
52 | 
53 |     assert False, "malformed"
54 | 
55 | 
56 | def get_file_stats(file, drop_argmax_chance=None):
57 |     stats = defaultdict(lambda: defaultdict(int))
58 |     for instance in tqdm(get_instances_from_file(file)):
59 |         if drop_argmax_chance and instance["type"] in ["argmax", "argmin"]:
60 |             if random.randint(0, 100) < drop_argmax_chance * 100:
61 |                 continue
62 | 
63 |         stats["type"][instance["type"]] += 1
64 |         stats["relation"][instance["relation"]] += 1
65 |         stats["num_support_sets"][len(instance["facts"])] += 1
66 | 
67 |         if instance["type"] == "bool":
68 |             stats["bool_breakdown"][get_bool_breakdown(instance["answer"])] += 1
69 | 
70 |     return stats
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     setup_logging()
75 |     print()
76 | 
77 |     file = "resources/v2.1_25_big/train.jsonl"
78 | 
79 |     stats = get_file_stats(file, 0.8)
80 | 
81 |     print(stats)
82 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/__pycache__/postprocess_baselines.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/postprocess_baselines.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/__pycache__/scoring_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/modelling/src/neuraldb/evaluation/__pycache__/scoring_functions.cpython-38.pyc


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/postprocess_spj.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import json
 20 | import logging
 21 | import random
 22 | 
 23 | import numpy as np
 24 | 
 25 | from neuraldb.evaluation.scoring_functions import (
 26 |     average_score,
 27 |     exact_match,
 28 |     exact_match_case_insensitive,
 29 |     breakdown_score,
 30 | )
 31 | 
 32 | logger = logging.getLogger(__name__)
 33 | 
 34 | 
 35 | def get_spj_evaluation(data_args, tokenizer, generator):
 36 |     def postprocess_text(preds, labels):
 37 |         preds = [
 38 |             [
 39 |                 answer.strip() if len(answer.strip()) else generator.null_answer_special
 40 |                 for answer in pred.replace(
 41 |                     tokenizer.bos_token if tokenizer.bos_token is not None else "", ""
 42 |                 )
 43 |                 .replace(
 44 |                     tokenizer.eos_token if tokenizer.eos_token is not None else "", ""
 45 |                 )
 46 |                 .replace(
 47 |                     tokenizer.pad_token if tokenizer.pad_token is not None else "", ""
 48 |                 )
 49 |                 .strip()
 50 |                 .split(generator.answer_delimiter)
 51 |             ]
 52 |             for pred in preds
 53 |         ]
 54 |         labels = [
 55 |             [
 56 |                 answer.strip() if len(answer.strip()) else generator.null_answer_special
 57 |                 for answer in label.replace(
 58 |                     tokenizer.bos_token if tokenizer.bos_token is not None else "", ""
 59 |                 )
 60 |                 .replace(
 61 |                     tokenizer.eos_token if tokenizer.eos_token is not None else "", ""
 62 |                 )
 63 |                 .replace(
 64 |                     tokenizer.pad_token if tokenizer.pad_token is not None else "", ""
 65 |                 )
 66 |                 .strip()
 67 |                 .split(generator.answer_delimiter)
 68 |             ]
 69 |             for label in labels
 70 |         ]
 71 | 
 72 |         return preds, labels
 73 | 
 74 |     def compute_metrics(eval_preds):
 75 |         preds, labels, metadata = eval_preds
 76 | 
 77 |         if isinstance(preds, tuple):
 78 |             preds = preds[0]
 79 | 
 80 |         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=False)
 81 |         if data_args.ignore_pad_token_for_loss:
 82 |             # Replace -100 in the labels as we can't decode them.
 83 |             labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 84 |         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)
 85 | 
 86 |         # Some simple post-processing
 87 |         decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
 88 |         if data_args.predictions_file is not None:
 89 |             with open(data_args.predictions_file, "w+") as f:
 90 |                 for pred, label, meta in zip(decoded_preds, decoded_labels, metadata):
 91 |                     f.write(
 92 |                         json.dumps(
 93 |                             {"prediction": pred, "actual": label, "metadata": meta}
 94 |                         )
 95 |                         + "\n"
 96 |                     )
 97 | 
 98 |         sampled_ids = random.sample(list(range(len(decoded_preds))), 10)
 99 |         for id in sampled_ids:
100 |             logger.info(
101 |                 f"Example prediction  \n"
102 |                 f"Q: {metadata[id]['question']}\n"
103 |                 f"P: {decoded_preds[id]}\n"
104 |                 f"A: {decoded_labels[id]}\n"
105 |                 f"\n"
106 |             )
107 | 
108 |         em = average_score(decoded_labels, decoded_preds, exact_match)
109 |         em_lower = average_score(
110 |             decoded_labels, decoded_preds, exact_match_case_insensitive
111 |         )
112 | 
113 |         result = {
114 |             "em": em,
115 |             "emi": em_lower,
116 |             "em_breakdown_type": breakdown_score(
117 |                 "type", decoded_labels, decoded_preds, metadata, exact_match
118 |             ),
119 |             "em_breakdown_relation": breakdown_score(
120 |                 "relation", decoded_labels, decoded_preds, metadata, exact_match
121 |             ),
122 |         }
123 | 
124 |         prediction_lens = [
125 |             np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
126 |         ]
127 |         result["gen_len"] = np.mean(prediction_lens)
128 |         result = {
129 |             k: {k2: round(v2, 4) for k2, v2 in v.items()}
130 |             if isinstance(v, dict)
131 |             else round(v, 4)
132 |             for k, v in result.items()
133 |         }
134 |         return result
135 | 
136 |     return compute_metrics
137 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/evaluation/scoring_functions.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | from collections import defaultdict
20 | 
21 | 
22 | def precision(actual, predicted):
23 |     return (
24 |         sum(1.0 for p in predicted if p in actual) / float(len(predicted))
25 |         if len(predicted)
26 |         else 1.0
27 |     )
28 | 
29 | 
30 | def recall(actual, predicted):
31 |     return (
32 |         sum(1.0 for p in predicted if p in actual) / float(len(actual))
33 |         if len(actual)
34 |         else 1.0
35 |     )
36 | 
37 | 
38 | def f1(actual, predicted):
39 |     actual = set(actual)
40 |     predicted = set(predicted)
41 | 
42 |     pr = precision(actual, predicted)
43 |     rec = recall(actual, predicted)
44 | 
45 |     return compute_f1(pr, rec)
46 | 
47 | 
48 | def join_decoded(decoded_labels):
49 |     return " ".join(decoded_labels)
50 | 
51 | 
52 | def exact_match(actual, predicted):
53 |     return 1.0 if join_decoded(actual) == join_decoded(predicted) else 0.0
54 | 
55 | 
56 | def exact_match_case_insensitive(actual, predicted):
57 |     return (
58 |         1.0 if join_decoded(actual).lower() == join_decoded(predicted).lower() else 0.0
59 |     )
60 | 
61 | 
62 | def compute_f1(pr, rec):
63 |     return 2.0 * pr * rec / (pr + rec) if (pr + rec > 0.0) else 0.0
64 | 
65 | 
66 | def average_score(all_actual, all_predicted, scoring_function):
67 |     running_score = 0
68 |     num_instances = 0
69 | 
70 |     for actual, predicted in zip(all_actual, all_predicted):
71 |         num_instances += 1
72 |         local_score = scoring_function(actual, predicted)
73 |         assert local_score <= 1
74 | 
75 |         running_score += local_score
76 |         assert running_score <= num_instances
77 | 
78 |     return running_score / num_instances if num_instances > 0 else 0.0
79 | 
80 | 
81 | def breakdown_score(key, all_actual, all_predicted, metadata, scoring_function):
82 |     running_score = defaultdict(int)
83 |     num_instances = defaultdict(int)
84 | 
85 |     for actual, predicted, metadatum in zip(all_actual, all_predicted, metadata):
86 |         num_instances[metadatum[key]] += 1
87 |         local_score = scoring_function(actual, predicted)
88 |         assert local_score <= 1
89 | 
90 |         running_score[metadatum[key]] += local_score
91 |         assert running_score[metadatum[key]] <= num_instances[metadatum[key]]
92 | 
93 |     return {
94 |         key: running_score[key] / num_instances[key] if num_instances[key] > 0 else 0.0
95 |         for key in num_instances.keys()
96 |     }
97 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/final_scoring.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import glob
 20 | import json
 21 | from collections import OrderedDict, defaultdict
 22 | 
 23 | import numpy as np
 24 | import pandas as pd
 25 | 
 26 | from neuraldb.evaluation.scoring_functions import f1
 27 | 
 28 | 
 29 | def load_experiment(path):
 30 | 
 31 |     running_score = defaultdict(lambda: defaultdict(int))
 32 |     running_count = defaultdict(lambda: defaultdict(int))
 33 | 
 34 |     print(path)
 35 |     with open(path) as f:
 36 |         for line in f:
 37 |             instance = json.loads(line)
 38 |             actual = instance["actual"]
 39 |             prediction = instance["prediction"]
 40 | 
 41 |             local_score = f1(set(actual), set(prediction))
 42 | 
 43 |             # relation = instance["metadata"]["relation"]
 44 |             # running_score["relation"][relation] += local_score
 45 |             # running_count["relation"][relation] += 1
 46 | 
 47 |             qtype = instance["metadata"]["type"]
 48 |             if qtype in {"argmin", "argmax", "min", "max"}:
 49 |                 qtype = "minmax"
 50 |             running_score["type"][qtype] += local_score
 51 |             running_count["type"][qtype] += 1
 52 | 
 53 |             running_score["all"][""] += local_score
 54 |             running_count["all"][""] += 1
 55 | 
 56 |     scores = {}
 57 |     for k, v in running_score.items():
 58 |         for attr, val in v.items():
 59 |             score = (
 60 |                 running_score[k][attr] / running_count[k][attr]
 61 |                 if running_count[k][attr]
 62 |                 else 0
 63 |             )
 64 |             print(f"Running score: {k}\t{attr}\t\t{score}")
 65 |             scores["_".join([k, attr])] = (
 66 |                 running_score[k][attr] / running_count[k][attr]
 67 |                 if running_count[k][attr]
 68 |                 else 0
 69 |             )
 70 | 
 71 |     return scores
 72 | 
 73 | 
 74 | if __name__ == "__main__":
 75 |     ndb_predictions = glob.glob(
 76 |         "consolidated/work/v2.4_25/**/predictions.jsonl", recursive=True
 77 |     )
 78 |     all_experiments = []
 79 |     for prediction in ndb_predictions:
 80 |         print(prediction)
 81 | 
 82 |         experiment = OrderedDict()
 83 | 
 84 |         for element in prediction.split("/"):
 85 |             if "," in element:
 86 |                 for kvp in element.split(","):
 87 |                     k, v = kvp.split("=", maxsplit=1)
 88 |                     experiment[k] = v
 89 |             elif "-" in element:
 90 |                 for kvp in element.split(","):
 91 |                     k, v = kvp.split("-", maxsplit=1)
 92 |                     experiment[k] = v
 93 | 
 94 |         # experiment["ssg"] = prediction.replace(".jsonl", "").rsplit("_", maxsplit=1)[1]
 95 |         experiment["dataset"] = prediction.split("/")[2]
 96 |         if "retriever" not in experiment:
 97 |             experiment["retriever"] = ""
 98 |         experiment["path"] = prediction
 99 |         all_experiments.append(experiment)
100 | 
101 |     print("Reading by experiment: \n\n\n")
102 |     for expt in all_experiments:
103 |         expt.update(load_experiment(expt["path"]))
104 |         del expt["path"]
105 | 
106 |     frame = pd.DataFrame(all_experiments)
107 |     frame[frame.select_dtypes(include=["number"]).columns] *= 100
108 |     pd.set_option("display.width", 1000)
109 |     pd.set_option("display.max_columns", None)
110 | 
111 |     aggr = {"all_": [np.mean, np.std]}
112 |     aggr.update({k: [np.mean] for k in frame.columns if "type" in k})
113 |     pt = pd.pivot_table(
114 |         frame, index=["model", "generator", "retriever", "lr", "steps"], aggfunc=aggr
115 |     )
116 |     print(pt)
117 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/modelling/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/retriever/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/retriever/dpr.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | import logging
21 | 
22 | import torch
23 | from argparse import ArgumentParser
24 | from tqdm import tqdm
25 | from transformers import (
26 |     DPRContextEncoderTokenizer,
27 |     DPRContextEncoder,
28 | )
29 | from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
30 | 
31 | from neuraldb.util.log_helper import setup_logging
32 | 
33 | logger = logging.getLogger(__name__)
34 | 
35 | 
36 | class DPRRetriever:
37 |     def __init__(self):
38 |         self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
39 |             "facebook/dpr-question_encoder-single-nq-base"
40 |         )
41 |         self.question_model = DPRQuestionEncoder.from_pretrained(
42 |             "facebook/dpr-question_encoder-single-nq-base"
43 |         ).to("cuda")
44 |         self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
45 |             "facebook/dpr-ctx_encoder-single-nq-base"
46 |         )
47 |         self.context_model = DPRContextEncoder.from_pretrained(
48 |             "facebook/dpr-ctx_encoder-single-nq-base"
49 |         ).to("cuda")
50 | 
51 |     def lookup(self, queries, facts):
52 |         encoded_questions = self.question_tokenizer(queries, padding=True)
53 |         device_inputs = {
54 |             k: torch.LongTensor(v).to("cuda") for k, v in encoded_questions.items()
55 |         }
56 |         question_outputs = self.question_model(**device_inputs)
57 | 
58 |         encoded_context = self.context_tokenizer(facts, padding=True)
59 |         device_inputs = {
60 |             k: torch.LongTensor(v).to("cuda") for k, v in encoded_context.items()
61 |         }
62 |         context_outputs = self.context_model(**device_inputs)
63 | 
64 |         yield from torch.matmul(
65 |             question_outputs.pooler_output, context_outputs.pooler_output.T
66 |         ).cpu().detach().numpy().argsort(axis=1).tolist()
67 | 
68 | 
69 | if __name__ == "__main__":
70 | 
71 |     parser = ArgumentParser()
72 |     parser.add_argument("in_file")
73 |     parser.add_argument("out_file")
74 |     args = parser.parse_args()
75 | 
76 |     setup_logging()
77 |     dpr = DPRRetriever()
78 |     dpr.context_model.eval()
79 |     dpr.question_model.eval()
80 |     with open(args.in_file) as f, open(args.out_file, "w+") as of:
81 |         for line in tqdm(f):
82 |             database = json.loads(line)
83 | 
84 |             facts = database["facts"]
85 |             queries = [q["query"] for q in database["queries"]]
86 | 
87 |             for query, ids in zip(database["queries"], dpr.lookup(queries, facts)):
88 |                 filtered_ids = list(
89 |                     filter(lambda idx: idx <= query["height"], ids[::-1])
90 |                 )
91 |                 query["predicted_facts"] = [filtered_ids]
92 | 
93 |             of.write(json.dumps(database) + "\n")
94 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/retriever/tfidf.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | import json
20 | import logging
21 | from argparse import ArgumentParser
22 | 
23 | import math
24 | from drqascripts.retriever.build_tfidf_lines import OnlineTfidfDocRanker
25 | 
26 | from neuraldb.util.log_helper import setup_logging
27 | 
28 | logger = logging.getLogger(__name__)
29 | 
30 | 
31 | class TFIDFRetriever:
32 |     class RankArgs:
33 |         def __init__(self):
34 |             self.ngram = 2
35 |             self.hash_size = int(math.pow(2, 24))
36 |             self.tokenizer = "simple"
37 |             self.num_workers = None
38 |             self.max_sent = 50
39 | 
40 |     def __init__(self):
41 |         self.args = self.RankArgs()
42 | 
43 |     def lookup(self, queries, facts):
44 |         tfidf = OnlineTfidfDocRanker(self.args, facts)
45 | 
46 |         for query in queries:
47 |             ids, scores = tfidf.closest_docs(query, self.args.max_sent)
48 |             yield ids
49 | 
50 | 
51 | if __name__ == "__main__":
52 | 
53 |     parser = ArgumentParser()
54 |     parser.add_argument("in_file")
55 |     parser.add_argument("out_file")
56 |     args = parser.parse_args()
57 | 
58 |     setup_logging()
59 |     tfidf = TFIDFRetriever()
60 |     with open(args.in_file) as f, open(args.out_file, "w+") as of:
61 |         for line in f:
62 |             database = json.loads(line)
63 | 
64 |             facts = database["facts"]
65 |             queries = [q["query"] for q in database["queries"]]
66 | 
67 |             for query, ids in zip(database["queries"], tfidf.lookup(queries, facts)):
68 |                 query["predicted_facts"] = [
69 |                     list(filter(lambda idx: idx <= query["height"], ids))
70 |                 ]
71 | 
72 |             of.write(json.dumps(database) + "\n")
73 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/util/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 


--------------------------------------------------------------------------------
/modelling/src/neuraldb/util/log_helper.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
 3 | #
 4 | # This file is part of NeuralDB.
 5 | # See https://github.com/facebookresearch/NeuralDB for further info.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
20 | import logging
21 | import os
22 | 
23 | def setup_logging():
24 |     h = logging.StreamHandler(None)
25 |     logging.root.addHandler(h)
26 |     logging.root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     setup_logging()
31 |     logger = logging.getLogger(__name__)
32 | 
33 |     logger.debug("This is a debug message")
34 |     logger.info("This is an info message")
35 |     logger.warning("This is a warning message")
36 |     logger.error("This is an error message")
37 |     logger.critical("This is a critical message")
38 | 


--------------------------------------------------------------------------------
/modelling/tests/test_evaluation.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import unittest
 20 | from neuraldb.evaluation.scoring_functions import (
 21 |     precision,
 22 |     recall,
 23 |     compute_f1,
 24 |     average_score,
 25 | )
 26 | 
 27 | 
 28 | class PrecisionTestCase(unittest.TestCase):
 29 |     def setUp(self) -> None:
 30 |         self.gold_reference = ["a", "b", "c", "d", "e"]
 31 | 
 32 |         self.precision_fail1 = ["a", "b", "c", "d", "e", "f"]
 33 |         self.precision_fail2 = ["f"]
 34 |         self.precision_fail3 = ["a", "f"]
 35 | 
 36 |     def testPrecisionExact(self):
 37 |         self.assertEqual(precision(self.gold_reference, self.gold_reference), 1)
 38 | 
 39 |     def testPrecisionFailOneTooMany(self):
 40 |         self.assertEqual(precision(self.gold_reference, self.precision_fail1), 5 / 6)
 41 | 
 42 |     def testPrecisionFailOnlyWrong(self):
 43 |         self.assertEqual(precision(self.gold_reference, self.precision_fail2), 0)
 44 | 
 45 |     def testPrecisionFailHalfWrong(self):
 46 |         self.assertEqual(precision(self.gold_reference, self.precision_fail3), 0.5)
 47 | 
 48 |     def testPrecisionNoPredictions(self):
 49 |         self.assertEqual(precision(self.gold_reference, []), 1)
 50 | 
 51 |     def testPrecisionNoSourceNoPredictions(self):
 52 |         self.assertEqual(precision([], []), 1)
 53 | 
 54 |     def testPrecisionNoSourceBadPredictions(self):
 55 |         self.assertEqual(precision([], self.precision_fail2), 0)
 56 | 
 57 | 
 58 | class RecallTestCase(unittest.TestCase):
 59 |     def setUp(self) -> None:
 60 |         self.gold_reference = ["a", "b", "c", "d", "e"]
 61 | 
 62 |         self.recall_ok1 = ["a", "b", "c", "d", "e", "f"]
 63 |         self.recall_fail1 = ["f"]
 64 |         self.recall_fail2 = ["a", "f"]
 65 |         self.recall_fail3 = ["a"]
 66 | 
 67 |     def testRecallExact(self):
 68 |         self.assertEqual(recall(self.gold_reference, self.gold_reference), 1)
 69 | 
 70 |     def testRecallTooMany(self):
 71 |         self.assertEqual(recall(self.gold_reference, self.recall_ok1), 1)
 72 | 
 73 |     def testRecallNoPredictions(self):
 74 |         self.assertEqual(recall(self.gold_reference, []), 0)
 75 | 
 76 |     def testRecallNoSourceNoPredictions(self):
 77 |         self.assertEqual(recall([], []), 1)
 78 | 
 79 |     def testRecallOnlyOne(self):
 80 |         self.assertEqual(recall(self.gold_reference, self.recall_fail3), 1 / 5)
 81 | 
 82 |     def testRecallOnlyWithFalsePositive(self):
 83 |         self.assertEqual(recall(self.gold_reference, self.recall_fail2), 1 / 5)
 84 | 
 85 |     def testRecallOnlyFalsePositive(self):
 86 |         self.assertEqual(recall(self.gold_reference, self.recall_fail1), 0)
 87 | 
 88 | 
 89 | class F1Test(unittest.TestCase):
 90 |     def testBothOne(self):
 91 |         self.assertEqual(compute_f1(1, 1), 1.0)
 92 | 
 93 |     def testBothZero(self):
 94 |         self.assertEqual(compute_f1(0, 0), 0.0)
 95 | 
 96 |     def testOneZero(self):
 97 |         self.assertEqual(compute_f1(1, 0), 0.0)
 98 | 
 99 |     def testBothHalf(self):
100 |         self.assertEqual(compute_f1(0.5, 0.5), 0.5)
101 | 
102 |     def testHalfAndOne(self):
103 |         self.assertEqual(compute_f1(0.5, 1), 2 / 3)
104 | 
105 | 
106 | class AverageScoreTest(unittest.TestCase):
107 |     @staticmethod
108 |     def passThroughA(a, b):
109 |         return a
110 | 
111 |     def setUp(self):
112 |         self.scores0 = [0, 0, 0, 0]
113 |         self.scores1Quarter = [0, 0, 0, 1]
114 |         self.scores1 = [1, 1, 1, 1]
115 | 
116 |     def testScoresZero(self):
117 |         self.assertEqual(
118 |             average_score(self.scores0, self.scores0, self.passThroughA), 0
119 |         )
120 | 
121 |     def testScores1Quarter(self):
122 |         self.assertEqual(
123 |             average_score(self.scores1Quarter, self.scores1Quarter, self.passThroughA),
124 |             1 / 4,
125 |         )
126 | 
127 |     def testScores1(self):
128 |         self.assertEqual(
129 |             average_score(self.scores1, self.scores1, self.passThroughA), 1
130 |         )
131 | 
132 |     def testScoresNoInstances(self):
133 |         self.assertEqual(average_score([], [], self.passThroughA), 0)
134 | 


--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/NeuralDB/756801d038d0df2ed5e88e8d5db89be0ade57b54/overview.png


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | black
3 | flake8
4 | licenseheaders


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | 


--------------------------------------------------------------------------------
/ssg/README.md:
--------------------------------------------------------------------------------
 1 | To Train the ssg:
 2 | ```
 3 | python train_ssg.py -i "data_folder" -b "batch_size" -e "number of epochs" -o "output folder"
 4 | ```
 5 | 
 6 | To run the prediction:
 7 | 
 8 | ```
 9 | python ssg_prediction.py -i "data_folder" -m "model_address" -th list_of_thresholds
10 | ```
11 | 
12 | To evaluate the predictions:
13 | 
14 | ```
15 | python evaluate_set_ssg.py -i "prediction file"
16 | ```


--------------------------------------------------------------------------------
/ssg/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers==0.4.1.2
2 | transformers==4.5.1
3 | torch


--------------------------------------------------------------------------------
/ssg/ssg_utils.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import json
 20 | 
 21 | 
 22 | def read_NDB(data_file):
 23 |     with open(data_file) as file:
 24 |         dataset = []
 25 | 
 26 |         for line in file:
 27 |             db = json.loads(line)
 28 | 
 29 |             facts = db["facts"]
 30 |             queries = db["queries"]
 31 |             dataset.append([facts, queries])
 32 |         return dataset
 33 | 
 34 | 
 35 | def create_dataset(db):
 36 |     dataset = []
 37 |     eos = "<eos>"
 38 |     for d in db:
 39 | 
 40 |         questions = d[1]
 41 |         ctx = d[0]
 42 | 
 43 |         for q in questions:
 44 | 
 45 |             t = q["height"]
 46 |             gold_facts = q["facts"]
 47 |             context = ctx[: t + 1]
 48 |             flat_facts = [item for sublist in gold_facts for item in sublist]
 49 | 
 50 |             # all facts in flat facts can be positive
 51 |             state = [q["query"]]
 52 |             pos_act = [context[g] for g in flat_facts]
 53 |             # everything else is negative
 54 |             neg_act = [x for i, x in enumerate(context) if i not in flat_facts]
 55 | 
 56 |             dataset.append([state, eos, 0])
 57 |             dataset.extend([[state, n, 0] for n in neg_act])
 58 |             pos_set = [[state, p, 1] for p in pos_act]
 59 | 
 60 |             dataset.extend(pos_set)
 61 | 
 62 |             for g in gold_facts:
 63 |                 if len(g) <= 1:
 64 |                     state = [q["query"], context[g[0]]]
 65 | 
 66 |                     pos_act = eos
 67 |                     neg_act = context
 68 |                     item = [state, pos_act, 1]
 69 |                     dataset.append(item)
 70 |                     dataset.extend([[state, n, 0] for n in neg_act])
 71 |                 else:
 72 |                     g_0 = g[0]
 73 |                     g_1 = g[1]
 74 | 
 75 |                     state = [q["query"], context[g_0]]
 76 |                     pos_act = context[g_1]
 77 |                     neg_act = [x for i, x in enumerate(context) if i != g_1]
 78 |                     item = [state, pos_act, 1]
 79 |                     dataset.append(item)
 80 |                     dataset.extend([[state, n, 0] for n in neg_act])
 81 | 
 82 |                     state = [q["query"], context[g_1]]
 83 |                     pos_act = context[g_0]
 84 |                     neg_act = [x for i, x in enumerate(context) if i != g_0]
 85 |                     item = [state, pos_act, 1]
 86 |                     dataset.append(item)
 87 |                     dataset.extend([[state, n, 0] for n in neg_act])
 88 | 
 89 |                     state = [q["query"], context[g_0], context[g_1]]
 90 |                     pos_act = eos
 91 |                     neg_act = context
 92 |                     item = [state, pos_act, 1]
 93 |                     dataset.append(item)
 94 |                     dataset.extend([[state, n, 0] for n in neg_act])
 95 | 
 96 |     return dataset
 97 | 
 98 | 
 99 | def prepare_tokenizer(tokenizer):
100 |     special_tokens = []
101 |     special_tokens.extend(["<sep>", "<SEP>", "<eos>", "[SEP]"])
102 |     tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
103 | 


--------------------------------------------------------------------------------
/ssg/train_ssg.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2021 Facebook, Inc. and its affiliates.
  3 | #
  4 | # This file is part of NeuralDB.
  5 | # See https://github.com/facebookresearch/NeuralDB for further info.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | #
 19 | import argparse
 20 | import os
 21 | 
 22 | from sentence_transformers import SentencesDataset, InputExample, SentenceTransformer
 23 | from sentence_transformers.evaluation import BinaryClassificationEvaluator
 24 | from sentence_transformers.losses import ContrastiveLoss
 25 | from torch.utils.data import DataLoader
 26 | from torch.utils.data.sampler import WeightedRandomSampler
 27 | 
 28 | from ssg_utils import read_NDB, create_dataset
 29 | 
 30 | 
 31 | def is_valid_folder(parser, arg):
 32 |     if not os.path.exists(arg):
 33 |         parser.error("The file %s does not exist!" % arg)
 34 |     else:
 35 |         return arg
 36 | 
 37 | 
 38 | if __name__ == "__main__":
 39 | 
 40 |     parser = argparse.ArgumentParser(description="training ssg")
 41 |     parser.add_argument(
 42 |         "-i",
 43 |         dest="folder",
 44 |         required=True,
 45 |         help="input data folder",
 46 |         type=lambda x: is_valid_folder(parser, x),
 47 |     )
 48 |     parser.add_argument("-b", dest="batch_size", type=int, help="batch size", default=100)
 49 | 
 50 |     parser.add_argument("-e", dest="epochs", type=int, help="number of epochs", default=10)
 51 | 
 52 |     parser.add_argument("-o", dest="output", required=True, help="output address")
 53 |     parser.add_argument("-d", dest="device", default="cuda:0", help="output address")
 54 | 
 55 |     args = parser.parse_args()
 56 | 
 57 |     folder = args.folder
 58 |     batch_size = args.batch_size
 59 |     epochs = args.epochs
 60 |     output = args.output
 61 |     device = args.device
 62 | 
 63 |     # Define the model. Either from scratch of by loading a pre-trained model
 64 |     model = SentenceTransformer("distilbert-base-nli-mean-tokens", device=device)
 65 | 
 66 |     # read the train data
 67 |     name = "train"
 68 |     data_file = folder + "/" + name + ".jsonl"
 69 |     db = read_NDB(data_file)
 70 |     dataset = create_dataset(db)
 71 | 
 72 |     train_examples = []
 73 |     weights = []
 74 |     for d in dataset:
 75 |         texts = ["[SEP]".join(d[0]), "".join(d[1])]
 76 |         label = d[2]
 77 |         if label == 1:
 78 |             weights.append(10)
 79 |         else:
 80 |             weights.append(1)
 81 |         train_examples.append(InputExample(texts=texts, label=label))
 82 | 
 83 |     # read the dev data
 84 |     name = "dev"
 85 |     data_file = folder + "/" + name + ".jsonl"
 86 |     db = read_NDB(data_file)
 87 |     dataset = create_dataset(db)
 88 | 
 89 |     dev_examples = []
 90 |     for d in dataset:
 91 |         texts = ["[SEP]".join(d[0]), "".join(d[1])]
 92 |         label = d[2]
 93 |         dev_examples.append(InputExample(texts=texts, label=label))
 94 | 
 95 |     train_loss = ContrastiveLoss(model)
 96 | 
 97 |     # Define your train dataset, the dataloader and the train loss
 98 |     train_dataset = SentencesDataset(train_examples, model)
 99 |     sampler = WeightedRandomSampler(weights=weights, num_samples=len(train_examples))
100 |     train_dataloader = DataLoader(
101 |         train_dataset, sampler=sampler, shuffle=False, batch_size=batch_size
102 |     )
103 | 
104 |     evaluator = BinaryClassificationEvaluator.from_input_examples(
105 |         dev_examples, batch_size=batch_size
106 |     )
107 | 
108 |     model.fit(
109 |         train_objectives=[(train_dataloader, train_loss)],
110 |         epochs=epochs,
111 |         warmup_steps=100,
112 |         evaluator=evaluator,
113 |         output_path=output,
114 |         evaluation_steps=100,
115 |     )
116 | 


--------------------------------------------------------------------------------