├── DPR
├── CHANGELOG.md
├── dpr
│ ├── __init__.py
│ ├── data
│ │ └── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ └── conf_utils.py
│ └── .DS_Store
├── .DS_Store
├── conf
│ ├── .DS_Store
│ ├── ctx_sources
│ │ └── default_sources.yaml
│ ├── train
│ │ ├── extractive_reader_default.yaml
│ │ ├── biencoder_nq.yaml
│ │ ├── biencoder_default.yaml
│ │ └── biencoder_local.yaml
│ ├── encoder
│ │ └── hf_bert.yaml
│ ├── datasets
│ │ └── retriever_default.yaml
│ ├── biencoder_train_cfg.yaml
│ └── gen_embs.yaml
├── req.txt
├── edit.txt
├── run.sh
├── eval.sh
├── setup.py
└── CONTRIBUTING.md
├── dataflow
├── py.typed
├── analysis
│ └── __init__.py
├── .DS_Store
├── __init__.py
├── core
│ ├── __init__.py
│ ├── constants.py
│ ├── utterance_tokenizer.py
│ ├── utterance_utils.py
│ ├── prediction_report.py
│ └── turn_prediction.py
├── multiwoz
│ ├── __init__.py
│ └── trade_dst
│ │ ├── __init__.py
│ │ └── mapping.pair
├── leaderboard
│ └── __init__.py
└── onmt_helpers
│ └── __init__.py
├── src
├── __init__.py
├── data
│ └── datasets
│ │ ├── mtop.py.lock
│ │ ├── smcalflow.py.lock
│ │ ├── totto.py
│ │ └── __init__.py
├── .DS_Store
├── dataset_readers
│ ├── .DS_Store
│ ├── bm25_tasks
│ │ ├── __init__.py
│ │ ├── php.py
│ │ ├── dart.py
│ │ ├── java.py
│ │ ├── pubmed.py
│ │ ├── python.py
│ │ ├── e2e.py
│ │ ├── mtop.py
│ │ ├── go.py
│ │ ├── roc_story_generation.py
│ │ ├── smcalflow.py
│ │ ├── roc_ending_generation.py
│ │ └── reddit.py
│ ├── scorer_tasks
│ │ ├── __init__.py
│ │ ├── e2e.py
│ │ ├── go.py
│ │ ├── java.py
│ │ ├── pubmed.py
│ │ ├── reddit.py
│ │ ├── dart.py
│ │ ├── php.py
│ │ ├── python.py
│ │ ├── cnndailymail.py
│ │ ├── copa.py
│ │ ├── cr.py
│ │ ├── cs_valid.py
│ │ ├── mr.py
│ │ ├── cs_explan.py
│ │ ├── rte.py
│ │ ├── cola.py
│ │ ├── cosmos_qa.py
│ │ ├── mnli.py
│ │ ├── snli.py
│ │ ├── subj.py
│ │ ├── trec.py
│ │ ├── sst2.py
│ │ ├── sst5.py
│ │ ├── agnews.py
│ │ ├── amazon.py
│ │ ├── yahoo.py
│ │ ├── roc_ending_generation.py
│ │ ├── roc_story_generation.py
│ │ ├── dbpedia.py
│ │ ├── yelp_full.py
│ │ ├── mtop.py
│ │ ├── common_gen.py
│ │ ├── break.py
│ │ ├── smcalflow.py
│ │ └── wikiauto.py
│ └── inference_tasks
│ │ ├── __init__.py
│ │ ├── mtop.py
│ │ ├── e2e.py
│ │ ├── go.py
│ │ ├── pubmed.py
│ │ ├── reddit.py
│ │ └── php.py
├── utils
│ ├── tokenizer_utils.py
│ ├── app.py
│ ├── dataset_utils.py
│ ├── log_utils.py
│ └── cache_util.py
└── models
│ ├── instructor_embedder.py
│ ├── model.py
│ └── embedder.py
├── easy-elasticsearch
├── easy_elasticsearch
│ ├── __init__.py
│ └── examples
│ │ ├── __init__.py
│ │ └── download_and_run.sh
└── setup.py
├── qdecomp_with_dependency_graphs
├── qdecomp_nlp
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── samplers
│ │ │ └── __init__.py
│ │ ├── tokenizers
│ │ │ └── __init__.py
│ │ ├── dataset_readers
│ │ │ ├── __init__.py
│ │ │ └── util.py
│ │ └── token_indexers
│ │ │ └── __init__.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── hybrid
│ │ │ └── __init__.py
│ │ ├── seq2seq
│ │ │ ├── __init__.py
│ │ │ ├── simple_seq2seq_custom.py
│ │ │ └── custom_copynet_seq2seq_for_soft_rat.py
│ │ └── dependencies_graph
│ │ │ └── __init__.py
│ ├── modules
│ │ ├── __init__.py
│ │ ├── token_embedders
│ │ │ └── __init__.py
│ │ └── seq2seq_encoders
│ │ │ └── __init__.py
│ ├── training
│ │ ├── __init__.py
│ │ ├── metrics
│ │ │ └── __init__.py
│ │ └── learning_rate_schedulers
│ │ │ └── __init__.py
│ ├── predictors
│ │ ├── __init__.py
│ │ ├── seq2seq
│ │ │ ├── __init__.py
│ │ │ └── simple_seq2seq_dynamic_predictor.py
│ │ └── dependencies_graph
│ │ │ └── __init__.py
│ └── .DS_Store
├── .gitignore
├── .DS_Store
├── scripts
│ ├── .DS_Store
│ ├── qdmr_to_logical_form
│ │ └── utils_.py
│ ├── data_processing
│ │ └── add_extra_tokens.py
│ ├── tune
│ │ └── studies
│ │ │ ├── biaffine-graph-parser--transformer-encoder.py
│ │ │ └── operators-aware-biaffine-graph-parser--transformer-encoder.py
│ ├── utils
│ │ └── change_config.py
│ └── eval
│ │ └── eval_copy_files.py
├── dependencies_graph
│ ├── .DS_Store
│ ├── extractors
│ │ ├── spans_dependencies_extractors
│ │ │ ├── __init__.py
│ │ │ └── base_spans_dependencies_extractor.py
│ │ ├── tokens_dependencies_to_qdmr_extractors
│ │ │ ├── converters
│ │ │ │ ├── __init__.py
│ │ │ │ └── base_spans_dep_to_qdmr_converter.py
│ │ │ ├── __init__.py
│ │ │ └── base_tokens_dep_to_qdmr_extractor.py
│ │ ├── tokens_dependencies_extractors
│ │ │ ├── __init__.py
│ │ │ ├── collapsers
│ │ │ │ ├── __init__.py
│ │ │ │ ├── to_dependency_type_collapser.py
│ │ │ │ ├── base_collapser.py
│ │ │ │ ├── to_sequential_ids_collapser.py
│ │ │ │ └── add_operator_properties_collapser.py
│ │ │ └── base_tokens_dependencies_extractor.py
│ │ ├── __init__.py
│ │ ├── steps_dependencies_extractors
│ │ │ ├── __init__.py
│ │ │ └── base_steps_dependencies_extractor.py
│ │ └── steps_spans_extractors
│ │ │ ├── __init__.py
│ │ │ ├── aligners
│ │ │ │ └── base_aligner.py
│ │ │ ├── base_steps_spans_extractor.py
│ │ │ └── from_file_steps_spans_extractor.py
│ ├── data_types
│ │ ├── qdmr_operation.py
│ │ └── __init__.py
│ ├── config
│ │ └── configuration_loader.py
│ └── operators_sequence.py
├── utils
│ ├── timeout_test.py
│ └── timeout.py
├── requirements_core.txt
└── debug.py
├── configs
├── random_finder.yaml
├── bm25_finder.yaml
├── create_index.yaml
├── knn_finder.yaml
├── scorer.yaml
├── client.yaml
├── api_scorer.yaml
└── inference.yaml
├── break_evaluator
├── .DS_Store
├── tmp
│ ├── results
│ │ ├── metrics.json
│ │ ├── decomp_summary.txt
│ │ └── question_decomp_summary.txt
│ └── .DS_Store
├── example_test_predictions
│ └── .DS_Store
├── requirements.txt
├── utils
│ ├── timeout_test.py
│ ├── timeout.py
│ └── graph.py
├── allennlp_preds_format.py
├── Dockerfile
└── evaluate.yaml
├── Channel_LM_Prompting
├── img
│ ├── teaser.png
│ ├── tuning.png
│ ├── data_download.png
│ └── demonstration.png
└── .gitignore
├── semantic_parsing_with_constrained_lm
├── .DS_Store
├── src
│ └── semantic_parsing_with_constrained_lm
│ │ ├── domains
│ │ ├── calflow
│ │ │ └── grammar
│ │ │ │ ├── start.scfg
│ │ │ │ ├── entities.scfg
│ │ │ │ ├── enum_wrappers.scfg
│ │ │ │ └── quoted.scfg
│ │ └── __init__.py
│ │ ├── __init__.py
│ │ ├── configs
│ │ ├── __init__.py
│ │ ├── lib
│ │ │ └── __init__.py
│ │ └── smpa_20210929_zeroshot.py
│ │ ├── earley
│ │ └── __init__.py
│ │ ├── scfg
│ │ ├── __init__.py
│ │ ├── parser
│ │ │ ├── __init__.py
│ │ │ ├── utils.py
│ │ │ └── types.py
│ │ └── string_utils.py
│ │ ├── scripts
│ │ └── __init__.py
│ │ ├── async_tools
│ │ └── __init__.py
│ │ ├── finetune
│ │ └── __init__.py
│ │ ├── paths.py
│ │ ├── util
│ │ ├── types.py
│ │ └── missing_sentinel.py
│ │ ├── cache.py
│ │ ├── datum.py
│ │ └── trie_partial_parse.py
├── tests
│ └── semantic_parsing_with_constrained_lm
│ │ ├── __init__.py
│ │ ├── scfg
│ │ ├── __init__.py
│ │ ├── test_read_grammar.py
│ │ └── test_string_utils.py
│ │ ├── domains
│ │ └── __init__.py
│ │ ├── earley
│ │ ├── __init__.py
│ │ ├── test_input.py
│ │ └── test_agenda.py
│ │ └── async_tools
│ │ └── __init__.py
├── third_party
│ └── break-evaluator
│ │ ├── tmp
│ │ └── results
│ │ │ └── metrics.json
│ │ ├── requirements.txt
│ │ ├── utils
│ │ ├── timeout_test.py
│ │ ├── timeout.py
│ │ └── graph.py
│ │ ├── allennlp_preds_format.py
│ │ ├── Dockerfile
│ │ ├── evaluate.yaml
│ │ ├── pyproject.toml
│ │ └── LICENSE
├── .gitignore
├── SUPPORT.md
├── CODE_OF_CONDUCT.md
├── pyproject.toml
├── LICENSE
└── NOTICE.md
├── .idea
└── deployment.xml
├── scripts
├── find_bm25.sh
└── score_bm25.sh
└── find_random.py
/DPR/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dataflow/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/DPR/dpr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/DPR/dpr/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/DPR/dpr/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dataflow/analysis/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/data/datasets/mtop.py.lock:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/data/datasets/smcalflow.py.lock:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/easy-elasticsearch/easy_elasticsearch/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/easy-elasticsearch/easy_elasticsearch/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/training/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/samplers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/hybrid/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/DPR/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/.DS_Store
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/training/metrics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/src/.DS_Store
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/dataset_readers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/token_indexers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/modules/token_embedders/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/DPR/conf/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/conf/.DS_Store
--------------------------------------------------------------------------------
/DPR/dpr/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/dpr/.DS_Store
--------------------------------------------------------------------------------
/dataflow/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/dataflow/.DS_Store
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/dependencies_graph/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/modules/seq2seq_encoders/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/configs/random_finder.yaml:
--------------------------------------------------------------------------------
1 | output_path: ???
2 | dataset_split: ???
3 | task_name: ???
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/dependencies_graph/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/training/learning_rate_schedulers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/break_evaluator/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/.DS_Store
--------------------------------------------------------------------------------
/break_evaluator/tmp/results/metrics.json:
--------------------------------------------------------------------------------
1 | {"ged": 0.3659574013246697, "normalized_exact_match": 0.15}
--------------------------------------------------------------------------------
/dataflow/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | __pycache__/
3 | datasets/
4 | misc/
5 | *.pyc
6 |
--------------------------------------------------------------------------------
/break_evaluator/tmp/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/tmp/.DS_Store
--------------------------------------------------------------------------------
/dataflow/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/src/dataset_readers/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/src/dataset_readers/.DS_Store
--------------------------------------------------------------------------------
/dataflow/multiwoz/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/Channel_LM_Prompting/img/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/teaser.png
--------------------------------------------------------------------------------
/Channel_LM_Prompting/img/tuning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/tuning.png
--------------------------------------------------------------------------------
/dataflow/leaderboard/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/dataflow/onmt_helpers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/dataflow/multiwoz/trade_dst/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/.DS_Store
--------------------------------------------------------------------------------
/Channel_LM_Prompting/img/data_download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/data_download.png
--------------------------------------------------------------------------------
/Channel_LM_Prompting/img/demonstration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/demonstration.png
--------------------------------------------------------------------------------
/DPR/req.txt:
--------------------------------------------------------------------------------
1 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz
--------------------------------------------------------------------------------
/Channel_LM_Prompting/.gitignore:
--------------------------------------------------------------------------------
1 | original
2 | data
3 | out
4 | *.err
5 | *.out
6 | *.txt
7 | __pycache__
8 | Makefile
9 | tmp*
10 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/scripts/.DS_Store
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/semantic_parsing_with_constrained_lm/.DS_Store
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/start.scfg:
--------------------------------------------------------------------------------
1 | start -> !" " unit, unit
2 |
--------------------------------------------------------------------------------
/break_evaluator/example_test_predictions/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/example_test_predictions/.DS_Store
--------------------------------------------------------------------------------
/configs/bm25_finder.yaml:
--------------------------------------------------------------------------------
1 | output_path: ???
2 | dataset_split: ???
3 | setup_type: ???
4 | task_name: ???
5 | L: 50
6 | score: False
7 | reindexing: True
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/qdecomp_nlp/.DS_Store
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/dependencies_graph/.DS_Store
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/earley/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/async_tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/lib/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/finetune/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/domains/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/DPR/edit.txt:
--------------------------------------------------------------------------------
1 | DPR/conf/datasets/encoder_train_default.yaml
2 | DPR/dpr/data/download_data.py
3 | DPR/conf/datasets/retriever_default.yaml
4 | DPR/conf/ctx_sources/default_sources.yaml
5 |
6 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/async_tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/tmp/results/metrics.json:
--------------------------------------------------------------------------------
1 | {"exact_match": 0.24242424242424243, "sari": 0.7061778423719823, "ged": 0.4089606835211786, "normalized_exact_match": 0.32323232323232326}
--------------------------------------------------------------------------------
/src/utils/tokenizer_utils.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def get_length(tokenizer, text):
4 |     tokenized_example = tokenizer.encode_plus(text,truncation=False,return_tensors='pt')
5 |     return int(tokenized_example.input_ids.squeeze().shape[0])
--------------------------------------------------------------------------------
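`get_length` above simply counts the tokens a Hugging Face tokenizer produces for a piece of text. A minimal usage sketch, assuming the repository root is on the Python path; the model name and prompt are arbitrary examples:

    from transformers import AutoTokenizer
    from src.utils.tokenizer_utils import get_length

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example model only
    # Number of GPT-2 tokens in the prompt, with no truncation applied.
    n_tokens = get_length(tokenizer, "What was the population of Utah in 2010?")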
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/spans_dependencies_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_spans_dependencies_extractor import BaseSpansDependenciesExtractor
2 | from .merge_spans_dependencies_exatractor import MergeSpansDependenciesExtractor
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/paths.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from pathlib import Path
5 |
6 | DOMAINS_DIR = Path(__file__).resolve().parent / "domains"
7 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/converters/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_spans_dep_to_qdmr_converter import BaseSpansDepToQdmrConverter
2 | from .rule_based_spans_dep_to_qdmr_converter import RuleBasedSpansDepToQdmrConverter
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_tokens_dependencies_extractor import BaseTokensDependenciesExtractor
2 | from .tokens_dependencies_extractor import TokensDependenciesExtractor
3 |
4 | from .collapsers import *
5 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .steps_spans_extractors import *
2 | from .steps_dependencies_extractors import *
3 | from .spans_dependencies_extractors import *
4 | from .tokens_dependencies_extractors import *
5 | from .tokens_dependencies_to_qdmr_extractors import *
6 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_tokens_dep_to_qdmr_extractor import BaseTokensDependenciesToQDMRExtractor
2 | from .spans_based_tokens_dep_to_qdmr_extractor import SpansBasedTokensDependenciesToQDMRExtractor
3 | from .converters import *
4 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | def is_skippable(string: str):
5 |     """A string is skippable if it's empty or begins with a '#'"""
6 |     return not string or string[0] == "#"
7 |
--------------------------------------------------------------------------------
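`is_skippable` is presumably used when reading grammar files line by line: blank lines and comment lines are ignored. A small illustration (the last rule is taken from start.scfg above; the import path assumes the src/ layout is installed):

    from semantic_parsing_with_constrained_lm.scfg.parser.utils import is_skippable

    assert is_skippable("")
    assert is_skippable("# a comment line")
    assert not is_skippable('start -> !" " unit, unit')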
/DPR/run.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES="0,1" python train_dense_encoder.py train_datasets=[grailqa_train] dev_datasets=[grailqa_dev] train=biencoder_local output_dir=/media/disk1/ohadr/dropout0.15
2 | CUDA_VISIBLE_DEVICES="4" python train_dense_encoder.py train_datasets=[break_train_qd] train=biencoder_local output_dir=/media/disk1/ohadr/break_qd
--------------------------------------------------------------------------------
/break_evaluator/requirements.txt:
--------------------------------------------------------------------------------
1 | # python 3.7.6
2 |
3 | edit-distance==1.0.4
4 | editdistance==0.5.3
5 | matplotlib==3.1.2
6 | networkx==2.4
7 | neuralcoref==4.0
8 | overrides==2.8.0
9 | pandas==0.25.3
10 | lxml==4.5.0
11 | progressbar==2.5
12 | scipy==1.4.1
13 | spacy==2.1.9
14 |
15 |
16 | # python -m spacy download en_core_web_sm
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .venv/
3 | .pipx/
4 |
5 | logs/
6 | trained_models/
7 |
8 | src/semantic_parsing_with_constrained_lm/domains/calflow/data/*.jsonl
9 | src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/grammar.scfg
10 | src/semantic_parsing_with_constrained_lm/domains/overnight/data/
--------------------------------------------------------------------------------
/src/utils/app.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class App:
4 |     def __init__(self):
5 |         self.functions = {}
6 |     def add(self, key):
7 |         def adder(func):
8 |             self.functions[key] = func
9 |             return func
10 |         return adder
11 |     def __getitem__(self, __name: str):
12 |         return self.functions[__name]
13 |
--------------------------------------------------------------------------------
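The `App` class above is a tiny decorator-based registry: `add(key)` returns a decorator that stores the decorated function under `key`, and `__getitem__` looks it up again. A hypothetical usage sketch (the key and function below are made up; the import assumes the repo root is on the Python path):

    from src.utils.app import App

    app = App()

    @app.add("strip_fields")
    def strip_fields(example):
        return {k: v.strip() for k, v in example.items()}

    fn = app["strip_fields"]   # -> strip_fields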
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_dependencies_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_steps_dependencies_extractor import BaseStepsDependenciesExtractor
2 | from .logical_form_based_steps_dependencies_extractor import LogicalFormBasedStepsDependenciesExtractor
3 | from .pattern_based_steps_dependencies_extractor import PatternBasedStepsDependenciesExtractor
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/util/types.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os # pylint: disable=unused-import
5 | from typing import Union
6 |
7 | # This can be used to annotate arguments that are supposed to be file paths.
8 | StrPath = Union[str, "os.PathLike[str]"]
9 |
--------------------------------------------------------------------------------
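`StrPath` is only a type alias; functions annotated with it accept both plain strings and `pathlib.Path` objects. A hypothetical example (the function name is made up; the import assumes the src/ layout is installed):

    from pathlib import Path

    from semantic_parsing_with_constrained_lm.util.types import StrPath

    def read_text(path: StrPath) -> str:
        # Accepts both str and os.PathLike[str] arguments.
        with open(path) as f:
            return f.read()

    # read_text("grammar.scfg") and read_text(Path("grammar.scfg")) both type-check.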
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/requirements.txt:
--------------------------------------------------------------------------------
1 | # python 3.7.6
2 |
3 | edit-distance==1.0.4
4 | editdistance==0.5.3
5 | matplotlib==3.1.2
6 | networkx==2.4
7 | neuralcoref==4.0
8 | overrides==2.8.0
9 | pandas==0.25.3
10 | lxml==4.5.0
11 | progressbar==2.5
12 | scipy==1.4.1
13 | spacy==2.1.9
14 |
15 |
16 | # python -m spacy download en_core_web_sm
--------------------------------------------------------------------------------
/DPR/conf/ctx_sources/default_sources.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | dpr_wiki:
4 |   _target_: dpr.data.retriever_data.CsvCtxSrc
5 |   file: data.wikipedia_split.psgs_w100
6 |   id_prefix: 'wiki:'
7 | dpr_grail:
8 |   _target_: dpr.data.retriever_data.CsvCtxSrc
9 |   file: data.wikipedia_split.entities
10 | dpr_epr:
11 |   _target_: dpr.data.retriever_data.EPRCtxSrc
12 |   setup_type: ???
13 |   task_name: ???
14 |
15 |
16 | # id_prefix: 'grail:'
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | ## How to file issues and get help
4 |
5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
7 | feature request as a new Issue.
8 |
9 | ## Microsoft Support Policy
10 |
11 | Support for this project is limited to the resources listed above.
12 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/smpa_20210929_zeroshot.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # SMPA with no training examples. Things to vary:
5 | # - Normalize over valid tokens
6 | # - Reward per token
7 | # - Length normalization
8 |
9 |
10 | # - Context?
11 | # - Length normalization
12 | # - Normalize over valid tokens
13 | # - EOS penalty
14 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_steps_spans_extractor import BaseSpansExtractor
2 | from .from_file_steps_spans_extractor import FromFileSpansExtractor
3 | from .variations_based_steps_spans_extractor import VariationsBasedSpansExtractor
4 |
5 | from .aligners.base_aligner import BaseAligner
6 | from .aligners.ILP_based_aligner import ILPAligner
7 | from .aligners.rule_based_aligner import RuleBasedAligner
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/types.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from typing import Tuple
5 |
6 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.parser.token import SCFGToken
7 |
8 | Nonterminal = str
9 | # An Alias is just another name for a nonterminal.
10 | Alias = str
11 |
12 |
13 | Expansion = Tuple[SCFGToken, ...]
14 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/data_types/qdmr_operation.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class QDMROperation(str, Enum):
5 |     FIND, SELECT, FILTER, PROJECT, AGGREGATE, GROUP, SUPERLATIVE, COMPARATIVE, UNION, INTERSECTION, DISCARD, SORT, \
6 |         BOOLEAN, ARITHMETIC, COMPARISON, NONE = \
7 |         'find', 'select', 'filter', 'project', 'aggregate', 'group', 'superlative', 'comparative', 'union', \
8 |         'intersection', 'discard', 'sort', 'boolean', 'arithmetic', 'comparison', 'None'
--------------------------------------------------------------------------------
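Because `QDMROperation` mixes `str` into the `Enum`, members can be constructed from and compared against their lowercase string labels, which is convenient when operator names come from data files:

    from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation

    op = QDMROperation('superlative')
    assert op is QDMROperation.SUPERLATIVE
    assert op == 'superlative'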
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/entities.scfg:
--------------------------------------------------------------------------------
1 | personname -> quoted, " #(PersonName " quoted ")"
2 | string -> quoted, " #(String " quoted ")"
3 | respondcomment -> quoted, " #(RespondComment " quoted ")"
4 | locationkeyphrase -> quoted, " #(LocationKeyphrase " quoted ")"
5 | path -> quoted, " #(Path " quoted ")"
6 |
7 | list_path_ -> !"(empty list)", " #(List[Path] [])"
8 | list_recipient_ -> !"(empty recipient list)", " #(List[Recipient] [])"
9 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/data_types/__init__.py:
--------------------------------------------------------------------------------
1 | from .qdmr_operation import QDMROperation
2 |
3 | from .steps_dependencies_graph import \
4 | StepsDependencies, DependencyType, StepData, StepDependencyData
5 |
6 | from .steps_spans import StepsSpans, Span
7 |
8 | from .spans_dependencies_graph import SpansData, SpanDependencyData, SpansDependencies
9 |
10 | from .tokens_dependencies_graph import \
11 | TokenData, TokenDependencyData, TokenDependencyType, TokensDependencies
12 |
--------------------------------------------------------------------------------
/break_evaluator/utils/timeout_test.py:
--------------------------------------------------------------------------------
1 |
2 | from time import sleep
3 |
4 | from break_evaluator.utils.timeout import exit_after
5 |
6 |
7 | @exit_after(5)
8 | def countdown(n):
9 |     print('countdown started', flush=True)
10 |     for i in range(n, -1, -1):
11 |         print(i, end=', ', flush=True)
12 |         sleep(1)
13 |     print('countdown finished')
14 |
15 |
16 | if __name__ == "__main__":
17 |     try:
18 |         countdown(10)
19 |     except KeyboardInterrupt:
20 |         print('timeout!')
21 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/enum_wrappers.scfg:
--------------------------------------------------------------------------------
1 | holiday -> holiday_entity, " #(Holiday \"" holiday_entity "\")"
2 | placefeature -> place_feature_entity, " #(PlaceFeature \"" place_feature_entity "\")"
3 | weatherquantifier -> weather_quantifier_entity, " #(WeatherQuantifier \"" weather_quantifier_entity "\")"
4 | responsestatustype -> response_entity, " #(ResponseStatusType \"" response_entity "\")"
5 | number -> number_entity, " #(Number" number_entity ")"
6 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/utils/timeout_test.py:
--------------------------------------------------------------------------------
1 |
2 | from time import sleep
3 |
4 | from qdecomp_with_dependency_graphs.utils.timeout import exit_after
5 |
6 |
7 | @exit_after(5)
8 | def countdown(n):
9 |     print('countdown started', flush=True)
10 |     for i in range(n, -1, -1):
11 |         print(i, end=', ', flush=True)
12 |         sleep(1)
13 |     print('countdown finished')
14 |
15 |
16 | if __name__ == "__main__":
17 |     try:
18 |         countdown(10)
19 |     except KeyboardInterrupt:
20 |         print('timeout!')
21 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/timeout_test.py:
--------------------------------------------------------------------------------
1 |
2 | from time import sleep
3 |
4 | from utils.timeout import exit_after
5 |
6 |
7 | @exit_after(5)
8 | def countdown(n):
9 |     print('countdown started', flush=True)
10 |     for i in range(n, -1, -1):
11 |         print(i, end=', ', flush=True)
12 |         sleep(1)
13 |     print('countdown finished')
14 |
15 |
16 | if __name__ == "__main__":
17 |     try:
18 |         countdown(10)
19 |     except KeyboardInterrupt:
20 |         print('timeout!')
21 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/spans_dependencies_extractors/base_spans_dependencies_extractor.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod, ABC
2 | from typing import List, Tuple
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies
5 |
6 |
7 | class BaseSpansDependenciesExtractor(ABC):
8 |     @abstractmethod
9 |     def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None,
10 |                 debug: dict = None) -> SpansDependencies:
11 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_dependencies_extractors/base_steps_dependencies_extractor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Tuple
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import StepsDependencies
5 |
6 |
7 | class BaseStepsDependenciesExtractor(ABC):
8 |     @abstractmethod
9 |     def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None,
10 |                 debug: dict = None) -> StepsDependencies:
11 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/src/utils/dataset_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | def load_train_dataset(dataset,size=None,listify=True):
4 |     if size is not None:
5 |         p = size
6 |         data = dataset['train']
7 |         total_size = len(data)
8 |
9 |         rand = random.Random(x=int(p*total_size))
10 |         index_list = list(range(total_size))
11 |         rand.shuffle(index_list)
12 |         x = data.select(index_list[:int(p*total_size)])
13 |
14 |
15 |     else:
16 |         x = dataset['train']
17 |     if listify:
18 |         return list(x)
19 |     else:
20 |         return x
--------------------------------------------------------------------------------
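`load_train_dataset` treats `size` as a fraction of the training split: it shuffles the indices with a seed derived from the requested subset size and keeps the first `size * len(train)` examples. A usage sketch with a hypothetical Hugging Face dataset, assuming the repo root is on the Python path:

    from datasets import load_dataset

    from src.utils.dataset_utils import load_train_dataset

    dataset = load_dataset("glue", "sst2")
    subset = load_train_dataset(dataset, size=0.1)       # list with ~10% of the train split
    full = load_train_dataset(dataset, listify=False)    # the untouched train split object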
/configs/create_index.yaml:
--------------------------------------------------------------------------------
1 | cuda_device: ???
2 | output_file: ???
3 | setup_type: ???
4 | dataset_split: ???
5 | task_name: ???
6 | batch_size: 50
7 | model_name: 'sentence-transformers/paraphrase-mpnet-base-v2'
8 | instruction: False
9 | dataset_reader:
10 |   _target_: src.dataset_readers.indexer_dsr.IndexerDatasetReader
11 |   task_name: ${task_name}
12 |   setup_type: ${setup_type}
13 |   dataset_split: ${dataset_split}
14 |   model_name: ${model_name}
15 |   instruction: ${instruction}
16 | model:
17 |   _target_: src.models.embedder.IndexEmbedder
18 |   model_name: ${model_name}
19 |
--------------------------------------------------------------------------------
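The `???` entries in these configs are OmegaConf's marker for mandatory values, and the `_target_`/`${...}` syntax suggests they are consumed through Hydra. A sketch of how such a config could be loaded and instantiated programmatically, assuming Hydra/OmegaConf are installed; every override value below is hypothetical:

    from omegaconf import OmegaConf
    from hydra.utils import instantiate

    cfg = OmegaConf.load("configs/create_index.yaml")
    # Fill in the mandatory ("???") fields before resolving interpolated values.
    cfg.task_name = "mtop"
    cfg.setup_type = "q"
    cfg.dataset_split = "train"
    cfg.cuda_device = 0
    cfg.output_file = "index/mtop_train"

    # _target_ nodes are resolved to Python objects by Hydra's instantiate helper.
    model = instantiate(cfg.model)   # -> src.models.embedder.IndexEmbedder(model_name=...)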
/src/models/instructor_embedder.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer
2 | from typing import Dict
3 | import torch
4 |
5 |
6 | class IndexEmbedder(torch.nn.Module):
7 |     def __init__(self, model_name) -> None:
8 |         super().__init__()
9 |         self.embedder = SentenceTransformer(model_name)
10 |
11 |     def forward(self, instruction, enc_text, **kwargs) -> Dict[str, torch.Tensor]:
12 |         input = [[i, e, 0] for i, e in zip(instruction, enc_text)]
13 |         enc_emb = self.embedder.encode(input, show_progress_bar=False)
14 |         return enc_emb
--------------------------------------------------------------------------------
/DPR/conf/train/extractive_reader_default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | eval_step: 2000
4 | batch_size: 16
5 | dev_batch_size: 72
6 | adam_eps: 1e-8
7 | adam_betas: (0.9, 0.999)
8 | max_grad_norm: 1.0
9 | log_batch_step: 100
10 | train_rolling_loss_step: 100
11 | weight_decay: 0.0
12 | learning_rate: 1e-5
13 |
14 | # Linear warmup over warmup_steps.
15 | warmup_steps: 0
16 |
17 | # Number of updates steps to accumulate before performing a backward/update pass.
18 | gradient_accumulation_steps: 1
19 |
20 | # Total number of training epochs to perform.
21 | num_train_epochs: 100000
22 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/requirements_core.txt:
--------------------------------------------------------------------------------
1 | # python 3.8.5
2 |
3 | allennlp==2.0.1
4 | allennlp-models==2.0.1
5 | dash==1.17.0 # optuna visualization
6 | edit-distance==1.0.4
7 | inflect==4.1.0
8 | lxml==4.5.2
9 | matplotlib==3.3.2
10 | more-itertools==8.5.0
11 | networkx==2.5
12 | nltk==3.5
13 | neuralcoref==4.0
14 | optuna==2.3.0
15 | ortools==8.0.8283
16 | pandas==1.1.3
17 | progressbar==2.5
18 | psutil==5.8.0
19 | tensorboard==2.3.0
20 | torch==1.7.1
21 | transformers==4.2.2
22 |
23 | # python -m spacy download en_core_web_sm
24 | # python -c "import nltk; nltk.download('wordnet')"
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/converters/base_spans_dep_to_qdmr_converter.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | import networkx as nx
4 |
5 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies
6 | from qdecomp_with_dependency_graphs.evaluation.decomposition import Decomposition
7 |
8 |
9 | class BaseSpansDepToQdmrConverter(ABC):
10 |     @abstractmethod
11 |     def convert(self, spans_dependencies: SpansDependencies) -> Decomposition:
12 |         raise NotImplementedError()
13 |
--------------------------------------------------------------------------------
/configs/knn_finder.yaml:
--------------------------------------------------------------------------------
1 | index_path: ???
2 | output_path: ???
3 | dataset_split: ???
4 | setup_type: ???
5 | task_name: ???
6 | model_name: 'sentence-transformers/paraphrase-mpnet-base-v2'
7 | cuda_device: ???
8 | instruction: False
9 | batch_size: 50
10 | dataset_reader:
11 |   _target_: src.dataset_readers.indexer_dsr.IndexerDatasetReader
12 |   task_name: ${task_name}
13 |   setup_type: ${setup_type}
14 |   dataset_split: ${dataset_split}
15 |   model_name: ${model_name}
16 |   instruction: ${instruction}
17 | model:
18 |   _target_: src.models.embedder.IndexEmbedder
19 |   model_name: ${model_name}
20 |
21 |
--------------------------------------------------------------------------------
/configs/scorer.yaml:
--------------------------------------------------------------------------------
1 | batch_size: 1
2 | model_name: "EleutherAI/gpt-neo-2.7B"
3 | # model_name: "EleutherAI/gpt-neo-125M"
4 | output_file: ???
5 | example_file: ???
6 | setup_type: ???
7 | task_name: ???
8 | sort: True
9 |
10 | dataset_reader:
11 |   _target_: src.dataset_readers.scorer_dsr.ScorerDatasetReader
12 |   example_file: ${example_file}
13 |   task_name: ${task_name}
14 |   model_name: ${model_name}
15 |   setup_type: ${setup_type}
16 | model:
17 |   _target_: transformers.AutoModelForCausalLM.from_pretrained
18 |   pretrained_model_name_or_path: ${model_name}
19 |   local_files_only: True
20 |
21 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/base_tokens_dep_to_qdmr_extractor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Tuple
3 |
4 | from qdecomp_with_dependency_graphs.evaluation.decomposition import Decomposition
5 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import TokensDependencies
6 |
7 |
8 | class BaseTokensDependenciesToQDMRExtractor(ABC):
9 |     @abstractmethod
10 |     def extract(self, tokens_dependencies: TokensDependencies, debug: dict = None) -> Decomposition:
11 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/aligners/base_aligner.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 | from typing import List, Tuple, Set
4 |
5 | import spacy
6 | from spacy.tokens.doc import Doc
7 |
8 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans
9 |
10 |
11 | class BaseAligner(ABC):
12 |     def align(self, question: Doc, steps: List[Doc], steps_operators: List[QDMROperation],
13 |               index_to_steps: List[Set[Tuple[int, int]]]) -> List[Set[Tuple[int, int]]]:
14 |         raise NotImplementedError()
15 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/cache.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from abc import ABC, abstractmethod
5 | from typing import Optional
6 |
7 |
8 | class CacheClient(ABC):
9 |     async def __aenter__(self):
10 |         pass
11 |
12 |     async def __aexit__(self, exc_type, exc_value, traceback):
13 |         pass
14 |
15 |     @abstractmethod
16 |     async def get(self, args: dict) -> Optional[dict]:
17 |         pass
18 |
19 |     @abstractmethod
20 |     async def upload(self, args: dict, result: dict) -> None:
21 |         pass
22 |
--------------------------------------------------------------------------------
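`CacheClient` is an async abstract base class; concrete clients only have to implement `get` and `upload`. A minimal in-memory sketch, illustrative only (the repo's real clients presumably talk to an external cache service; the import path assumes the src/ layout is installed):

    import asyncio
    import json
    from typing import Optional

    from semantic_parsing_with_constrained_lm.cache import CacheClient

    class DictCacheClient(CacheClient):
        """Stores results in a plain dict keyed by the JSON-serialized args."""

        def __init__(self) -> None:
            self._store: dict = {}

        async def get(self, args: dict) -> Optional[dict]:
            return self._store.get(json.dumps(args, sort_keys=True))

        async def upload(self, args: dict, result: dict) -> None:
            self._store[json.dumps(args, sort_keys=True)] = result

    async def demo() -> None:
        cache = DictCacheClient()
        await cache.upload({"prompt": "2+2="}, {"completion": "4"})
        print(await cache.get({"prompt": "2+2="}))   # {'completion': '4'}

    asyncio.run(demo())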
/configs/client.yaml:
--------------------------------------------------------------------------------
1 | # cwd: ???
2 |
3 | # model_name: 'google/t5-v1_1-xl'
4 | model_name: "EleutherAI/gpt-neo-2.7B"
5 | # model_name: "EleutherAI/gpt-neo-125M"
6 | engine: "ada"
7 | output_file: ???
8 | batch_size: 5
9 | # length_file: ???
10 | prompt_file: ???
11 | max_length: 2048
12 | num_prompts: -1
13 | task_name: ???
14 |
15 | dataset_reader:
16 |   _target_: src.dataset_readers.few_shot_dsr.FewShotDatasetReader
17 |   model_name: ${model_name}
18 |   task_name: ${task_name}
19 |   # _target_: src.dataset_readers.tasks.break_task.BreakTask
20 |   prompt_file: ${prompt_file}
21 |   num_prompts: ${num_prompts}
22 |   # length_file: ${length_file}
23 |
--------------------------------------------------------------------------------
/easy-elasticsearch/easy_elasticsearch/examples/download_and_run.sh:
--------------------------------------------------------------------------------
1 | #### Downloading ####
2 | ES=./elasticsearch-7.9.1/bin/elasticsearch
3 | if test -f "$ES"; then
4 |     echo "$ES exists. Using the existent one"
5 | else
6 |     echo "$ES does not exist. Downloading a new one"
7 |     wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.1-linux-x86_64.tar.gz
8 |     tar -xf elasticsearch-7.9.1-linux-x86_64.tar.gz
9 | fi
10 |
11 | #### Starting the ES service ####
12 | nohup ./elasticsearch-7.9.1/bin/elasticsearch > elasticsearch.log &
13 |
14 | #### Run the example ####
15 | python -m easy_elasticsearch.examples.quora --mode existing
16 |
--------------------------------------------------------------------------------
/configs/api_scorer.yaml:
--------------------------------------------------------------------------------
1 | # cwd: ???
2 |
3 | # model_name: 'google/t5-v1_1-xl'
4 | model_name: "EleutherAI/gpt-neo-2.7B"
5 | # model_name: "EleutherAI/gpt-neo-125M"
6 | engine: "ada"
7 | output_file: ???
8 | batch_size: 5
9 | # length_file: ???
10 | example_file: ???
11 | setup_type: qa
12 | max_length: 2048
13 | task_name: ???
14 |
15 | dataset_reader:
16 | _target_: src.dataset_readers.scorer_dsr.ScorerDatasetReader
17 | model_name: ${model_name}
18 | task_name: ${task_name}
19 | # _target_: src.dataset_readers.tasks.break_task.BreakTask
20 | # prompt_file: ${prompt_file}
21 | setup_type: ${setup_type}
22 | example_file: ${example_file}
23 | # length_file: ${length_file}
24 |
--------------------------------------------------------------------------------
/break_evaluator/allennlp_preds_format.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | def main():
5 | # todo: @@SEP@@ to ; , @@#@@ to #
6 | predictions_file = "old_data_dev_low_level_preds.json"
7 |     target_file = predictions_file.replace('.json', '.csv')
8 |     with open(predictions_file, "r") as fd:
9 |         preds = [json.loads(line) for line in fd.readlines()]
10 |         preds = [re.sub(r'@@(\d+)@@', r'#\g<1>', re.sub('@@SEP@@', ';', ' '.join(p['predicted_tokens'][0]))) for p in preds]
11 |     preds.insert(0, 'prediction')
12 |     preds = [f'"{p}"\n' for p in preds]
13 |     with open(target_file, "wt") as fd:
14 | fd.writelines(preds)
15 |
16 |
17 | if __name__ == '__main__':
18 | main()
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/datum.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from dataclasses import dataclass
5 | from typing import Optional, TypeVar
6 |
7 |
8 | @dataclass(frozen=True, eq=True)
9 | class Datum:
10 | dialogue_id: Optional[str]
11 | turn_part_index: Optional[int]
12 | agent_context: Optional[str]
13 | natural: str
14 |
15 |
16 | @dataclass(frozen=True, eq=True)
17 | class FullDatum(Datum):
18 | canonical: str
19 |
20 |
21 | FullDatumSub = TypeVar("FullDatumSub", bound=FullDatum, contravariant=True)
22 | DatumSub = TypeVar("DatumSub", bound=Datum, contravariant=True)
23 |
--------------------------------------------------------------------------------
/dataflow/core/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 |
5 | class SpecialStrings:
6 | """Special strings in stringified turn parts.
7 | """
8 |
9 | # an empty value (we need it since some library doesn't like an empty string)
10 | NULL = "__NULL"
11 | # indicates there is a break between the two utterance segments
12 | BREAK = "__BREAK"
13 | # indicates the user is the speaker for the following utterance
14 | SPEAKER_USER = "__User"
15 | # indicates the agent is the speaker for the following utterance
16 | SPEAKER_AGENT = "__Agent"
17 | # start of a program
18 | START_OF_PROGRAM = "__StartOfProgram"
19 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/string_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from typing import Iterable
5 |
6 |
7 | def detokenize(tokens: Iterable[str], with_treebank: bool = True) -> str:
8 | """
9 | Given a list of tokens, join them together into a string.
10 | with_treebank = True is typically used when rendering utterances, so we don't need to deal with things like
11 | "andrew's"
12 | with_treebank = False is typically for rendering express.
13 | """
14 | if with_treebank:
15 |         return " ".join(tokens).replace("  ", " ")
16 |
17 | return "".join(tokens)
18 |
--------------------------------------------------------------------------------
/src/models/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from transformers import AutoTokenizer, AutoModelForCausalLM
4 |
5 | def no_init(loading_code):
6 |     """Run `loading_code` with torch's default weight initialization disabled.
7 |
8 |     Temporarily replaces `reset_parameters` on common modules with a no-op so
9 |     that `from_pretrained` does not spend time randomly initializing weights
10 |     that are immediately overwritten by the checkpoint.
11 |     """
12 |     def dummy(self):
13 |         return
14 |
15 |     modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
16 |     original = {}
17 |     for mod in modules:
18 |         original[mod] = mod.reset_parameters
19 |         mod.reset_parameters = dummy
20 |
21 |     result = loading_code()
22 |     for mod in modules:
23 |         mod.reset_parameters = original[mod]
24 |
25 |     return result
26 |
27 |
28 | def get_model(**kwargs):
29 |     # local_files_only=True: the pretrained weights must already be cached locally.
30 |     return no_init(lambda: AutoModelForCausalLM.from_pretrained(**kwargs, local_files_only=True))
--------------------------------------------------------------------------------
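For reference, configs/inference.yaml (shown later in this dump) instantiates this module via Hydra with `_target_: src.models.model.get_model`. A direct call amounts to roughly the sketch below; the model name is taken from those configs, and the weights are assumed to already be cached locally since `local_files_only=True` is set.

from src.models.model import get_model

# Roughly what Hydra does for configs/inference.yaml.
model = get_model(pretrained_model_name_or_path="EleutherAI/gpt-neo-2.7B")
model.eval()
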
/qdecomp_with_dependency_graphs/debug.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import shutil
3 |
4 | # pass the command to run as the script's parameters
5 | from allennlp.commands import main
6 |
7 | sys.argv = sys.argv[1:]  # remove the script name
8 |
9 | serialization_dir = "tmp/debugger_train"
10 |
11 | if "train" in sys.argv:
12 | sys.argv.extend(["-s", serialization_dir])
13 |
14 | # Training will fail if the serialization directory already
15 | # has stuff in it. If you are running the same training loop
16 | # over and over again for debugging purposes, it will.
17 | # Hence we wipe it out in advance.
18 | # BE VERY CAREFUL NOT TO DO THIS FOR ACTUAL TRAINING!
19 | shutil.rmtree(serialization_dir, ignore_errors=True)
20 |
21 | main()
--------------------------------------------------------------------------------
/DPR/conf/train/biencoder_nq.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | batch_size: 16
4 | dev_batch_size: 64
5 | adam_eps: 1e-8
6 | adam_betas: (0.9, 0.999)
7 | max_grad_norm: 2.0
8 | log_batch_step: 100
9 | train_rolling_loss_step: 100
10 | weight_decay: 0.0
11 | learning_rate: 2e-5
12 |
13 | # Linear warmup over warmup_steps.
14 | warmup_steps: 1237
15 |
16 | # Number of updates steps to accumulate before performing a backward/update pass.
17 | gradient_accumulation_steps: 1
18 |
19 | # Total number of training epochs to perform.
20 | num_train_epochs: 40
21 | eval_per_epoch: 1
22 | hard_negatives: 1
23 | other_negatives: 0
24 | val_av_rank_hard_neg: 30
25 | val_av_rank_other_neg: 30
26 | val_av_rank_bsz: 128
27 | val_av_rank_max_qs: 10000
--------------------------------------------------------------------------------
/DPR/conf/train/biencoder_default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | batch_size: 2
4 | dev_batch_size: 4
5 | adam_eps: 1e-8
6 | adam_betas: (0.9, 0.999)
7 | max_grad_norm: 1.0
8 | log_batch_step: 100
9 | train_rolling_loss_step: 100
10 | weight_decay: 0.0
11 | learning_rate: 1e-5
12 |
13 | # Linear warmup over warmup_steps.
14 | warmup_steps: 100
15 |
16 | # Number of updates steps to accumulate before performing a backward/update pass.
17 | gradient_accumulation_steps: 1
18 |
19 | # Total number of training epochs to perform.
20 | num_train_epochs: 40
21 | eval_per_epoch: 1
22 | hard_negatives: 1
23 | other_negatives: 0
24 | val_av_rank_hard_neg: 30
25 | val_av_rank_other_neg: 30
26 | val_av_rank_bsz: 128
27 | val_av_rank_max_qs: 10000
--------------------------------------------------------------------------------
/break_evaluator/Dockerfile:
--------------------------------------------------------------------------------
1 | # Evaluator for Break dataset on beaker
2 |
3 | FROM python:3.7.6-slim-buster
4 |
5 | ENV PYTHONPATH .
6 |
7 | # set the working directory
8 |
9 | WORKDIR /break-evaluator
10 |
11 |
12 | # install python packages
13 |
14 | ADD ./requirements.txt .
15 |
16 | RUN pip3.7 install -r requirements.txt
17 | RUN python3.7 -m spacy download en_core_web_sm
18 |
19 |
20 | # add in the readme and evaluation scripts
21 |
22 | ADD README.md .
23 | ADD allennlp_preds_format.py .
24 | COPY evaluation ./evaluation
25 | COPY scripts ./scripts
26 | COPY utils ./utils
27 |
28 | RUN mkdir /results
29 |
30 |
31 | # define the default command
32 | # in this case a linux shell where we can run the eval script
33 | CMD ["/bin/bash"]
34 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/quoted.scfg:
--------------------------------------------------------------------------------
1 | # Quoted strings *do* begin with a space in this grammar.
2 | # For example, `create event with " Rose"`.
3 | # The space has to be a regex, b/c it gets consumed by CopyTokens,
4 | # and it has to not be inside nonquoteplus, because it doesn't
5 | # appear on the plan side.
6 | quoted -> !"\"" !/ / nonquoteplus !"\"", "\"" nonquoteplus "\""
7 |
8 | # matches one or more characters that are not double quotes
9 | nonquoteplus -> !/[^"]/ nonquotestar, /[^"]/ nonquotestar
10 |
11 | # matches zero or more characters that are not double quotes
12 | nonquotestar -> !/[^"]/ nonquotestar, /[^"]/ nonquotestar
13 | nonquotestar -> empty, empty
14 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/test_read_grammar.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import pytest
5 |
6 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.read_grammar import PreprocessedGrammar
7 |
8 |
9 | def test_from_line_iter():
10 | with pytest.raises(AssertionError) as excinfo:
11 | PreprocessedGrammar.from_line_iter(
12 | ['describe 2> "describe"', 'describe 2> "describe(" ")"']
13 | )
14 | assert "Macro describe cannot be defined more than once" in str(excinfo)
15 | # Doesn't throw.
16 | PreprocessedGrammar.from_line_iter(['describe 2> "describe"'])
17 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/allennlp_preds_format.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | def main():
5 | # todo: @@SEP@@ to ; , @@#@@ to #
6 | predictions_file = "old_data_dev_low_level_preds.json"
7 |     target_file = predictions_file.replace('.json', '.csv')
8 |     with open(predictions_file, "r") as fd:
9 |         preds = [json.loads(line) for line in fd.readlines()]
10 |         preds = [re.sub(r'@@(\d+)@@', r'#\g<1>', re.sub('@@SEP@@', ';', ' '.join(p['predicted_tokens'][0]))) for p in preds]
11 |     preds.insert(0, 'prediction')
12 |     preds = [f'"{p}"\n' for p in preds]
13 |     with open(target_file, "wt") as fd:
14 | fd.writelines(preds)
15 |
16 |
17 | if __name__ == '__main__':
18 | main()
--------------------------------------------------------------------------------
/break_evaluator/evaluate.yaml:
--------------------------------------------------------------------------------
1 |
2 | description: Run the evaluator for the Break dataset.
3 | tasks:
4 | - spec:
5 | blueprint: $BREAK_EVALUATOR
6 | resultPath: /results
7 | args:
8 | - PYTHONPATH="."
9 | - python3.7
10 | - scripts/evaluate_predictions.py
11 | - --dataset_file=data/labels.csv
12 | - --preds_file=data/predictions.csv
13 | - --no_cache
14 | - --output_file_base=/results/results
15 | - --metrics
16 | - ged_scores exact_match sari normalized_exact_match
17 | datasetMounts:
18 | - datasetId: $BREAK_PREDICTIONS
19 | containerPath: /data/predictions
20 | - datasetId: $BREAK_LABELS
21 | containerPath: /data/labels.csv
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/util/missing_sentinel.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | class MissingSentinel:
5 | """One instance of this is created as MISSING_SENTINEL below.
6 |
7 | That instance is used to indicate that a variable lacks a value, and nothing else.
8 |
9 | Usually None is used for this purpose, but sometimes None is in the valid
10 | set of values and cannot be used to mean that a value is missing.
11 |
12 | This is very similar to dataclasses.MISSING, but that value has a private type."""
13 |
14 | def __repr__(self) -> str:
15 | return ""
16 |
17 |
18 | MISSING_SENTINEL = MissingSentinel()
19 |
--------------------------------------------------------------------------------
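A hypothetical usage sketch (not from the repository) showing why MISSING_SENTINEL is preferred over None as a default when None itself is a legal value:

from typing import Optional, Union

from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.util.missing_sentinel import (
    MISSING_SENTINEL,
    MissingSentinel,
)


def set_limit(limit: Union[Optional[int], MissingSentinel] = MISSING_SENTINEL) -> str:
    # None is meaningful here ("no limit"), so it cannot double as "not given".
    if limit is MISSING_SENTINEL:
        return "limit unchanged"
    return "limit removed" if limit is None else f"limit set to {limit}"
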
/src/data/datasets/totto.py:
--------------------------------------------------------------------------------
1 | import json
2 | import datasets
3 |
4 | class ToTToDataset:
5 | def __init__(self):
6 | _URL = "https://storage.googleapis.com/totto-public/totto_data.zip"
7 | dl_manager = datasets.utils.download_manager.DownloadManager()
8 | self.cache_path = dl_manager.download_and_extract(_URL)
9 | self.splits = {}
10 | for split_name in ["train","dev"]:
11 | with open(f"{self.cache_path}/totto_data/totto_{split_name}_data.jsonl", 'r') as f:
12 |                 processed_dataset = []
13 |                 for example in f:
14 |                     dict_example = json.loads(example)
15 |                     processed_dataset.append(dict_example)
16 |                 self.splits[split_name] = processed_dataset
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/test_input.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.earley.input import SigmaStarTriePosition
5 |
6 |
7 | def test_sigmastar():
8 | p = SigmaStarTriePosition[str]()
9 | (a_1,) = p.scan("a")
10 | (a_2,) = p.scan("a")
11 | assert id(a_1) == id(a_2), "scans should be cached and reused"
12 |
13 | (as_1,) = a_1.scan("s")
14 | (asd,) = as_1.scan("d")
15 | (asdf,) = asd.scan("f")
16 | assert asdf.last() == "f"
17 | assert asdf.prefix() == ["a", "s", "d", "f"]
18 |
19 | (asde,) = asd.scan("e")
20 | assert asde.prefix() == ["a", "s", "d", "e"]
21 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/Dockerfile:
--------------------------------------------------------------------------------
1 | # Evaluator for Break dataset on beaker
2 |
3 | FROM python:3.7.6-slim-buster
4 |
5 | ENV PYTHONPATH .
6 |
7 | # set the working directory
8 |
9 | WORKDIR /break-evaluator
10 |
11 |
12 | # install python packages
13 |
14 | ADD ./requirements.txt .
15 |
16 | RUN pip3.7 install -r requirements.txt
17 | RUN python3.7 -m spacy download en_core_web_sm
18 |
19 |
20 | # add in the readme and evaluation scripts
21 |
22 | ADD README.md .
23 | ADD allennlp_preds_format.py .
24 | COPY evaluation ./evaluation
25 | COPY scripts ./scripts
26 | COPY utils ./utils
27 |
28 | RUN mkdir /results
29 |
30 |
31 | # define the default command
32 | # in this case a linux shell where we can run the eval script
33 | CMD ["/bin/bash"]
34 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/evaluate.yaml:
--------------------------------------------------------------------------------
1 |
2 | description: Run the evaluator for the Break dataset.
3 | tasks:
4 | - spec:
5 | blueprint: $BREAK_EVALUATOR
6 | resultPath: /results
7 | args:
8 | - PYTHONPATH="."
9 | - python3.7
10 | - scripts/evaluate_predictions.py
11 | - --dataset_file=data/labels.csv
12 | - --preds_file=data/predictions.csv
13 | - --no_cache
14 | - --output_file_base=/results/results
15 | - --metrics
16 | - ged_scores exact_match sari normalized_exact_match
17 | datasetMounts:
18 | - datasetId: $BREAK_PREDICTIONS
19 | containerPath: /data/predictions
20 | - datasetId: $BREAK_LABELS
21 | containerPath: /data/labels.csv
--------------------------------------------------------------------------------
/DPR/conf/encoder/hf_bert.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | # model type. One of [hf_bert, pytext_bert, fairseq_roberta]
4 | encoder_model_type: hf_bert
5 |
6 | # HuggingFace's config name for model initialization
7 | pretrained_model_cfg: bert-base-uncased
8 | #pretrained_model_cfg: Luyu/co-condenser-marco
9 |
10 | # Some encoders need to be initialized from a file
11 | pretrained_file:
12 |
13 | # Extra linear layer on top of standard bert/roberta encoder
14 | projection_dim: 0
15 |
16 | # Max length of the encoder input sequence
17 | sequence_length: 256
18 |
19 | dropout: 0.1
20 |
21 | # whether to fix (i.e., not update) the context encoder during training
22 | fix_ctx_encoder: False
23 |
24 | # if False, the model won't load pre-trained BERT weights
25 | pretrained: True
26 |
27 | gradient_checkpointing: False
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_collapser import BaseCollapser
2 | from .join_collapser import JoinCollapser
3 | from .concat_collapser import ConcatCollapser
4 | from .missing_resources_collapser import MissingResourcesCollapser
5 | from .last_step_collapser import LastStepCollapser
6 | from .to_dependency_type_collapser import ToDependencyTypeCollapser
7 | from .single_to_multiple_steps_pre_collapser import PreSingleToMultipleStepsCollapser
8 | from .not_aligned_dum_collapser import NotAlignedDumCollapser
9 | from .single_to_multiple_steps_dup_collapser import DupSingleToMultipleStepsCollapser
10 | from .add_operator_properties_collapser import AddOperatorsPropertiesCollapser
11 | from .to_sequential_ids_collapser import ToSequentialIdsCollapser
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/to_dependency_type_collapser.py:
--------------------------------------------------------------------------------
1 | from overrides import overrides
2 |
3 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import DependencyType, SpansDependencies
4 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser
5 |
6 |
7 | class ToDependencyTypeCollapser(BaseCollapser):
8 | @overrides
9 |     def collapse(self, spans_dependencies: SpansDependencies, decomposition: str = None) -> None:
10 | pass
11 |
12 | @overrides
13 | def unwind(self, spans_dependencies: SpansDependencies) -> None:
14 | for _, _, data in spans_dependencies.dependencies():
15 | data.dep_type = DependencyType(data.dep_type)
16 |
--------------------------------------------------------------------------------
/src/models/embedder.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModel
2 | from typing import Dict
3 | import torch
4 |
5 | def mean_pooling(model_output, attention_mask):
6 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings
7 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
8 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
9 |
10 | class IndexEmbedder(torch.nn.Module):
11 | def __init__(self, model_name) -> None:
12 | super().__init__()
13 | self.embedder = AutoModel.from_pretrained(model_name)
14 |
15 |     def forward(self, input_ids, attention_mask, **kwargs) -> Dict[str, torch.Tensor]:
16 |         enc_emb = self.embedder(input_ids, attention_mask=attention_mask)  # mask out padded positions
17 |         return mean_pooling(enc_emb, attention_mask)
--------------------------------------------------------------------------------
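A usage sketch for IndexEmbedder; the model name below is only an example, the repository configures it elsewhere.

import torch
from transformers import AutoTokenizer

from src.models.embedder import IndexEmbedder

model_name = "bert-base-uncased"  # example only
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedder = IndexEmbedder(model_name)

batch = tokenizer(["hello world", "a longer second sentence"],
                  padding=True, return_tensors="pt")
with torch.no_grad():
    emb = embedder(batch["input_ids"], batch["attention_mask"])
print(emb.shape)  # (2, hidden_size)
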
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/test_string_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.parser.utils import is_skippable
5 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.string_utils import detokenize
6 |
7 |
8 | def test_is_comment():
9 | assert is_skippable("#hi")
10 | assert is_skippable("")
11 | assert not is_skippable("hi")
12 |
13 |
14 | def test_detokenize():
15 | assert (
16 | detokenize(["find", "Event", "time", ".", "results", "chris", "'s", "car"])
17 | == "find Event time . results chris 's car"
18 | )
19 |
20 | assert detokenize(["f", "(", "x", ",", "y", ")"], with_treebank=False) == "f(x,y)"
21 |
--------------------------------------------------------------------------------
/DPR/conf/train/biencoder_local.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | # batch_size: 40
4 | batch_size: 120
5 | dev_batch_size: 32
6 | adam_eps: 1e-8
7 | adam_betas: (0.9, 0.999)
8 | max_grad_norm: 2.0
9 | log_batch_step: 1
10 | train_rolling_loss_step: 100
11 | weight_decay: 0.0
12 | # learning_rate: 2e-5
13 | # learning_rate: 0.000213
14 | learning_rate: 0.00013416407864998739
15 | # learning_rate: 0.0001065
16 |
17 | # encoder:
18 | # dropout: 0.15
19 |
20 | # Linear warmup over warmup_steps.
21 | warmup_steps: 1237
22 |
23 | # Number of updates steps to accumulate before performing a backward/update pass.
24 | gradient_accumulation_steps: 1
25 |
26 | # Total number of training epochs to perform.
27 | num_train_epochs: 30
28 | eval_per_epoch: 1
29 | hard_negatives: 1
30 | other_negatives: 0
31 | val_av_rank_hard_neg: 30
32 | val_av_rank_other_neg: 30
33 | val_av_rank_bsz: 128
34 | val_av_rank_max_qs: 10000
35 |
--------------------------------------------------------------------------------
/src/utils/log_utils.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | import logging
3 |
4 | @contextmanager
5 | def all_logging_disabled(highest_level=logging.CRITICAL):
6 | """
7 | A context manager that will prevent any logging messages
8 | triggered during the body from being processed.
9 | :param highest_level: the maximum logging level in use.
10 | This would only need to be changed if a custom level greater than CRITICAL
11 | is defined.
12 | """
13 | # two kind-of hacks here:
14 | # * can't get the highest logging level in effect => delegate to the user
15 | # * can't get the current module-level override => use an undocumented
16 | # (but non-private!) interface
17 |
18 | previous_level = logging.root.manager.disable
19 |
20 | logging.disable(highest_level)
21 |
22 | try:
23 | yield
24 | finally:
25 | logging.disable(previous_level)
--------------------------------------------------------------------------------
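Usage sketch for the context manager above:

import logging

from src.utils.log_utils import all_logging_disabled

logging.basicConfig(level=logging.INFO)

with all_logging_disabled():
    logging.getLogger("noisy.dependency").warning("suppressed inside the block")

logging.getLogger(__name__).info("visible again after the block")
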
/qdecomp_with_dependency_graphs/scripts/qdmr_to_logical_form/utils_.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | DELIMITER = ';'
4 | REF = '#'
5 |
6 |
7 | def parse_decomposition(qdmr):
8 | """Parses the decomposition into an ordered list of steps
9 |
10 | Parameters
11 | ----------
12 | qdmr : str
13 | String representation of the QDMR
14 |
15 | Returns
16 | -------
17 | list
18 | returns ordered list of qdmr steps
19 | """
20 | # parse commas as separate tokens
21 | qdmr = qdmr.replace(",", " , ")
22 | crude_steps = qdmr.split(DELIMITER)
23 | steps = []
24 | for i in range(len(crude_steps)):
25 | step = crude_steps[i]
26 | tokens = step.split()
27 | step = ""
28 | # remove 'return' prefix
29 | for tok in tokens[1:]:
30 | step += tok.strip() + " "
31 | step = step.strip()
32 | steps += [step]
33 | return steps
--------------------------------------------------------------------------------
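A worked example of parse_decomposition; the QDMR string is illustrative, not taken from the dataset, and the import path assumes the scripts directory is importable as a package.

from qdecomp_with_dependency_graphs.scripts.qdmr_to_logical_form.utils_ import parse_decomposition

qdmr = "return parties ;return number of #1 ;return #2 that is highest"
print(parse_decomposition(qdmr))
# ['parties', 'number of #1', '#2 that is highest']
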
/src/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | import types
2 | import pathlib
3 | from os.path import dirname, isfile, join
4 | import os
5 | import glob
6 | import json
7 |
8 |
9 | modules = {}
10 | modules_list = glob.glob(join(dirname(__file__), "*.py"))
11 | for path in modules_list:
12 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('__main__.py'):
13 | mod_name = pathlib.Path(path).name[:-3]
14 | module = types.ModuleType(mod_name)
15 | with open(path) as f:
16 | module_str = f.read()
17 | exec(module_str, module.__dict__)
18 | modules[mod_name] = module
19 |
20 | dataset_dict = {}
21 | for module_name, module in modules.items():
22 | for el in dir(module):
23 | if el.endswith("Dataset"):
24 | obj = module.__dict__[el]
25 | dataset_dict[module_name] = obj
26 |
27 | def get_dataset(name):
28 | return dataset_dict[name]()
29 |
--------------------------------------------------------------------------------
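The loop above registers every *Dataset class under its module name, so totto.py's ToTToDataset is available as "totto". A usage sketch (the first call downloads and caches the ToTTo data):

from src.data.datasets import get_dataset

totto = get_dataset("totto")  # instantiates ToTToDataset
print(len(totto.splits["train"]), len(totto.splits["dev"]))
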
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/base_collapser.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Tuple
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies, DependencyType
5 |
6 |
7 | class BaseCollapser(ABC):
8 |     """
9 |     Deal with empty spans in SpansDependencies graph
10 |     """
11 |     def __init__(self):
12 |         self.additional_tokens = []
13 |
14 |     @abstractmethod
15 |     def collapse(self, spans_dependencies: SpansDependencies, decomposition: str) -> None:
16 |         raise NotImplementedError()
17 |
18 | @abstractmethod
19 | def unwind(self, spans_dependencies: SpansDependencies) -> None:
20 | raise NotImplementedError()
21 |
22 | @staticmethod
23 | def _get_operator(x: str):
24 | return DependencyType(x).get_operator() if DependencyType.has_value(x) else x
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/data_processing/add_extra_tokens.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pandas as pd
3 | from os import path
4 |
5 | from qdecomp_with_dependency_graphs.dependencies_graph.create_dependencies_graphs import get_extra_tokens
6 |
7 |
8 | def main(root_dir: str):
9 | files = Path(root_dir).rglob('*_seq2seq.csv')
10 | extra_tokens, _ = get_extra_tokens()
11 | extra_tokens = ' '.join(extra_tokens)
12 | for fp in files:
13 | fp = str(fp)
14 | try:
15 | print(f'process {fp}...')
16 | df = pd.read_csv(fp)
17 | df['question_text'] = df['question_text'].apply(lambda x: f'{x} {extra_tokens}')
18 | dst_fp = path.splitext(fp)[0]+'__extra_tok.csv'
19 | df.to_csv(dst_fp, index=False)
20 | except Exception as ex:
21 | print(f'ERROR: {ex}')
22 |
23 | if __name__ == '__main__':
24 |     main(root_dir='datasets/Break/QDMR/')
--------------------------------------------------------------------------------
/break_evaluator/utils/timeout.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 |
4 | import sys
5 | import threading
6 |
7 |
8 | try:
9 | import thread
10 | except ImportError:
11 | import _thread as thread
12 |
13 |
14 | def quit_function(fn_name):
15 | print('{0} took too long'.format(fn_name), file=sys.stderr)
16 | sys.stderr.flush()
17 | # raises KeyboardInterrupt
18 | thread.interrupt_main()
19 |
20 |
21 | def exit_after(s):
22 | """
23 | use as decorator to exit process if
24 | function takes longer than s seconds
25 | """
26 | def outer(fn):
27 | def inner(*args, **kwargs):
28 | timer = threading.Timer(s, quit_function, args=[fn.__name__])
29 | timer.start()
30 | try:
31 | result = fn(*args, **kwargs)
32 | finally:
33 | timer.cancel()
34 | return result
35 | return inner
36 | return outer
37 |
38 |
--------------------------------------------------------------------------------
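Usage sketch for exit_after, assuming break_evaluator's own layout where utils/ is importable from the repository root (as in its Dockerfile, which sets PYTHONPATH to "."):

import time

from utils.timeout import exit_after


@exit_after(2)
def slow_step():
    time.sleep(10)  # interrupted after ~2 seconds via KeyboardInterrupt


try:
    slow_step()
except KeyboardInterrupt:
    print("slow_step timed out")
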
/semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/test_agenda.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.earley.agenda import Agenda, Meta
5 |
6 |
7 | def test_push_pop():
8 | a = Agenda()
9 | z = Meta.zero()
10 | assert a.push(3, z)
11 | assert a.push(5, z)
12 | # duplicate should be ignored
13 | assert not a.push(3, z)
14 | assert a.popped == []
15 | assert a.remaining == [3, 5]
16 | assert a.pop() == 3
17 | assert a.popped == [3]
18 | assert a.remaining == [5]
19 | # duplicate should be ignored
20 | assert not a.push(3, z)
21 | assert a.push(7, z)
22 |
23 | assert a.popped == [3]
24 | assert a.remaining == [5, 7]
25 |
26 | def it():
27 | while a:
28 | yield a.pop()
29 |
30 | assert list(it()) == [5, 7]
31 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/utils/timeout.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 |
4 | import sys
5 | import threading
6 |
7 |
8 | try:
9 | import thread
10 | except ImportError:
11 | import _thread as thread
12 |
13 |
14 | def quit_function(fn_name):
15 | print('{0} took too long'.format(fn_name), file=sys.stderr)
16 | sys.stderr.flush()
17 | # raises KeyboardInterrupt
18 | thread.interrupt_main()
19 |
20 |
21 | def exit_after(s):
22 | """
23 | use as decorator to exit process if
24 | function takes longer than s seconds
25 | """
26 | def outer(fn):
27 | def inner(*args, **kwargs):
28 | timer = threading.Timer(s, quit_function, args=[fn.__name__])
29 | timer.start()
30 | try:
31 | result = fn(*args, **kwargs)
32 | finally:
33 | timer.cancel()
34 | return result
35 | return inner
36 | return outer
37 |
38 |
--------------------------------------------------------------------------------
/break_evaluator/tmp/results/decomp_summary.txt:
--------------------------------------------------------------------------------
1 | overall scores:
2 | ged score: mean 0.371 max 0.998 min 0.000
3 | normalized_exact_match score: mean 0.280 max 1.000 min 0.000
4 | skipped 9 examples when computing ged.
5 | ged normalized_exact_match
6 | dataset
7 | ATIS 0.209 0.417
8 | CLEVR 0.560 0.167
9 | COMQA 0.422 0.111
10 | CWQ 0.375 0.000
11 | DROP 0.237 0.333
12 | GEO 0.000 1.000
13 | NLVR2 0.454 0.190
14 | SPIDER 0.239 0.467
15 | ged normalized_exact_match
16 | num_steps
17 | 2 0.282 0.500
18 | 3 0.274 0.350
19 | 4 0.334 0.300
20 | 5 0.425 0.235
21 | 6 0.425 0.231
22 | 7 0.549 0.100
23 | 8 NaN 0.000
24 | 9 NaN 0.000
25 | 10 0.398 0.000
26 | 11 0.839 0.000
27 | 12 0.000 0.333
28 | 13 0.453 0.500
29 | 20 NaN 0.000
30 | {'GED': 0.3709896995734663, 'norm_EM': 0.28}
31 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/timeout.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 |
4 | import sys
5 | import threading
6 |
7 |
8 | try:
9 | import thread
10 | except ImportError:
11 | import _thread as thread
12 |
13 |
14 | def quit_function(fn_name):
15 | print('{0} took too long'.format(fn_name), file=sys.stderr)
16 | sys.stderr.flush()
17 | # raises KeyboardInterrupt
18 | thread.interrupt_main()
19 |
20 |
21 | def exit_after(s):
22 | """
23 | use as decorator to exit process if
24 | function takes longer than s seconds
25 | """
26 | def outer(fn):
27 | def inner(*args, **kwargs):
28 | timer = threading.Timer(s, quit_function, args=[fn.__name__])
29 | timer.start()
30 | try:
31 | result = fn(*args, **kwargs)
32 | finally:
33 | timer.cancel()
34 | return result
35 | return inner
36 | return outer
37 |
38 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/base_tokens_dependencies_extractor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any, Dict, List, Tuple
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import TokensDependencies, SpansDependencies
5 |
6 |
7 | class BaseTokensDependenciesExtractor(ABC):
8 | @abstractmethod
9 |     def extract(self, question_id: str, question: str, decomposition: str, operators: List[str] = None,
10 | debug: dict = None) -> TokensDependencies:
11 | raise NotImplementedError()
12 |
13 | def get_extra_tokens(self) -> List[str]:
14 | return []
15 |
16 | def to_spans_dependencies(self, tokens_dependencies: TokensDependencies,
17 | debug: dict = None) -> SpansDependencies:
18 | # spans dependencies graph
19 | spans_dependencies: SpansDependencies = tokens_dependencies.to_spans_dependencies()
20 | return spans_dependencies
21 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | import types
2 | import pathlib
3 | from os.path import dirname, isfile, join
4 | import os
5 | import glob
6 | import json
7 |
8 | modules = {}
9 | modules_list = glob.glob(join(dirname(__file__), "*.py"))
10 | for path in modules_list:
11 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'):
12 | mod_name = pathlib.Path(path).name[:-3]
13 | module = types.ModuleType(mod_name)
14 | with open(path) as f:
15 | module_str = f.read()
16 | exec(module_str, module.__dict__)
17 | modules[mod_name] = module
18 |
19 | task_list = {}
20 | for module_name, module in modules.items():
21 | for el in dir(module):
22 | if el.endswith("BM25Task"):
23 | obj = module.__dict__[el]
24 | task_list[obj.name] = obj
25 |
26 |
27 | class BM25Task:
28 | def __init__(self) -> None:
29 | pass
30 | @classmethod
31 |     def from_name(cls, name):
32 | return task_list[name]
33 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | import types
2 | import pathlib
3 | from os.path import dirname, isfile, join
4 | import os
5 | import glob
6 | import json
7 |
8 |
9 | modules = {}
10 | modules_list = glob.glob(join(dirname(__file__), "*.py"))
11 | for path in modules_list:
12 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'):
13 | mod_name = pathlib.Path(path).name[:-3]
14 | module = types.ModuleType(mod_name)
15 | with open(path) as f:
16 | module_str = f.read()
17 | exec(module_str, module.__dict__)
18 | modules[mod_name] = module
19 |
20 | task_list = {}
21 | for module_name, module in modules.items():
22 | for el in dir(module):
23 | if el.endswith("ScorerTask"):
24 | obj = module.__dict__[el]
25 | task_list[obj.name] = obj
26 |
27 |
28 | class ScorerTask:
29 | def __init__(self) -> None:
30 | pass
31 | @classmethod
32 |     def from_name(cls, name):
33 | return task_list[name]
34 |
--------------------------------------------------------------------------------
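As with the other task registries, classes are looked up by their `name` attribute; for example, e2e.py further below registers E2eScorerTask as "e2e". A usage sketch (the example file path is a placeholder):

from src.dataset_readers.scorer_tasks import ScorerTask

task_cls = ScorerTask.from_name("e2e")                      # -> E2eScorerTask
task = task_cls(example_file="path/to/e2e_examples.json")   # placeholder path
print(task.postfix)                                         # "Sentence: "
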
/semantic_parsing_with_constrained_lm/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "semantic_parsing_with_constrained_lm"
3 | version = "0.1.0"
4 | description = "Tools and instructions for reproducing the experiments in the paper Constrained Language Models Yield Few-Shot Semantic Parsers (EMNLP 2021)."
5 | authors = ["Microsoft Semantic Machines "]
6 |
7 | [tool.poetry.dependencies]
8 | python = "^3.7"
9 | matplotlib = "^3.1.0"
10 | torch = "1.6.0"
11 | pydantic = "^1.4"
12 | lark-parser = "^0.8.2"
13 | requests = "^2.20.1"
14 | cached-property = "^1.5.1"
15 | typer = "^0.3.0"
16 | jsons = "^0.10.1"
17 | more_itertools = "^8.2.0"
18 | transformers = "4.5.0"
19 | httpx = {version = "^0.16.1", extras = ["http2"]}
20 | datasets = "1.1.3"
21 | appdirs = "^1.4.4"
22 | sm-dataflow = {git = "https://github.com/microsoft/task_oriented_dialogue_as_dataflow_synthesis.git"}
23 | blobfile = "^1.2.5"
24 |
25 | [tool.poetry.dev-dependencies]
26 | pytest = "^4.3.1"
27 | black = "19.10b0"
28 | isort = "4.3.21"
29 | mypy = "0.782"
30 | pylint = "2.6.0"
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/__init__.py:
--------------------------------------------------------------------------------
1 | import types
2 | import pathlib
3 | from os.path import dirname, isfile, join
4 | import os
5 | import glob
6 | import json
7 |
8 | modules = {}
9 | modules_list = glob.glob(join(dirname(__file__), "*.py"))
10 | for path in modules_list:
11 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'):
12 | mod_name = pathlib.Path(path).name[:-3]
13 | module = types.ModuleType(mod_name)
14 | with open(path) as f:
15 | module_str = f.read()
16 | exec(module_str, module.__dict__)
17 | modules[mod_name] = module
18 |
19 | task_list = {}
20 | for module_name, module in modules.items():
21 | for el in dir(module):
22 | if el.endswith("InferenceTask"):
23 | obj = module.__dict__[el]
24 | task_list[obj.name] = obj
25 |
26 |
27 | class InferenceTask:
28 | def __init__(self) -> None:
29 | pass
30 | @classmethod
31 |     def from_name(cls, name):
32 | return task_list[name]
33 |
--------------------------------------------------------------------------------
/DPR/eval.sh:
--------------------------------------------------------------------------------
1 |
2 | python generate_dense_embeddings.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 \
3 | ctx_src=dpr_grail shard_id=0 num_shards=1 out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc
4 |
5 | python dense_retriever.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 qa_dataset=grailqa_train ctx_datatsets=[dpr_grail] \
6 | encoded_ctx_files=["/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc_*"] out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/dpr_pred_train_c29_lr1.json
7 | python dense_retriever.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 qa_dataset=grailqa_dev ctx_datatsets=[dpr_grail] \
8 | encoded_ctx_files=["/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc_*"] out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/dpr_pred_dev_c29_lr1.json
9 |
10 |
11 | python dpr/data/download_data.py --resource data.retriever.qas.trivia-dev
12 | [optional --output_dir {your location}]
13 |
14 |
15 | (grail) ohadr@pc-jonathan-g01:~/GrailSmBop/DPR$ head dpr_pred_dev.json -n 1000 |grep --color=always -e "^" -e true
16 |
--------------------------------------------------------------------------------
/easy-elasticsearch/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 | with open("README.md", "r", encoding="utf-8") as fh:
5 | readme = fh.read()
6 |
7 | setup(
8 | name="easy-elasticsearch",
9 | version="0.0.9",
10 | author="Kexin Wang",
11 | author_email="kexin.wang.2049@gmail.com",
12 | description="An easy-to-use Elasticsearch BM25 interface",
13 | long_description=readme,
14 | long_description_content_type="text/markdown",
15 | url="https://github.com/kwang2049/easy-elasticsearch",
16 | project_urls={
17 | "Bug Tracker": "https://github.com/kwang2049/easy-elasticsearch/issues",
18 | },
19 | packages=find_packages(),
20 | classifiers=[
21 | "Programming Language :: Python :: 3",
22 | "License :: OSI Approved :: Apache Software License",
23 | "Operating System :: OS Independent",
24 | ],
25 | python_requires=">=3.6",
26 | install_requires=[
27 | "elasticsearch>=7.9.1", # BeIR requires es==7.9.1
28 | "tqdm",
29 | "requests",
30 | ],
31 | )
32 |
--------------------------------------------------------------------------------
/configs/inference.yaml:
--------------------------------------------------------------------------------
1 | # cwd: ???
2 | batch_size: 1
3 |
4 | # model_name: 'google/t5-v1_1-xl'
5 | model_name: "EleutherAI/gpt-neo-2.7B"
6 | #model_name: "EleutherAI/gpt-j-6B"
7 | # model_name: "EleutherAI/gpt-neo-125M"
8 | output_file: ???
9 | # length_file: ???
10 | prompt_file: ???
11 | max_length: 2048
12 | num_prompts: -1
13 | data_num: -1
14 | task_name: ???
15 | gen: True
16 | order: ascending
17 | #template_idx: ???
18 | # model_name: 'google/t5-v1_1-small'
19 | dataset_reader:
20 | _target_: src.dataset_readers.few_shot_dsr.FewShotDatasetReader
21 | model_name: ${model_name}
22 | task_name: ${task_name}
23 | # _target_: src.dataset_readers.tasks.break_task.BreakTask
24 | prompt_file: ${prompt_file}
25 | # length_file: ${length_file}
26 | num_prompts: ${num_prompts}
27 | gen: ${gen}
28 | data_num: ${data_num}
29 | order: ${order}
30 | # template_idx: ${template_idx}
31 |
32 | model:
33 | _target_: src.models.model.get_model
34 | # _target_: transformers.AutoModelForCausalLM.from_pretrained
35 | pretrained_model_name_or_path: ${model_name}
36 |
37 |
38 |
--------------------------------------------------------------------------------
/break_evaluator/tmp/results/question_decomp_summary.txt:
--------------------------------------------------------------------------------
1 | overall scores:
2 | ged score: mean 0.333 max 1.056 min 0.000
3 | normalized_exact_match score: mean 0.256 max 1.000 min 0.000
4 | skipped 71 examples when computing ged.
5 | ged normalized_exact_match
6 | dataset
7 | ACADEMIC 0.288 0.400
8 | ATIS 0.233 0.304
9 | CLEVR 0.368 0.311
10 | COMQA 0.218 0.419
11 | CWQ 0.393 0.113
12 | DROP 0.340 0.213
13 | GEO 0.311 0.250
14 | NLVR2 0.350 0.277
15 | SPIDER 0.334 0.158
16 | ged normalized_exact_match
17 | num_steps
18 | 1 0.875 0.000
19 | 2 0.250 0.424
20 | 3 0.273 0.341
21 | 4 0.336 0.204
22 | 5 0.338 0.232
23 | 6 0.432 0.120
24 | 7 0.339 0.257
25 | 8 0.529 0.147
26 | 9 0.437 0.250
27 | 10 0.416 0.273
28 | 11 0.535 0.188
29 | 12 0.166 0.267
30 | 13 0.840 0.000
31 | 14 0.709 0.000
32 | 15 0.000 1.000
33 | 16 NaN 0.000
34 | 18 0.872 0.000
35 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "break-evaluator"
3 | version = "0.1.0"
4 | description = "Forked from https://github.com/allenai/break-evaluator"
5 | authors = []
6 | packages = [
7 | { include = "evaluation" },
8 | { include = "scripts" },
9 | { include = "utils" },
10 | ]
11 |
12 | [tool.poetry.dependencies]
13 | python = "~3.7.0"
14 | edit-distance = "1.0.4"
15 | editdistance = "0.5.3"
16 | matplotlib = "3.1.2"
17 | networkx = "2.4"
18 | neuralcoref = "4.0"
19 | overrides = "2.8.0"
20 | pandas = "0.25.3"
21 | lxml = "4.5.0"
22 | progressbar = "2.5"
23 | scipy = "1.4.1"
24 | spacy = "2.1.9"
25 | en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz"}
26 |
27 | [tool.poetry.dev-dependencies]
28 |
29 | [tool.poetry.scripts]
30 | break_evaluate_predictions = "scripts.evaluate_predictions:real_main"
31 | break_qdmr_to_program = "scripts.qdmr_to_program:main"
32 |
33 | [build-system]
34 | requires = ["poetry-core>=1.0.0"]
35 | build-backend = "poetry.core.masonry.api"
36 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/to_sequential_ids_collapser.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from overrides import overrides
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies
5 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser
6 |
7 |
8 | class ToSequentialIdsCollapser(BaseCollapser):
9 | @overrides
10 |     def collapse(self, spans_dependencies: SpansDependencies, decomposition: str = None) -> None:
11 | pass
12 |
13 | @overrides
14 | def unwind(self, spans_dependencies: SpansDependencies) -> None:
15 | dependencies_graph = spans_dependencies._dependencies_graph
16 | # fix steps ids
17 | relabel_map = {n_id: i for (n_id, i) in
18 | zip(sorted(dependencies_graph.nodes()), range(1, dependencies_graph.number_of_nodes() + 1))
19 | if n_id != i}
20 | if relabel_map:
21 | nx.relabel_nodes(dependencies_graph, relabel_map, copy=False)
22 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/trie_partial_parse.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import dataclasses
5 | from dataclasses import dataclass
6 | from typing import Optional, Tuple
7 |
8 | import torch
9 |
10 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.util.trie import Trie
11 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.search import PartialParse
12 |
13 |
14 | @dataclass
15 | class TriePartialParse(PartialParse):
16 | trie: Trie[int]
17 | tokens: Tuple[int, ...] = ()
18 |
19 | def allowed_next(
20 | self, ordered_ids: Optional[torch.Tensor] = None, top_k: Optional[int] = None
21 | ) -> Tuple[torch.Tensor, bool]:
22 | allowed, is_complete = self.trie.prefix_next(self.tokens)
23 | return torch.tensor(sorted(allowed), dtype=torch.long), is_complete
24 |
25 | def append(self, token: int) -> "PartialParse":
26 |         """Return a new PartialParse created by appending this token."""
27 | return dataclasses.replace(self, tokens=self.tokens + (token,))
28 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/add_operator_properties_collapser.py:
--------------------------------------------------------------------------------
1 | from overrides import overrides
2 | import re
3 |
4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies
5 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser
6 |
7 |
8 | class AddOperatorsPropertiesCollapser(BaseCollapser):
9 | @overrides
10 |     def collapse(self, spans_dependencies: SpansDependencies, decomposition: str = None) -> None:
11 | for _, _, data in spans_dependencies.dependencies():
12 | if data.properties:
13 | data.dep_type = f'{data.dep_type}[{",".join(data.properties)}]'
14 |
15 | @overrides
16 | def unwind(self, spans_dependencies: SpansDependencies) -> None:
17 | for _, _, data in spans_dependencies.dependencies():
18 | regx = re.search(r'(.*)\[(.+)\]', data.dep_type)
19 | if regx:
20 | dep, prop = regx.groups()
21 | data.dep_type = dep
22 | data.properties = prop.split(",")
23 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/simple_seq2seq_custom.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List
2 | from overrides import overrides
3 |
4 | import torch
5 | from allennlp.data import TextFieldTensors
6 | from allennlp.models import Model
7 | from allennlp.nn import InitializerApplicator
8 | from allennlp_models.generation.models import SimpleSeq2Seq
9 |
10 |
11 | @Model.register("simple_seq2seq_custom")
12 | class SimpleSeq2SeqCustom(SimpleSeq2Seq):
13 | def __init__(self,
14 | initializer: InitializerApplicator = InitializerApplicator(),
15 | **kwargs):
16 | super().__init__(**kwargs)
17 | initializer(self)
18 |
19 | @overrides
20 | def forward(
21 | self,
22 | source_tokens: TextFieldTensors,
23 | target_tokens: TextFieldTensors = None,
24 | metadata: List[Dict[str, Any]] = None,
25 | **kwargs # skip extra fields
26 | ) -> Dict[str, torch.Tensor]:
27 | output_dict = super().forward(source_tokens=source_tokens, target_tokens=target_tokens)
28 | if metadata:
29 | output_dict['metadata'] = metadata
30 | return output_dict
31 |
32 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 AI2
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/e2e.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re, os
3 | import json
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class E2eScorerTask:
8 | name = "e2e"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_E2E")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "Sentence: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Table: "
24 | answer_prefix = "Sentence: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/go.py:
--------------------------------------------------------------------------------
1 |
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import re
4 | import json, os
5 | from src.utils.dataset_utils import load_train_dataset
6 |
7 |
8 | class GoScorerTask:
9 | name = "go"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_Go")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Comment: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = "Code: "
25 | answer_prefix = "Comment: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['target']
29 | test_decomposition = entry['test_target']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/java.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class JavaScorerTask:
8 | name = "java"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_Java")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "Comment: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Code: "
24 | answer_prefix = "Comment: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/pubmed.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class PubmedScorerTask:
8 | name = "pubmed"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_PubMed")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "TL;DR: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = ""
24 | answer_prefix = "TL;DR: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/reddit.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class RedditScorerTask:
8 | name = "reddit"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_Reddit")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "TL;DR: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = ""
24 | answer_prefix = "TL;DR: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/dart.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re, os
3 | import json
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class DartScorerTask:
8 | name = "dart"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_DART")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "Sentence: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Table: "
24 | answer_prefix = "Sentence: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/php.py:
--------------------------------------------------------------------------------
1 |
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import re
4 | import json, os
5 | from src.utils.dataset_utils import load_train_dataset
6 |
7 |
8 | class PhpScorerTask:
9 | name = "php"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_PHP")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Comment: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = "Code: "
25 | answer_prefix = "Comment: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['target']
29 | test_decomposition = entry['test_target']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/python.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class PythonScorerTask:
8 | name = "python"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_Python")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "Comment: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Code: "
24 | answer_prefix = "Comment: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/seq2seq/simple_seq2seq_dynamic_predictor.py:
--------------------------------------------------------------------------------
1 | """
2 | based on allennlp_models/generation/predictors/seq2seq.py
3 | tag: v1.1.0
4 | """
5 |
6 | from overrides import overrides
7 |
8 | from allennlp.common.util import JsonDict
9 | from allennlp.data import Instance
10 | from allennlp.predictors.predictor import Predictor
11 |
12 |
13 | @Predictor.register('seq2seq_dynamic')
14 | class Seq2SeqDynamicPredictor(Predictor):
15 | """
16 | Predictor for sequence to sequence models, including
17 | [`SimpleSeq2SeqDynamic`](../models/seq2seq/simple_seq2seq_dynamic.md) and
18 | [`CopyNetSeq2SeqDynamic`](../models/seq2seq/copynet_seq2seq_dynamic.md).
19 | """
20 |
21 | def predict(self, source: str, allowed_tokens: str) -> JsonDict:
22 | return self.predict_json({"source": source, "allowed_tokens": allowed_tokens})
23 |
24 | @overrides
25 | def _json_to_instance(self, json_dict: JsonDict) -> Instance:
26 | """
27 |         Expects JSON that looks like `{"source": "...", "allowed_tokens": "..."}`.
28 | """
29 | source = json_dict["source"]
30 | allowed_tokens = json_dict["allowed_tokens"]
31 | return self._dataset_reader.text_to_instance(source, allowed_tokens)
32 |
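A hedged usage sketch for this predictor (the archive path and both inputs are placeholders, and it assumes that importing the qdecomp_nlp package registers the 'seq2seq_dynamic' predictor; Predictor.from_path is standard AllenNLP API):

    # Hypothetical usage; "model.tar.gz" and the inputs are placeholders.
    from allennlp.predictors.predictor import Predictor
    import qdecomp_nlp  # assumed to register the 'seq2seq_dynamic' predictor

    predictor = Predictor.from_path("model.tar.gz", predictor_name="seq2seq_dynamic")
    output = predictor.predict(
        source="return me the cheapest flights from Denver",
        allowed_tokens="return flights from Denver cheapest #1",
    )
    print(sorted(output.keys()))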
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cnndailymail.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class CNNDailyMailScorerTask:
8 | name = "cnndailymail"
9 | question_field = "article"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_CNNDailyMail")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "TL;DR: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = ""
24 | answer_prefix = "TL;DR: "
25 | test_question = question_prefix + entry['test_article']
26 | question = question_prefix + entry['article']
27 | decomposition = answer_prefix + entry['highlights']
28 | test_decomposition = entry['test_highlights']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/copa.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class CopaScorerTask:
9 | name = "copa"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_COPA")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Answer: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = "Answer: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['label']
29 | test_decomposition = entry['test_label']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cr.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class CRScorerTask:
9 | name = "cr"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_CR")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('cr', 1, entry['label'])
29 | test_decomposition = get_one_prompt('cr', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
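For the classification scorer tasks (cr, mr, rte, and the others below), the demonstration target is not a raw field but a verbalization of the label produced by Channel_LM_Prompting's get_one_prompt(task, template_idx, label). A small sketch with a stand-in verbalizer; the template strings here are assumptions for illustration, not the library's actual output:

    # Stand-in for get_one_prompt; the verbalizations below are made up.
    def fake_one_prompt(task: str, template_idx: int, label) -> str:
        templates = {"cr": {0: "It was terrible.", 1: "It was great."}}
        return templates[task][int(label)]

    entry = {"sentence": "the battery life is outstanding", "label": 1,
             "test_sentence": "the screen cracked after a week", "test_label": 0}

    question = entry["sentence"]                               # candidate input
    decomposition = fake_one_prompt("cr", 1, entry["label"])   # verbalized label
    test_question = entry["test_sentence"]
    test_decomposition = fake_one_prompt("cr", 1, entry["test_label"])
    print(question, "->", decomposition)
    print(test_question, "->", test_decomposition)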
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cs_valid.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class CSValidScorerTask:
9 | name = "cs_valid"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_ComV")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Answer: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = "Answer: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['label']
29 | test_decomposition = entry['test_label']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/mr.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class MRScorerTask:
9 | name = "mr"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_MR")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('mr', 1, entry['label'])
29 | test_decomposition = get_one_prompt('mr', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cs_explan.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class CSExplanScorerTask:
9 | name = "cs_explan"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_ComE")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Answer: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = "Answer: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['label']
29 | test_decomposition = entry['test_label']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/rte.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class RTEScorerTask:
9 | name = "rte"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_RTE")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('rte', 0, entry['label'])
29 | test_decomposition = get_one_prompt('rte', 0, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cola.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class ColaScorerTask:
9 | name = "cola"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_COLA")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('cola', 1, entry['label'])
29 | test_decomposition = get_one_prompt('cola', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/cosmos_qa.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class CosmosQaScorerTask:
9 | name = "cosmos_qa"
10 | question_field = "question"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_CosmosQA")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = "Answer: "
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = "Answer: "
26 | test_question = question_prefix + entry['test_question']
27 | question = question_prefix + entry['question']
28 | decomposition = answer_prefix + entry['label']
29 | test_decomposition = entry['test_label']
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/mnli.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class MNLIScorerTask:
9 | name = "mnli"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_MNLI")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('mnli', 0, entry['label'])
29 | test_decomposition = get_one_prompt('mnli', 0, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/snli.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class SNLIScorerTask:
9 | name = "snli"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_SNLI")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('snli', 0, entry['label'])
29 | test_decomposition = get_one_prompt('snli', 0, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/subj.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class SubjScorerTask:
9 | name = "subj"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_Subj")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('subj', 2, entry['label'])
29 | test_decomposition = get_one_prompt('subj', 2, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/trec.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class TrecScorerTask:
9 | name = "trec"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_TREC")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('trec', 3, entry['label'])
29 | test_decomposition = get_one_prompt('trec', 3, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/sst2.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class SST2ScorerTask:
9 | name = "sst2"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_SST-2")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('sst2', 1, entry['label'])
29 | test_decomposition = get_one_prompt('sst2', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/sst5.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class SST5ScorerTask:
9 | name = "sst5"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_SST-5")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('sst5', 1, entry['label'])
29 | test_decomposition = get_one_prompt('sst5', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/DPR/conf/datasets/retriever_default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | qa_epr:
3 | _target_: dpr.data.retriever_data.EPRQASrc
4 | dataset_split: ???
5 | task_name: ???
6 |
7 | nq_test:
8 | _target_: dpr.data.retriever_data.CsvQASrc
9 | file: data.retriever.qas.nq-test
10 |
11 | nq_train:
12 | _target_: dpr.data.retriever_data.CsvQASrc
13 | file: data.retriever.qas.nq-train
14 |
15 | nq_dev:
16 | _target_: dpr.data.retriever_data.CsvQASrc
17 | file: data.retriever.qas.nq-dev
18 |
19 | trivia_test:
20 | _target_: dpr.data.retriever_data.CsvQASrc
21 | file: data.retriever.qas.trivia-test
22 |
23 | trivia_train:
24 | _target_: dpr.data.retriever_data.CsvQASrc
25 | file: data.retriever.qas.trivia-train
26 |
27 | trivia_dev:
28 | _target_: dpr.data.retriever_data.CsvQASrc
29 | file: data.retriever.qas.trivia-dev
30 |
31 | grailqa_dev:
32 | _target_: dpr.data.retriever_data.CsvQASrc
33 | file: data.retriever.qas.grailqa-dev
34 |
35 | grailqa_train:
36 | _target_: dpr.data.retriever_data.CsvQASrc
37 | file: data.retriever.qas.grailqa-train
38 |
39 | webq_test:
40 | _target_: dpr.data.retriever_data.CsvQASrc
41 | file: data.retriever.qas.webq-test
42 |
43 | curatedtrec_test:
44 | _target_: dpr.data.retriever_data.CsvQASrc
45 | file: data.retriever.qas.curatedtrec-test
46 |
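Each entry in this file is a Hydra object config: _target_ names the class to build and the remaining keys become constructor arguments, with ??? marking values that must be supplied on the command line. A hedged sketch of how one entry is turned into an object (it assumes the dpr package is importable; hydra.utils.instantiate is standard Hydra API):

    # Sketch only: builds the nq_test source by hand instead of through the
    # composed config; assumes dpr.data.retriever_data is importable.
    import hydra
    from omegaconf import OmegaConf

    ds_cfg = OmegaConf.create({
        "_target_": "dpr.data.retriever_data.CsvQASrc",
        "file": "data.retriever.qas.nq-test",
    })
    qa_src = hydra.utils.instantiate(ds_cfg)  # equivalent to CsvQASrc(file=...)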
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/agnews.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class AgnewsScorerTask:
9 | name = "agnews"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_AGNews")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('agnews', 0, entry['label'])
29 | test_decomposition = get_one_prompt('agnews', 0, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/amazon.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class AmazonScorerTask:
9 | name = "amazon"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_Amazon")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('amazon', 1, entry['label'])
29 | test_decomposition = get_one_prompt('amazon', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/yahoo.py:
--------------------------------------------------------------------------------
1 |
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import re
4 | import json, os
5 | from src.utils.dataset_utils import load_train_dataset
6 | from Channel_LM_Prompting.util import get_one_prompt
7 |
8 |
9 | class YahooScorerTask:
10 | name = "yahoo"
11 | question_field = "sentence"
12 | prompt_field = "ctxs"
13 |
14 | def __init__(self, example_file, ds_size=None) -> None:
15 | dataset = load_dataset("KaiLv/UDR_Yahoo")
16 |
17 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
18 | self.training_dataset = list(enumerate(self.hf_dataset))
19 | self.example_file = example_file
20 | with open(self.example_file) as f:
21 | self.data = json.load(f)
22 | self.postfix = ""
23 |
24 | def get_fields(self, entry, index=-1):
25 | question_prefix = ""
26 | answer_prefix = ""
27 | test_question = question_prefix + entry['test_sentence']
28 | question = question_prefix + entry['sentence']
29 | decomposition = get_one_prompt('yahoo', 0, entry['label'])
30 | test_decomposition = get_one_prompt('yahoo', 0, entry['test_label'])
31 | return question, decomposition, test_question, test_decomposition
32 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/roc_ending_generation.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class RocEndingGenerationScorerTask:
8 | name = "roc_ending_generation"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_RocEnding")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "End of the story: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Beginning of the story: "
24 | answer_prefix = "End of the story: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/roc_story_generation.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class RocStoryGenerationScorerTask:
8 | name = "roc_story_generation"
9 | question_field = "question"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_RocStory")
14 |
15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
16 | self.training_dataset = list(enumerate(self.hf_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 | self.postfix = "Rest of the story: "
21 |
22 | def get_fields(self, entry, index=-1):
23 | question_prefix = "Beginning of the story: "
24 | answer_prefix = "Rest of the story: "
25 | test_question = question_prefix + entry['test_question']
26 | question = question_prefix + entry['question']
27 | decomposition = answer_prefix + entry['target']
28 | test_decomposition = entry['test_target']
29 | return question, decomposition, test_question, test_decomposition
30 |
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/NOTICE.md:
--------------------------------------------------------------------------------
1 | # NOTICES
2 |
3 | This repository incorporates material as listed below or described in the code.
4 |
5 | ## break-evaluator
6 |
7 | MIT License
8 |
9 | Copyright (c) 2021 AI2
10 |
11 | Permission is hereby granted, free of charge, to any person obtaining a copy
12 | of this software and associated documentation files (the "Software"), to deal
13 | in the Software without restriction, including without limitation the rights
14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 | copies of the Software, and to permit persons to whom the Software is
16 | furnished to do so, subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be included in all
19 | copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 | SOFTWARE.
28 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/dbpedia.py:
--------------------------------------------------------------------------------
1 |
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import re
4 | import json, os
5 | from src.utils.dataset_utils import load_train_dataset
6 | from Channel_LM_Prompting.util import get_one_prompt
7 |
8 |
9 | class DbpediaScorerTask:
10 | name = "dbpedia"
11 | question_field = "sentence"
12 | prompt_field = "ctxs"
13 |
14 | def __init__(self, example_file, ds_size=None) -> None:
15 | dataset = load_dataset("KaiLv/UDR_DBPedia")
16 |
17 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
18 | self.training_dataset = list(enumerate(self.hf_dataset))
19 | self.example_file = example_file
20 | with open(self.example_file) as f:
21 | self.data = json.load(f)
22 | self.postfix = ""
23 |
24 | def get_fields(self, entry, index=-1):
25 | question_prefix = ""
26 | answer_prefix = ""
27 | test_question = question_prefix + entry['test_sentence']
28 | question = question_prefix + entry['sentence']
29 | decomposition = get_one_prompt('dbpedia', 0, entry['label'])
30 | test_decomposition = get_one_prompt('dbpedia', 0, entry['test_label'])
31 | return question, decomposition, test_question, test_decomposition
32 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/yelp_full.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json, os
4 | from src.utils.dataset_utils import load_train_dataset
5 | from Channel_LM_Prompting.util import get_one_prompt
6 |
7 |
8 | class YelpFullScorerTask:
9 | name = "yelp_full"
10 | question_field = "sentence"
11 | prompt_field = "ctxs"
12 |
13 | def __init__(self, example_file, ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_Yelp")
15 |
16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
17 | self.training_dataset = list(enumerate(self.hf_dataset))
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | self.postfix = ""
22 |
23 | def get_fields(self, entry, index=-1):
24 | question_prefix = ""
25 | answer_prefix = ""
26 | test_question = question_prefix + entry['test_sentence']
27 | question = question_prefix + entry['sentence']
28 | decomposition = get_one_prompt('yelp_full', 1, entry['label'])
29 | test_decomposition = get_one_prompt('yelp_full', 1, entry['test_label'])
30 | return question, decomposition, test_question, test_decomposition
31 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/mtop.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, load_from_disk
2 | from src.utils.dataset_utils import load_train_dataset
3 |
4 | import re, os
5 | import json
6 |
7 |
8 | class MtopScorerTask:
9 | name = "mtop"
10 | prompt_field = "ctxs"
11 | question_field = "question"
12 | def __init__(self,example_file,ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_MTOP")
14 | self.hf_dataset = load_train_dataset(dataset,size=ds_size)
15 | self.training_dataset = list(enumerate(self.hf_dataset))
16 |
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 |
21 |
22 | def get_fields(self, entry,index=-1):
23 | test_question = entry['test_question']
24 | question = entry['question']
25 | logical_form = entry['logical_form']
26 | test_logical_form = entry['test_logical_form']
27 | return question,logical_form,test_question,test_logical_form
28 |
29 |
30 | @classmethod
31 | def remove_double_space(cls,string):
32 | return re.sub("[ ]{2,}", " ", string)
33 | @classmethod
34 | def reformat(cls,text):
35 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))])
36 |
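The two classmethods are plain string helpers; a quick, self-contained illustration of their behavior on a made-up input:

    # Made-up input showing the helpers defined on MtopScorerTask.
    import re

    def remove_double_space(string):
        return re.sub("[ ]{2,}", " ", string)

    def reformat(text):
        return " ".join(f"{i+1}#) {x.strip()}" for i, x in enumerate(text.split(";")))

    print(remove_double_space("set  an   alarm"))
    # -> 'set an alarm'
    print(reformat("find my alarms ; delete the first one"))
    # -> '1#) find my alarms 2#) delete the first one'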
--------------------------------------------------------------------------------
/DPR/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 |
8 | from setuptools import setup
9 |
10 | with open("README.md") as f:
11 | readme = f.read()
12 |
13 | setup(
14 | name="dpr",
15 | version="1.0.0",
16 | description="Facebook AI Research Open Domain Q&A Toolkit",
17 | url="https://github.com/facebookresearch/DPR/",
18 | classifiers=[
19 | "Intended Audience :: Science/Research",
20 | "License :: OSI Approved :: MIT License",
21 | "Programming Language :: Python :: 3.6",
22 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
23 | ],
24 | long_description=readme,
25 | long_description_content_type="text/markdown",
26 | setup_requires=[
27 | "setuptools>=18.0",
28 | ],
29 | install_requires=[
30 | "faiss-cpu>=1.6.1",
31 | "jsonlines",
32 | "filelock",
33 | "numpy",
34 | "regex",
35 | "torch>=1.5.0",
36 | "transformers>=3.0.0,<3.1.0",
37 | "tqdm>=4.27",
38 | "wget",
39 | "spacy>=2.1.8",
40 | "hydra-core>=1.0.0",
41 | "omegaconf>=2.0.1",
42 | ],
43 | )
44 |
--------------------------------------------------------------------------------
/DPR/conf/biencoder_train_cfg.yaml:
--------------------------------------------------------------------------------
1 |
2 | # configuration groups
3 | defaults:
4 | - encoder: hf_bert
5 | - train: biencoder_default
6 | - datasets: encoder_train_default
7 |
8 | train_datasets:
9 | dev_datasets:
10 | output_dir:
11 | train_sampling_rates:
12 | loss_scale_factors:
13 | loss_type:
14 | rank_loss_factor:
15 | rank_loss_top_sample:
16 |
17 | # Whether to lower case the input text. Set True for uncased models, False for the cased ones.
18 | do_lower_case: True
19 |
20 | fix_ctx_encoder: False
21 | val_av_rank_start_epoch: 30
22 | seed: 12345
23 | checkpoint_file_name: dpr_biencoder
24 |
25 | # A trained bi-encoder checkpoint file to initialize the model
26 | model_file:
27 |
28 | # TODO: move to a conf group
29 | # local_rank for distributed training on gpus
30 | local_rank: -1
31 | global_loss_buf_sz: 592000
32 | device:
33 | distributed_world_size:
34 | distributed_port:
35 | no_cuda: False
36 | n_gpu:
37 | fp16: False
38 |
39 | # For fp16: Apex AMP optimization level selected from ['O0', 'O1', 'O2', 'O3'].
40 | # See details at https://nvidia.github.io/apex/amp.html
41 | fp16_opt_level: O1
42 |
43 | # tokens which won't be split by the tokenizer
44 | special_tokens:
45 |
46 | ignore_checkpoint_offset: False
47 | ignore_checkpoint_optimizer: False
48 |
49 | # set to True to enable multiple query encoders
50 | multi_q_encoder: False
51 |
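A hedged sketch of a Hydra entry point consuming this config (the script below is hypothetical and stands in for DPR's actual training script; the option names come from the file above):

    # Hypothetical entry point; reads conf/biencoder_train_cfg.yaml via Hydra.
    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="conf", config_name="biencoder_train_cfg")
    def main(cfg: DictConfig) -> None:
        print(cfg.train_datasets, cfg.do_lower_case, cfg.fp16)

    if __name__ == "__main__":
        main()

Command-line overrides then work as usual, e.g. appending output_dir=... fp16=True to the launch command.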
--------------------------------------------------------------------------------
/DPR/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to DPR
2 | We want to make contributing to this project as easy and transparent as
3 | possible.
4 |
5 | ## Our Development Process
6 | TBD
7 |
8 | ## Pull Requests
9 | We actively welcome your pull requests.
10 |
11 | 1. Fork the repo and create your branch from `master`.
12 | 2. If you've added code that should be tested, add tests.
13 | 3. If you've changed APIs, update the documentation.
14 | 4. Ensure the test suite passes.
15 | 5. Make sure your code lints.
16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
17 |
18 | ## Contributor License Agreement ("CLA")
19 | In order to accept your pull request, we need you to submit a CLA. You only need
20 | to do this once to work on any of Facebook's open source projects.
21 |
22 | Complete your CLA here: <https://code.facebook.com/cla>
23 |
24 | ## Issues
25 | We use GitHub issues to track public bugs. Please ensure your description is
26 | clear and has sufficient instructions to be able to reproduce the issue.
27 |
28 | ## Coding Style
29 | * 2 spaces for indentation rather than tabs
30 | * 120 character line length
31 | * ...
32 |
33 | ## License
34 | By contributing to Facebook AI Research Dense Passage Retriever toolkit, you agree that your contributions will be licensed
35 | under the LICENSE file in the root directory of this source tree.
--------------------------------------------------------------------------------
/DPR/conf/gen_embs.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - encoder: hf_bert
3 | - ctx_sources: default_sources
4 |
5 | # A trained bi-encoder checkpoint file to initialize the model
6 | model_file:
7 |
8 | # Name of the all-passages resource
9 | ctx_src:
10 |
11 | # which encoder (ctx or query) to use for embedding generation
12 | encoder_type: ctx
13 |
14 | # output .tsv file path to write results to
15 | out_file:
16 |
17 | # Whether to lower case the input text. Set True for uncased models, False for the cased ones.
18 | do_lower_case: True
19 |
20 | # Number (0-based) of the data shard to process
21 | shard_id: 0
22 |
23 | # Total number of data shards
24 | num_shards: 1
25 |
26 | # Batch size for the passage encoder forward pass (works in DataParallel mode)
27 | batch_size: 512
28 |
29 | tables_as_passages: False
30 |
31 | # tokens which won't be split by the tokenizer
32 | special_tokens:
33 |
34 | tables_chunk_sz: 100
35 |
36 | # TODO
37 | tables_split_type: type1
38 |
39 |
40 | # TODO: move to a conf group
41 | # local_rank for distributed training on gpus
42 | local_rank: -1
43 | device:
44 | distributed_world_size:
45 | distributed_port:
46 | no_cuda: False
47 | n_gpu:
48 | fp16: False
49 |
50 | # For fp16: Apex AMP optimization level selected from ['O0', 'O1', 'O2', 'O3'].
51 | # See details at https://nvidia.github.io/apex/amp.html
52 | fp16_opt_level: O1
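shard_id and num_shards split the passage collection across embedding processes; a small sketch of the arithmetic (the even-split formula is an assumption for illustration, not necessarily DPR's exact scheme):

    # Illustrative sharding by shard_id / num_shards.
    def shard_bounds(total: int, shard_id: int, num_shards: int):
        per_shard = -(-total // num_shards)   # ceiling division
        start = shard_id * per_shard
        return start, min(start + per_shard, total)

    print(shard_bounds(1000, shard_id=0, num_shards=4))  # (0, 250)
    print(shard_bounds(1000, shard_id=3, num_shards=4))  # (750, 1000)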
--------------------------------------------------------------------------------
/DPR/dpr/utils/conf_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import hydra
4 | from omegaconf import DictConfig
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | class BiencoderDatasetsCfg(object):
10 | def __init__(self, cfg: DictConfig):
11 | # print('cfg in BiencoderDatasetsCfg')
12 | # for k,v in cfg.items():
13 | # logger.info('{}:{}'.format(k,v))
14 | # exit()
15 | datasets = cfg.datasets
16 | self.train_datasets_names = cfg.train_datasets
17 | logger.info("train_datasets: %s", self.train_datasets_names)
18 | # print(datasets)
19 | if self.train_datasets_names:
20 | self.train_datasets = [
21 | ]
22 | for ds_name in self.train_datasets_names:
23 | datasets[ds_name]['loss_type'] = cfg.loss_type
24 | tmp_dataset = hydra.utils.instantiate(datasets[ds_name])
25 | self.train_datasets.append(tmp_dataset)
26 |
27 | else:
28 | self.train_datasets = []
29 | if cfg.dev_datasets:
30 | self.dev_datasets_names = cfg.dev_datasets
31 | logger.info("dev_datasets: %s", self.dev_datasets_names)
32 | self.dev_datasets = [
33 | hydra.utils.instantiate(datasets[ds_name])
34 | for ds_name in self.dev_datasets_names
35 | ]
36 | self.sampling_rates = cfg.train_sampling_rates
37 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/tune/studies/biaffine-graph-parser--transformer-encoder.py:
--------------------------------------------------------------------------------
1 | import optuna
2 | import os
3 |
4 | from optuna.study import StudyDirection
5 |
6 |
7 | def get_experiment():
8 | config_file = "scripts/tune/experiments/biaffine-graph-parser--transformer-encoder.jsonnet"
9 | metrics = "best_validation_logical_form_em"
10 | direction = StudyDirection.MAXIMIZE
11 | return config_file, metrics, direction
12 |
13 |
14 | def get_constants():
15 | return [
16 | ("transformer_model", "bert-base-uncased"),
17 | ("max_length", 128),
18 | ("transformer_dim", 768),
19 |
20 | ("arc_tags_only", "false"),
21 | ("multi_label", "false"),
22 |
23 | ("pos_embedding_dim", 100),
24 | ("tag_representation_dim", 100)
25 | ]
26 |
27 |
28 | def set_parameters(trial: optuna.Trial):
29 | # hyper parameters
30 | trial.suggest_float("input_dropout", 0.0, 0.8, step=0.1)
31 | trial.suggest_float("dropout", 0.0, 0.8, step=0.1)
32 | trial.suggest_int("arc_representation_dim", 300, 700, step=100)
33 | trial.suggest_int("arc_num_layers", 1, 3)
34 | trial.suggest_int("tag_num_layers", 1, 3)
35 |
36 | trial.suggest_categorical("lr", [1e-4, 1e-3, 1e-2, 1e-1])
37 | # trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5])
38 | trial.suggest_categorical("transformer_lr", [3e-5, 5e-5, 7e-5])
39 | trial.suggest_categorical("seed", [24, 42, 64])
40 |
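The study modules expose get_experiment / get_constants / set_parameters; a hedged sketch of a driver that would consume them (the objective below is a stub, whereas the real one would train the AllenNLP model described by the referenced jsonnet config and report the tracked validation metric):

    # Hypothetical driver; the objective is a stub.
    import optuna
    from optuna.study import StudyDirection

    def run_study(study_module, n_trials: int = 20) -> optuna.Study:
        config_file, metric, direction = study_module.get_experiment()
        constants = dict(study_module.get_constants())

        def objective(trial: optuna.Trial) -> float:
            study_module.set_parameters(trial)   # registers the search space
            # ... train with config_file, constants, and trial.params,
            # then return the validation metric; stubbed out here.
            return 0.0

        study = optuna.create_study(
            direction="maximize" if direction == StudyDirection.MAXIMIZE else "minimize"
        )
        study.optimize(objective, n_trials=n_trials)
        return study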
--------------------------------------------------------------------------------
/scripts/find_bm25.sh:
--------------------------------------------------------------------------------
1 | # Before running this script, you should:
2 | # 1. download elasticsearch-7.9.1
3 | # 2. run ES_JAVA_OPTS="-Xms31g -Xmx31g" ./elasticsearch-7.9.1/bin/elasticsearch to start Elasticsearch
4 | datasets_full=("agnews" "amazon" "break" "cola" "common_gen" \
5 | "copa" "cosmos_qa" "cr" "cs_explan" "cs_valid" "dart" "dbpedia" \
6 | "e2e" "mr" "mtop" "pubmed" "reddit" "roc_ending_generation" "roc_story_generation" \
7 | "rte" "smcalflow" "sst2" "sst5" "subj" "trec" "yahoo" "yelp_full")
8 | datasets_sampled=("cnndailymail" "go" "java" "mnli" "php" "python" "snli" "wikiauto")
9 |
10 | if [ ! -d "$PWD/data_bm25" ]; then
11 | mkdir "$PWD/data_bm25"
12 | fi
13 |
14 | for train_set in "train" "debug"; do
15 | if [ "$train_set" == "train" ]; then
16 | datasets=("${datasets_full[@]}")
17 | else
18 | datasets=("${datasets_sampled[@]}")
19 | fi
20 |
21 | for dataset in "${datasets[@]}"; do
22 | find_bm25_py_output_path="$PWD/data_bm25/${dataset}_${train_set}.json"
23 | echo -e "\n\n-find_bm25 ${dataset}-\n\n"
24 | if [ ! -f "${find_bm25_py_output_path}" ]; then
25 | HYDRA_FULL_ERROR=1 \
26 | python find_bm25_es.py \
27 | output_path="$find_bm25_py_output_path" \
28 | dataset_split=${train_set} \
29 | setup_type="a" \
30 | task_name=${dataset} \
31 | +ds_size=null \
32 |                 L=50 \
33 | hydra.run.dir="$PWD/exps/find_bm25/${dataset}/logs"
34 | fi
35 | done
36 | done
--------------------------------------------------------------------------------
/scripts/score_bm25.sh:
--------------------------------------------------------------------------------
1 | main_process_port=$((RANDOM % 5001 + 25000))
2 | cvd=0,1,2,3,4,5,6,7
3 | num_gpus=8
4 |
5 | datasets_full=("agnews" "amazon" "break" "cola" "common_gen" \
6 | "copa" "cosmos_qa" "cr" "cs_explan" "cs_valid" "dart" "dbpedia" \
7 | "e2e" "mr" "mtop" "pubmed" "reddit" "roc_ending_generation" "roc_story_generation" \
8 | "rte" "smcalflow" "sst2" "sst5" "subj" "trec" "yahoo" "yelp_full")
9 | datasets_sampled=("cnndailymail" "go" "java" "mnli" "php" "python" "snli" "wikiauto")
10 |
11 | if [ ! -d "$PWD/data_score" ]; then
12 | mkdir "$PWD/data_score"
13 | fi
14 |
15 | for train_set in "train" "debug"; do
16 | if [ "$train_set" == "train" ]; then
17 | datasets=("${datasets_full[@]}")
18 | else
19 | datasets=("${datasets_sampled[@]}")
20 | fi
21 |
22 | for dataset in "${datasets[@]}"; do
23 | echo -e "\n\n-------score ${dataset}-------\n\n"
24 | scorer_py_output_path="$PWD/data_score/${dataset}_bm25.json"
25 | if [ ! -f "$scorer_py_output_path" ]; then
26 | CUDA_VISIBLE_DEVICES=$cvd \
27 | accelerate launch --num_processes $num_gpus --main_process_port ${main_process_port} --multi_gpu \
28 | scorer.py \
29 | example_file="$PWD/data_bm25/${dataset}_${train_set}.json" \
30 | setup_type=qa \
31 | output_file="$scorer_py_output_path" \
32 | batch_size=20 \
33 | +task_name=$dataset +dataset_reader.ds_size=null \
34 | hydra.run.dir="$PWD/exps/score_bm25/${dataset}/logs"
35 | fi
36 | done
37 | done
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/common_gen.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class CommonGenScorerTask:
8 | name = "common_gen"
9 | question_field = "joined_concepts"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | dataset = load_dataset("KaiLv/UDR_CommonGen")
14 |
15 | # if 'q' in example_file.split('/')[-1]:
16 | # self.hf_dataset = dataset['train_dedup']
17 | # elif 'a' in example_file.split('/')[-1]:
18 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
19 |
20 | self.training_dataset = list(enumerate(self.hf_dataset))
21 | self.example_file = example_file
22 | with open(self.example_file) as f:
23 | self.data = json.load(f)
24 | self.postfix = "Generated sentence: "
25 |
26 | def get_fields(self, entry, index=-1):
27 | question_prefix = "Generate a sentence using these concepts: "
28 | answer_prefix = "Generated sentence: "
29 | test_question = question_prefix + entry['test_joined_concepts']
30 | question = question_prefix + entry['joined_concepts']
31 | decomposition = answer_prefix + entry['target']
32 | test_decomposition = entry['test_target']
33 | return question, decomposition, test_question, test_decomposition
34 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/custom_copynet_seq2seq_for_soft_rat.py:
--------------------------------------------------------------------------------
1 | """
2 | Based on custom_copynet_seq2seq.py
3 | """
4 | import logging
5 | from typing import Dict
6 |
7 | from overrides import overrides
8 | import torch
9 |
10 | from allennlp.models.model import Model
11 | from allennlp.nn import util
12 |
13 | from qdecomp_nlp.models.seq2seq.custom_copynet_seq2seq import CustomCopyNetSeq2Seq
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @Model.register("custom_copynet_seq2seq_for_soft_rat")
19 | class CustomCopyNetForRatSeq2Seq(CustomCopyNetSeq2Seq):
20 | @overrides
21 | def _encode(self, source_tokens: Dict[str, torch.Tensor], relations_probs: torch.Tensor) -> Dict[str, torch.Tensor]:
22 | """
23 | Encode source input sentences.
24 | """
25 | # shape: (batch_size, source_sequence_length, encoder_input_dim)
26 | embedded_input = self._source_embedder(source_tokens)
27 | # shape: (batch_size, source_sequence_length)
28 | source_mask = util.get_text_field_mask(source_tokens)
29 | # shape: (batch_size, source_sequence_length, source_sequence_length)
30 | relations_mask = source_mask.unsqueeze(-1)*source_mask.unsqueeze(-2)
31 | # shape: (batch_size, source_sequence_length, encoder_output_dim)
32 | encoder_outputs = self._encoder(embedded_input, relations_probs, relations_mask)
33 | return {"source_mask": source_mask, "encoder_outputs": encoder_outputs}
34 |
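The pairwise relations_mask above is just the outer product of the token mask with itself; a toy check of the shapes, independent of the model:

    # Toy check of the relations_mask construction used in _encode().
    import torch

    source_mask = torch.tensor([[1, 1, 1, 0],
                                [1, 1, 0, 0]], dtype=torch.bool)
    relations_mask = source_mask.unsqueeze(-1) * source_mask.unsqueeze(-2)
    print(relations_mask.shape)       # torch.Size([2, 4, 4])
    print(relations_mask[1].int())    # 1 only where tokens i and j are both real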
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/break.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, load_from_disk
2 | import re, os
3 | import json
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class BreakScorerTask:
8 | name = "break"
9 | question_field = "question_text"
10 | dataset_name = "break_data"
11 | split = "QDMR"
12 | prompt_field = "near_examples"
13 | def __init__(self,example_file,ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_BREAK")
15 | self.orig_training_dataset = load_train_dataset(dataset,size=ds_size)
16 | self.training_dataset = list(enumerate(self.orig_training_dataset))
17 | self.example_file = example_file
18 | with open(self.example_file) as f:
19 | self.data = json.load(f)
20 |
21 |
22 | def get_fields(self, entry,index=-1):
23 | test_question = self.remove_double_space(entry['test_question_text'])
24 | question = self.remove_double_space(entry['question_text'])
25 | decomposition = self.remove_double_space(self.reformat(entry['decomposition']))
26 | test_decomposition = self.remove_double_space(self.reformat(entry['test_decomposition']))
27 | return question,decomposition,test_question,test_decomposition
28 |
29 | @classmethod
30 | def remove_double_space(cls,string):
31 | return re.sub("[ ]{2,}", " ", string)
32 | @classmethod
33 | def reformat(cls,text):
34 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))])
35 |
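A quick usage sketch (illustrative, not part of the repository): reformat turns a semicolon-separated QDMR decomposition into a single numbered string, and remove_double_space collapses runs of spaces.

    decomposition = "return movies ;return #1 from 1990"
    print(BreakScorerTask.reformat(decomposition))
    # -> "1#) return movies 2#) return #1 from 1990"
    print(BreakScorerTask.remove_double_space("too   many   spaces"))
    # -> "too many spaces"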
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/base_steps_spans_extractor.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 | from typing import List, Tuple
4 |
5 | import spacy
6 | from spacy.tokens.doc import Doc
7 |
8 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans
9 |
10 |
11 | class BaseSpansExtractor(ABC):
12 | def __init__(self, tokens_parser=None):
13 | self._parser = tokens_parser or spacy.load("en_core_web_sm")
14 |
15 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None,
16 | debug: dict = None) -> StepsSpans:
17 | def format(text: str):
18 | return re.sub(r'\s+', ' ', text)
19 |
20 | question_tokens = self._parser(format(question))
21 | decomposition = re.sub(r'#(\d+)', r'@@\g<1>@@', decomposition)
22 | steps_tokens = [self._parser(' '.join(format(x).split(' ')[1:])) for x in decomposition.split(';')]
23 | steps_operators = operators and [QDMROperation(x) for x in operators]
24 | return self._extract(question_id=question_id, question_tokens=question_tokens,
25 | steps_tokens=steps_tokens, steps_operators=steps_operators)
26 |
27 | @abstractmethod
28 | def _extract(self, question_id: str, question_tokens: Doc, steps_tokens: List[Doc], steps_operators: List[QDMROperation] = None,
29 | debug: dict = None) -> StepsSpans:
30 | raise NotImplementedError()
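
A minimal sketch of the step-reference rewriting done in extract() (illustrative, not part of the repository): references such as #1 are wrapped in @@...@@ before the steps are re-tokenized, presumably so downstream processing treats them as single units.

    import re

    decomposition = "return movies ;return #1 from 1990"
    print(re.sub(r'#(\d+)', r'@@\g<1>@@', decomposition))
    # -> "return movies ;return @@1@@ from 1990"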
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/qdecomp_nlp/data/dataset_readers/util.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, List
2 | import csv
3 |
4 | from allennlp.common.checks import ConfigurationError
5 | from allennlp.common.file_utils import cached_path
6 | from allennlp.data.fields import TextField, MetadataField, NamespaceSwappingField
7 | from allennlp.data.instance import Instance
8 |
9 |
10 | def read_break_data(file_path: str, delimiter: str,
11 | text_to_instance: Callable[..., Instance],
12 | args_columns: List[str],
13 | metadata_columns: List[str] = ['question_id'],
14 | quoting: int = csv.QUOTE_MINIMAL):
15 | with open(cached_path(file_path), "r") as data_file:
16 | lines = csv.reader(data_file, delimiter=delimiter, quoting=quoting)
17 | header = next(lines, None)
18 | header_to_index = {x: i for i, x in enumerate(header)}
19 | for line_num, row in enumerate(lines):
20 | if len(row) != len(header):
21 | raise ConfigurationError(
22 | "Invalid line format: %s (line number %d)" % (row, line_num + 1)
23 | )
24 |
25 | instance = text_to_instance(*[row[header_to_index[x]] for x in args_columns if x in header_to_index])
26 | metadata = {x: row[header_to_index[x]] for x in metadata_columns}
27 | if 'metadata' in instance:
28 | metadata.update(instance['metadata'])
29 | instance.add_field('metadata', MetadataField(metadata))
30 | yield instance
31 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/tune/studies/operators-aware-biaffine-graph-parser--transformer-encoder.py:
--------------------------------------------------------------------------------
1 | import optuna
2 | import os
3 |
4 | from optuna.study import StudyDirection
5 |
6 |
7 | def get_experiment():
8 | config_file = "scripts/tune/experiments/operators-aware-biaffine-graph-parser--transformer-encoder.jsonnet"
9 | metrics = "best_validation_logical_form_em"
10 | direction = StudyDirection.MAXIMIZE
11 | return config_file, metrics, direction
12 |
13 |
14 | def get_constants():
15 | return [
16 | ("transformer_model", "bert-base-uncased"),
17 | ("max_length", 128),
18 | ("transformer_dim", 768),
19 |
20 | ("decode_strategy", "operators_mask"),
21 |
22 | ("pos_embedding_dim", 100),
23 | ]
24 |
25 |
26 | def set_parameters(trial: optuna.Trial):
27 | # hyper parameters
28 | trial.suggest_float("input_dropout", 0.0, 0.8, step=0.1)
29 | trial.suggest_float("dropout", 0.0, 0.8, step=0.1)
30 | trial.suggest_int("operator_representation_dim", 100, 700, step=100)
31 | trial.suggest_int("tag_representation_dim", 100, 700, step=100)
32 | trial.suggest_int("operator_ff_num_layers", 1, 3)
33 | trial.suggest_int("tag_ff_num_layers", 1, 3)
34 | trial.suggest_int("operator_embeddings_dim", 0, 300, step=100)
35 |
36 | trial.suggest_categorical("lr", [1e-4, 1e-3, 1e-2, 1e-1])
37 | # trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5])
38 | trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5, 7e-5])
39 | trial.suggest_categorical("seed", [24, 42, 64])
40 |
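A hedged sketch of how a study module like this one could be driven (the real tuning entry point under scripts/tune is not part of this listing, and the objective body below is a placeholder):

    import optuna
    from optuna.study import StudyDirection

    config_file, metric, direction = get_experiment()
    study = optuna.create_study(
        direction="maximize" if direction == StudyDirection.MAXIMIZE else "minimize"
    )

    def objective(trial: optuna.Trial) -> float:
        set_parameters(trial)  # registers the hyper-parameter suggestions defined above
        # ... train from config_file with trial.params and get_constants(), then read the metric ...
        return 0.0  # placeholder for the tracked metric (best_validation_logical_form_em)

    study.optimize(objective, n_trials=20)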
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/config/configuration_loader.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, List
2 | import os
3 | from dataclasses import dataclass
4 | from ast import literal_eval
5 |
6 | from qdecomp_with_dependency_graphs.dependencies_graph.evaluation.spans_dependencies_to_logical_form_tokens import SpansDepToQDMRStepTokensConverter
7 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors import BaseTokensDependenciesExtractor, BaseSpansExtractor, \
8 | BaseTokensDependenciesToQDMRExtractor, BaseCollapser
9 |
10 |
11 | @dataclass
12 | class Configuration:
13 | spans_extractor: BaseSpansExtractor = None
14 | tokens_dependencies_extractor: BaseTokensDependenciesExtractor = None
15 | tokens_dependencies_to_qdmr_extractor: BaseTokensDependenciesToQDMRExtractor = None
16 | spans_dependencies_to_logical_form_converter: SpansDepToQDMRStepTokensConverter = None
17 |
18 |
19 | config: Configuration = Configuration()
20 | _config_str = None
21 | def load(config_file: str):
22 | global config, _config_str
23 | with open(config_file, 'rt') as fp:
24 | _config_str = fp.read()
25 | _locals = {}
26 | exec(_config_str, globals(), _locals)
27 | for attr, value in config.__dict__.items():
28 | config.__setattr__(attr, _locals[attr])
29 |
30 | conf = os.environ.get('DEP_CONF', 'default')
31 | load(f'dependencies_graph/config/config_{conf}.py')
32 |
33 |
34 | def save(dir_path: str):
35 | path = os.path.join(dir_path, 'dependencies_graph_config.py')
36 | with open(path, 'wt') as fp:
37 | fp.write(_config_str)
38 |
--------------------------------------------------------------------------------
/src/utils/cache_util.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pathlib
3 | class BufferedJsonWriter(object):
4 | def __init__(self, file_name, buffer_size=50):
5 | self.file_path = file_name
6 | self.buffer = []
7 | self.buffer_size = buffer_size
8 |
9 | def __enter__(self):
10 | return self
11 |
12 | def __exit__(self, type, value, traceback):
13 | self.write_buffer()
14 |
15 | def write(self, obj=None):
16 | if obj is not None:
17 | self.buffer.append(obj)
18 | if len(self.buffer)>=self.buffer_size:
19 | self.write_buffer()
20 |
21 | def write_buffer(self):
22 | with open(self.file_path, "a") as data_file:
23 | data_file.write(json.dumps(self.buffer))
24 | data_file.write("\n")
25 | self.buffer = []
26 |
27 | class BufferedJsonReader(object):
28 | def __init__(self, file_name):
29 | self.file_path = file_name
30 |
31 | def __enter__(self):
32 | return self
33 |
34 | def __exit__(self, type, value, traceback):
35 | pass
36 |
37 | def __iter__(self):
38 | with open(self.file_path, "r") as data_file:
39 | for line in data_file:
40 | yield from json.loads(line)
41 |
42 | def read(self):
43 | return list(self)
44 |
45 |
46 |
47 | def get_cache_path(dataset):
48 | cache_files = dataset.cache_files
49 | if isinstance(cache_files,dict):
50 | cache_files = next(iter(cache_files.values()))
51 | return pathlib.Path(cache_files[0]['filename']).parent
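
Usage sketch (illustrative, not part of the repository): the writer appends JSON-encoded batches, one list per line, and the reader flattens them back into a single list of records.

    with BufferedJsonWriter("cache.jsonl", buffer_size=2) as writer:
        for record in [{"id": 0}, {"id": 1}, {"id": 2}]:
            writer.write(record)  # flushes every 2 records, and once more on exit

    with BufferedJsonReader("cache.jsonl") as reader:
        records = reader.read()   # [{"id": 0}, {"id": 1}, {"id": 2}]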
--------------------------------------------------------------------------------
/break_evaluator/utils/graph.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from queue import Queue, deque
3 |
4 |
5 | def has_cycle(graph: nx.DiGraph):
6 | try:
7 | nx.find_cycle(graph, orientation='original')
8 | return True
9 | except nx.NetworkXNoCycle:
10 | return False
11 |
12 |
13 | def get_graph_levels(graph: nx.DiGraph):
14 | """
15 | Find graph level for each node
16 | level[node] := 0 if the node has no successors
17 | level[node] := max[over successors s](level[s])+1
18 | :param graph: directed graph with no cycles
19 | :return: (nodes_level, levels) tuple where:
20 | nodes_level: dictionary of :
21 | levels: dictionary of :[]
22 | """
23 | updated_nodes = Queue()
24 |
25 | # first layer
26 | leafs = [n_id for n_id in graph.nodes if not any(graph.successors(n_id))]
27 | nodes_levels = {n_id: 0 for n_id in leafs}
28 | updated_nodes.queue = deque(leafs)
29 |
30 | # update predecessors
31 | while not updated_nodes.empty():
32 | n_id = updated_nodes.get()
33 | low_bound = nodes_levels[n_id] + 1
34 | if low_bound > graph.number_of_nodes():
35 | raise ValueError("Cyclic graphs are not allowed")
36 | for s_id in graph.predecessors(n_id):
37 | if nodes_levels.get(s_id, -1) < low_bound:
38 | nodes_levels[s_id] = low_bound
39 | updated_nodes.put(s_id)
40 | levels = {}
41 | for n_id, l in nodes_levels.items():
42 | levels[l] = levels.get(l, []) + [n_id]
43 |
44 | return nodes_levels, levels
45 |
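A small worked example of get_graph_levels (illustrative, not part of the repository). Edges point from a node to the nodes it depends on, so sink nodes get level 0:

    import networkx as nx

    g = nx.DiGraph([(2, 1), (3, 1), (4, 2), (4, 3)])  # 4 depends on 2 and 3, which both depend on 1
    nodes_levels, levels = get_graph_levels(g)
    print(nodes_levels)  # {1: 0, 2: 1, 3: 1, 4: 2}
    print(levels)        # {0: [1], 1: [2, 3], 2: [4]}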
--------------------------------------------------------------------------------
/find_random.py:
--------------------------------------------------------------------------------
1 | import hydra
2 | import hydra.utils as hu
3 |
4 | import tqdm
5 | import numpy as np
6 | import json
7 | # from src.utils.app import App
8 | from src.dataset_readers.bm25_tasks import BM25Task
9 | from dataclasses import dataclass
10 | import random
11 |
12 |
13 |
14 | class RandomFinder:
15 | def __init__(self,cfg) -> None:
16 | self.output_path = cfg.output_path
17 | self.task_name = cfg.task_name
18 | # assert cfg.dataset_split in ["train","validation","test"]
19 | self.is_train = cfg.dataset_split=="train"
20 | self.setup_type = "a"
21 |
22 | self.task = BM25Task.from_name(cfg.task_name)(cfg.dataset_split,
23 | self.setup_type)
24 | print("started creating the corpus")
25 | self.corpus = self.task.get_corpus()
26 | print("finished creating the corpus")
27 |
28 |
29 |
30 |
31 |
32 |
33 | def find(cfg):
34 | random_finder = RandomFinder(cfg)
35 | data_list = list(random_finder.task.dataset)
36 | idx_list = list(range(len(random_finder.task.get_corpus())))
37 |
38 | for element in tqdm.tqdm(data_list):
39 | element['ctxs'] = [{"id":int(a)} for a in random.sample(idx_list,k=200)]
40 | return data_list
41 |
42 |
43 | @hydra.main(config_path="configs",config_name="random_finder")
44 | def main(cfg):
45 | print(cfg)
46 |
47 | data_list = find(cfg)
48 | # print(data_list)
49 | with open(cfg.output_path,"w") as f:
50 | json.dump(data_list,f)
51 |
52 |
53 | if __name__ == "__main__":
54 | main()
--------------------------------------------------------------------------------
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/graph.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from queue import Queue, deque
3 |
4 |
5 | def has_cycle(graph: nx.DiGraph):
6 | try:
7 | nx.find_cycle(graph, orientation='original')
8 | return True
9 | except nx.NetworkXNoCycle:
10 | return False
11 |
12 |
13 | def get_graph_levels(graph: nx.DiGraph):
14 | """
15 | Find graph level for each node
16 | level[node] := 0 if the node has no successors
17 | level[node] := max[over successors s](level[s])+1
18 | :param graph: directed graph with no cycles
19 | :return: (nodes_level, levels) tuple where:
20 | nodes_level: dictionary of <node_id>:<level>
21 | levels: dictionary of <level>:[<node_ids>]
22 | """
23 | updated_nodes = Queue()
24 |
25 | # first layer
26 | leafs = [n_id for n_id in graph.nodes if not any(graph.successors(n_id))]
27 | nodes_levels = {n_id: 0 for n_id in leafs}
28 | updated_nodes.queue = deque(leafs)
29 |
30 | # update predecessors
31 | while not updated_nodes.empty():
32 | n_id = updated_nodes.get()
33 | low_bound = nodes_levels[n_id] + 1
34 | if low_bound > graph.number_of_nodes():
35 | raise ValueError("Cyclic graphs are not allowed")
36 | for s_id in graph.predecessors(n_id):
37 | if nodes_levels.get(s_id, -1) < low_bound:
38 | nodes_levels[s_id] = low_bound
39 | updated_nodes.put(s_id)
40 | levels = {}
41 | for n_id, l in nodes_levels.items():
42 | levels[l] = levels.get(l, []) + [n_id]
43 |
44 | return nodes_levels, levels
45 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/smcalflow.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, load_from_disk
2 | import re, os
3 | import json
4 | import random
5 | from src.utils.dataset_utils import load_train_dataset
6 |
7 |
8 |
9 | class SmcalflowScorerTask:
10 | name = "smcalflow"
11 | prompt_field = "ctxs"
12 | question_field = "user_utterance"
13 | def __init__(self,example_file,ds_size=None) -> None:
14 | dataset = load_dataset("KaiLv/UDR_SMCalFlow")
15 | # dataset = load_dataset("iohadrubin/smcalflow")
16 | self.hf_dataset = load_train_dataset(dataset,size=ds_size)
17 | # self.hf_dataset = ['train']
18 | self.example_file = example_file
19 | with open(self.example_file) as f:
20 | self.data = json.load(f)
21 | idx_list = list(range(len(self.data)))
22 | random.Random(42).shuffle(idx_list)
23 | # self.data = [self.data[x] for x in idx_list[:44000]]
24 | print(f"{len(self.data)} examples")
25 |
26 | self.training_dataset = list(enumerate(self.hf_dataset))
27 |
28 | def get_fields(self, entry,index=-1):
29 | test_question = entry['test_user_utterance']
30 | question = entry['user_utterance']
31 | lispress = entry['lispress']
32 | test_lispress = entry['test_lispress']
33 | return question,lispress,test_question,test_lispress
34 |
35 |
36 | @classmethod
37 | def remove_double_space(cls,string):
38 | return re.sub("[ ]{2,}", " ", string)
39 | @classmethod
40 | def reformat(cls,text):
41 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))])
42 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/php.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return PhpBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return PhpBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return PhpBM25Task.norm(entry['target'])
25 |
26 |
27 | class PhpBM25Task:
28 | name = 'php'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_PHP")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
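Usage sketch (illustrative, not part of the repository; assumes the NLTK punkt tokenizer data is installed): norm simply word-tokenizes the text, and the registered field getters choose which fields of an entry feed the BM25 corpus.

    print(PhpBM25Task.norm("Sort an array of strings in PHP."))
    # -> ['Sort', 'an', 'array', 'of', 'strings', 'in', 'PHP', '.']

    entry = {"question": "reverse a string", "target": "strrev($s);"}  # hypothetical entry
    print(field_getter.functions["qa"](entry))  # tokens of "reverse a string strrev($s);"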
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/mtop.py:
--------------------------------------------------------------------------------
1 | import re, os
2 | from datasets import load_dataset, load_from_disk
3 | from src.utils.dataset_utils import load_train_dataset
4 |
5 | import json
6 | from src.utils.tokenizer_utils import get_length
7 |
8 |
9 |
10 |
11 | def set_length(example, idx,**kwargs):
12 | tokenizer = kwargs['tokenizer']
13 | q_field = example['question']
14 | a_field = example['logical_form']
15 | prompt_qa = f"{q_field}\t{a_field}"
16 | example['prompt_qa'] = prompt_qa
17 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
18 | return example
19 |
20 | class MtopInferenceTask:
21 | name = "mtop"
22 | def __init__(self, prompt_file, tokenizer,ds_size=None):
23 | self.prompt_file = prompt_file
24 | with open(self.prompt_file) as f:
25 | self.prompts = json.load(f)
26 | dataset = load_dataset("KaiLv/UDR_MTOP")
27 | self.hf_dataset = load_train_dataset(dataset,size=ds_size,listify=False)
28 | self.hf_dataset = self.hf_dataset.map(set_length,with_indices=True,fn_kwargs={'tokenizer':tokenizer})
29 | self.training_dataset = list(self.hf_dataset)
30 | self.postfix = ""
31 | self.prefix = ""
32 |
33 | @classmethod
34 | def postproccess(cls, string):
35 | return string
36 |
37 | def get_fields(self, entry):
38 | answer = entry['logical_form'] if 'logical_form' in entry else entry['answers'][0]
39 | idx_list =[p['id'] for p in entry['ctxs']]
40 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
41 | return entry['question'],answer,prompts['prompt_qa'],prompts['prompt_len'],idx_list
42 |
43 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/dart.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return DartBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return DartBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return DartBM25Task.norm(entry['target'])
25 |
26 |
27 | class DartBM25Task:
28 | name = 'dart'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_DART")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/java.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return JavaBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return JavaBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return JavaBM25Task.norm(entry['target'])
25 |
26 |
27 | class JavaBM25Task:
28 | name = 'java'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_Java")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/src/dataset_readers/scorer_tasks/wikiauto.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | import re
3 | import json
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 |
7 | class WikiautoScorerTask:
8 | name = "wikiauto"
9 | question_field = "source"
10 | prompt_field = "ctxs"
11 |
12 | def __init__(self, example_file, ds_size=None) -> None:
13 | # dataset = load_dataset('GEM/wiki_auto_asset_turk', 'train')
14 | # dataset = dataset.filter(lambda x: len(x['target']) < 1000)
15 | # # add idx column
16 | # for split in ['train', 'validation', 'test_asset', 'test_turk', "test_wiki"]:
17 | # ds_id = Dataset.from_dict({"idx": list(range(len(dataset[split])))})
18 | # dataset[split] = concatenate_datasets([dataset[split], ds_id], axis=1)
19 | dataset = load_dataset("KaiLv/UDR_WikiAuto")
20 |
21 | self.hf_dataset = load_train_dataset(dataset, size=ds_size)
22 | self.training_dataset = list(enumerate(self.hf_dataset))
23 | self.example_file = example_file
24 | with open(self.example_file) as f:
25 | self.data = json.load(f)
26 | self.postfix = "Simplified text: "
27 |
28 | def get_fields(self, entry, index=-1):
29 | question_prefix = "Simplify the text: "
30 | answer_prefix = "Simplified text: "
31 | test_question = question_prefix + entry['test_source']
32 | question = question_prefix + entry['source']
33 | decomposition = answer_prefix + entry['target']
34 | test_decomposition = entry['test_target']
35 | return question, decomposition, test_question, test_decomposition
36 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/operators_sequence.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from ast import literal_eval
4 | import pandas as pd
5 |
6 | def create_operators_seq(dataset_file: str):
7 | df = pd.read_csv(dataset_file)
8 | df['operators_seq'] = df['operators'].apply(lambda x: ' '.join(literal_eval(x)))
9 | df[['question_text', 'operators_seq']].to_csv(os.path.splitext(dataset_file)[0]+'_operators_seq2.tsv',
10 | sep='\t', header=False, index=False)
11 |
12 | def eval_operators_seq(dataset_file: str, predictions_file:str):
13 | df = pd.read_csv(dataset_file)
14 | preds = []
15 | with open(predictions_file, 'rt') as f:
16 | for line in f.readlines():
17 | content = json.loads(line.strip())
18 | pred = content['predicted_tokens']
19 | preds.append(pred)
20 | df['gold_operators_seq'] = df['operators'].apply(lambda x: literal_eval(x))
21 | df['predictions'] = preds
22 | df['exact_match'] = df['gold_operators_seq']==df['predictions']
23 | base_name = os.path.splitext(predictions_file)[0]+'__eval'
24 | df.to_csv(base_name+'.csv', index=False)
25 | summary = df.mean().round(3).to_dict()
26 | with open(base_name+'_summary.json', 'wt') as f:
27 | json.dump(summary,f, indent=2, sort_keys=True)
28 | print(summary)
29 |
30 |
31 | if __name__ == '__main__':
32 | # create_operators_seq('datasets/Break/QDMR/dev.csv')
33 | eval_operators_seq('datasets/Break/QDMR/dev.csv',
34 | 'tmp/datasets_Break_QDMR/dependencies_graph/operators-seq--seq2seq/operators-seq--seq2seq/eval/datasets_Break_QDMR_dev_operators_seq__preds.json')
--------------------------------------------------------------------------------
/dataflow/core/utterance_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | import re
4 | from typing import List
5 |
6 | import spacy
7 | from spacy.language import Language
8 |
9 | from dataflow.core.constants import SpecialStrings
10 |
11 |
12 | def tokenize_datetime(text: str) -> str:
13 | """Tokenizes datetime to make it consistent with the seaweed tokens."""
14 | # 5.10 => 5 . 10
15 | # 4:00 => 4 : 00
16 | # 5/7 => 5 / 7
17 | # 5\7 => 5 \ 7
18 | # 3-9 => 3 - 9
19 | text = re.sub(r"(\d)([.:/\\-])(\d)", r"\1 \2 \3", text)
20 |
21 | # 4pm => 4 pm
22 | text = re.sub(r"(\d+)([a-zA-Z])", r"\1 \2", text)
23 |
24 | # safe guard to avoid multiple spaces
25 | text = re.sub(r"\s+", " ", text)
26 | return text
27 |
28 |
29 | class UtteranceTokenizer:
30 | """A Spacy-based tokenizer with some heuristics for user utterances."""
31 |
32 | def __init__(self, spacy_model_name: str = "en_core_web_md") -> None:
33 | self._spacy_nlp: Language = spacy.load(spacy_model_name)
34 |
35 | def tokenize(self, utterance_str: str) -> List[str]:
36 | """Tokenizes the utterance string and returns a list of tokens.
37 | """
38 | if not utterance_str:
39 | return []
40 |
41 | if utterance_str == SpecialStrings.NULL:
42 | # do not tokenize the NULL special string
43 | return [utterance_str]
44 |
45 | tokens: List[str] = sum(
46 | [
47 | tokenize_datetime(token.text).split(" ")
48 | for token in self._spacy_nlp(utterance_str)
49 | ],
50 | [],
51 | )
52 | return tokens
53 |
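A small example of the datetime heuristic (illustrative, not part of the repository). UtteranceTokenizer itself additionally runs spaCy over the utterance, which requires the en_core_web_md model to be installed.

    print(tokenize_datetime("5/7 at 4:00pm"))
    # -> "5 / 7 at 4 : 00 pm"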
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/from_file_steps_spans_extractor.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Dict, Tuple, List
3 |
4 | from overrides import overrides
5 | from spacy.tokens.doc import Doc
6 |
7 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans
8 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.steps_spans_extractors.base_steps_spans_extractor import BaseSpansExtractor
9 |
10 |
11 | class FromFileSpansExtractor(BaseSpansExtractor):
12 | def __init__(self, *file_path: str):
13 | super().__init__()
14 | self._cache: Dict[str, Tuple[StepsSpans, dict]] = {}
15 | for file in file_path:
16 | with open(file, 'r') as f:
17 | for line in f.readlines():
18 | content = json.loads(line.strip())
19 | steps_spans: StepsSpans = StepsSpans.from_dict(content['steps_spans'])
20 | metadata = content['metadata']
21 | self._cache[metadata['question_id']] = steps_spans, metadata
22 |
23 | @overrides
24 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None,
25 | debug: dict = None) -> StepsSpans:
26 | steps_spans, metadata = self._cache[question_id]
27 | if debug is not None: debug.update(**metadata)
28 | return steps_spans
29 |
30 | @overrides
31 | def _extract(self, question_id: str, question_tokens: Doc, steps_tokens: List[Doc], steps_operators: List[QDMROperation] = None,
32 | debug: dict = None) -> StepsSpans:
33 | raise NotImplementedError('use extract()')
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/pubmed.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return PubmedBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return PubmedBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return PubmedBM25Task.norm(entry['target'])
25 |
26 |
27 | class PubmedBM25Task:
28 | name = 'pubmed'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_PubMed")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/python.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return PythonBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return PythonBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return PythonBM25Task.norm(entry['target'])
25 |
26 |
27 | class PythonBM25Task:
28 | name = 'python'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_Python")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/dataflow/core/utterance_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | import re
4 | from typing import List
5 |
6 | from dataflow.core.constants import SpecialStrings
7 | from dataflow.core.dialogue import AgentUtterance, UserUtterance
8 | from dataflow.core.utterance_tokenizer import UtteranceTokenizer
9 |
10 |
11 | def clean_utterance_text(text: str) -> str:
12 | """Removes line breaking and extra spaces in the user utterance."""
13 | # sometimes the user utterance contains line breaking and extra spaces
14 | text = re.sub(r"\s+", " ", text)
15 | # sometimes the user utterance has leading/ending spaces
16 | text = text.strip()
17 | return text
18 |
19 |
20 | def build_user_utterance(
21 | text: str, utterance_tokenizer: UtteranceTokenizer
22 | ) -> UserUtterance:
23 | text = clean_utterance_text(text)
24 | if not text:
25 | return UserUtterance(
26 | original_text=SpecialStrings.NULL, tokens=[SpecialStrings.NULL]
27 | )
28 | return UserUtterance(original_text=text, tokens=utterance_tokenizer.tokenize(text))
29 |
30 |
31 | def build_agent_utterance(
32 | text: str, utterance_tokenizer: UtteranceTokenizer, described_entities: List[str]
33 | ) -> AgentUtterance:
34 | text = clean_utterance_text(text)
35 | if not text:
36 | return AgentUtterance(
37 | original_text=SpecialStrings.NULL,
38 | tokens=[SpecialStrings.NULL],
39 | described_entities=described_entities,
40 | )
41 | return AgentUtterance(
42 | original_text=text,
43 | tokens=utterance_tokenizer.tokenize(text),
44 | described_entities=described_entities,
45 | )
46 |
--------------------------------------------------------------------------------
/dataflow/core/prediction_report.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import csv
3 | from typing import Dict, List, Sequence, Union
4 |
5 | import pandas as pd
6 |
7 |
8 | class PredictionReportDatum(abc.ABC):
9 | @abc.abstractmethod
10 | def flatten(self) -> Dict[str, Union[str, int]]:
11 | raise NotImplementedError()
12 |
13 |
14 | def save_prediction_report_tsv(
15 | prediction_report: Sequence[PredictionReportDatum], prediction_report_tsv: str,
16 | ) -> None:
17 | """Converts prediction results into a pandas dataframe and saves it a tsv report file.
18 | """
19 | prediction_report_df = pd.DataFrame(
20 | [datum.flatten() for datum in prediction_report]
21 | )
22 | prediction_report_df.to_csv(
23 | prediction_report_tsv,
24 | sep="\t",
25 | index=False,
26 | encoding="utf-8",
27 | quoting=csv.QUOTE_ALL,
28 | )
29 |
30 |
31 | def save_prediction_report_txt(
32 | prediction_report: Sequence[PredictionReportDatum],
33 | prediction_report_txt: str,
34 | field_names: List[str],
35 | ) -> None:
36 | """Prints prediction results into an easy-to-read text report file."""
37 | with open(prediction_report_txt, "w") as fp:
38 | for datum in prediction_report:
39 | fp.write("=" * 16)
40 | fp.write("\n")
41 |
42 | flatten_fields = datum.flatten()
43 | for field_name in field_names:
44 | field_value = flatten_fields[field_name]
45 | # use "hypo" not "prediction" as the name here just to make it visually aligned with "gold"
46 | if field_name == "prediction":
47 | field_name = "hypo"
48 | print(f"{field_name}\t{field_value}", file=fp)
49 |
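A minimal, hypothetical PredictionReportDatum subclass is enough to exercise both writers (illustrative sketch; the real datum classes live elsewhere in dataflow):

    from dataclasses import dataclass
    from typing import Dict, Union

    @dataclass(frozen=True)
    class SimpleDatum(PredictionReportDatum):  # hypothetical datum type
        datum_id: str
        gold: str
        prediction: str

        def flatten(self) -> Dict[str, Union[str, int]]:
            return {"datum_id": self.datum_id, "gold": self.gold, "prediction": self.prediction}

    report = [SimpleDatum("turn-0", "(Yield 1)", "(Yield 1)")]
    save_prediction_report_tsv(report, "report.tsv")
    save_prediction_report_txt(report, "report.txt", ["datum_id", "gold", "prediction"])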
--------------------------------------------------------------------------------
/dataflow/multiwoz/trade_dst/mapping.pair:
--------------------------------------------------------------------------------
1 | it's it is
2 | don't do not
3 | doesn't does not
4 | didn't did not
5 | you'd you would
6 | you're you are
7 | you'll you will
8 | i'm i am
9 | they're they are
10 | that's that is
11 | what's what is
12 | couldn't could not
13 | i've i have
14 | we've we have
15 | can't cannot
16 | i'd i would
17 | i'd i would
18 | aren't are not
19 | isn't is not
20 | wasn't was not
21 | weren't were not
22 | won't will not
23 | there's there is
24 | there're there are
25 | . . .
26 | restaurants restaurant -s
27 | hotels hotel -s
28 | laptops laptop -s
29 | cheaper cheap -er
30 | dinners dinner -s
31 | lunches lunch -s
32 | breakfasts breakfast -s
33 | expensively expensive -ly
34 | moderately moderate -ly
35 | cheaply cheap -ly
36 | prices price -s
37 | places place -s
38 | venues venue -s
39 | ranges range -s
40 | meals meal -s
41 | locations location -s
42 | areas area -s
43 | policies policy -s
44 | children child -s
45 | kids kid -s
46 | kidfriendly kid friendly
47 | cards card -s
48 | upmarket expensive
49 | inpricey cheap
50 | inches inch -s
51 | uses use -s
52 | dimensions dimension -s
53 | driverange drive range
54 | includes include -s
55 | computers computer -s
56 | machines machine -s
57 | families family -s
58 | ratings rating -s
59 | constraints constraint -s
60 | pricerange price range
61 | batteryrating battery rating
62 | requirements requirement -s
63 | drives drive -s
64 | specifications specification -s
65 | weightrange weight range
66 | harddrive hard drive
67 | batterylife battery life
68 | businesses business -s
69 | hours hour -s
70 | one 1
71 | two 2
72 | three 3
73 | four 4
74 | five 5
75 | six 6
76 | seven 7
77 | eight 8
78 | nine 9
79 | ten 10
80 | eleven 11
81 | twelve 12
82 | anywhere any where
83 | good bye goodbye
84 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/utils/change_config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | from tempfile import TemporaryDirectory
5 | import tarfile
6 | from allennlp.common.params import Params
7 | from pathlib import Path
8 |
9 | from typing import Dict, Any, Union
10 |
11 |
12 | def change_config(model_path:str, overrides: Union[str,Dict[str, Any]]):
13 | with TemporaryDirectory() as tmpdirname:
14 | with tarfile.open(model_path, mode='r:gz') as input_tar:
15 | print('Extracting model...')
16 | input_tar.extractall(tmpdirname)
17 |
18 | os.rename(model_path, os.path.join(os.path.dirname(model_path), 'model_bu.tar.gz'))
19 |
20 | # rewrite config
21 | conf_path = os.path.join(tmpdirname, 'config.json')
22 | p = Params.from_file(conf_path, overrides)
23 | p.to_file(conf_path)
24 |
25 | with tarfile.open(model_path, "w:gz") as output_tar:
26 | print('Archiving model...')
27 | output_tar.add(tmpdirname, arcname ="")
28 |
29 |
30 | if __name__ == "__main__":
31 | def run_change_config(args):
32 | assert args.root_dir and args.overrides
33 | models = Path(args.root_dir).rglob('model.tar.gz')
34 | for x in models:
35 | print(x)
36 | change_config(str(x), args.overrides)
37 |
38 | parse = argparse.ArgumentParser()
39 | parse.set_defaults(func=run_change_config)
40 | parse.add_argument("-r", "--root_dir", type=str, help="Source directory with model.tar.gz to modify")
41 | parse.add_argument("-o", "--overrides", type=str,
42 | help='"settings params to override. dictionary, supports nested fieldsby dots')
43 |
44 | args = parse.parse_args()
45 | args.func(args)
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/e2e.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return E2eBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return E2eBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return E2eBM25Task.norm(entry['target'])
25 |
26 |
27 | class E2eBM25Task:
28 | name = 'e2e'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_E2E")
35 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
36 | if self.dataset_split == "train":
37 | self.dataset = self.train_dataset
38 | else:
39 | self.dataset = list(dataset[self.dataset_split])
40 | self.corpus = None
41 | self.instruction = "Represent the example for retrieving duplicate examples; Input: "
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/mtop.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 |
9 |
10 | field_getter = App()
11 |
12 | def norm(text):
13 | return (" ".join(text.split(";"))).split(" ")
14 |
15 | @field_getter.add("q")
16 | def get_question(entry):
17 | return MtopBM25Task.norm(entry['question'])
18 |
19 | @field_getter.add("qa")
20 | def get_qa(entry):
21 | return MtopBM25Task.norm(f"{entry['question']} {entry['logical_form']}")
22 |
23 | @field_getter.add("a")
24 | def get_decomp(entry):
25 | # print(entry)
26 | return MtopBM25Task.norm(entry['logical_form'])
27 |
28 |
29 | class MtopBM25Task:
30 | name = "mtop"
31 | def __init__(self, dataset_split, setup_type, ds_size=None):
32 | self.setup_type = setup_type
33 | self.get_field = field_getter.functions[self.setup_type]
34 | self.dataset_split = dataset_split
35 | dataset = load_dataset("KaiLv/UDR_MTOP")
36 | self.train_dataset = load_train_dataset(dataset,size=ds_size)
37 | if self.dataset_split=="train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 |
44 | def get_corpus(self):
45 | if self.corpus is None:
46 | self.corpus = [ self.get_field(entry) for entry in self.train_dataset]
47 | return self.corpus
48 |
49 | @classmethod
50 | def norm(cls,text):
51 | # return (" ".join(text.split(";"))).split(" ")
52 | return word_tokenize(text)
53 |
54 |
55 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/go.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return GoBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return GoBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return GoBM25Task.norm(entry['target'])
25 |
26 |
27 | class GoBM25Task:
28 | name = 'go'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_Go")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 | self.instruction = "Represent the code example for retrieving duplicate examples; Input: "
43 |
44 | def get_corpus(self):
45 | if self.corpus is None:
46 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
47 | return self.corpus
48 |
49 | @classmethod
50 | def norm(cls, text):
51 | # outputs a list
52 | return word_tokenize(text)
53 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/roc_story_generation.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return RocStoryGenerationBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return RocStoryGenerationBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return RocStoryGenerationBM25Task.norm(entry['target'])
25 |
26 |
27 | class RocStoryGenerationBM25Task:
28 | name = 'roc_story_generation'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_RocStory")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/smcalflow.py:
--------------------------------------------------------------------------------
1 | import re, os
2 | from datasets import load_dataset, load_from_disk
3 | from src.utils.dataset_utils import load_train_dataset
4 |
5 | import json
6 | from src.utils.app import App
7 | from nltk.tokenize import word_tokenize
8 |
9 |
10 |
11 | field_getter = App()
12 |
13 |
14 | @field_getter.add("q")
15 | def get_question(entry):
16 | return SmcalflowBM25Task.norm(entry['user_utterance'])
17 |
18 | @field_getter.add("qa")
19 | def get_qa(entry):
20 | return SmcalflowBM25Task.norm(f"{entry['user_utterance']} {entry['lispress']}")
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | # print(entry)
25 | return SmcalflowBM25Task.norm(entry['lispress'])
26 |
27 |
28 | class SmcalflowBM25Task:
29 | name = "smcalflow"
30 | def __init__(self, dataset_split, setup_type,ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | # dataset = load_dataset("iohadrubin/smcalflow")
35 | dataset = load_dataset("KaiLv/UDR_SMCalFlow")
36 | self.train_dataset = load_train_dataset(dataset,size=ds_size)
37 | if self.dataset_split=="train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [ self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls,text):
50 | # return (" ".join(text.split(";"))).split(" ")
51 | return word_tokenize(text)
52 |
53 |
54 |
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/roc_ending_generation.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop etc., kp20k's question is a list, so norm is not needed
14 | return RocEndingGenerationBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return RocEndingGenerationBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return RocEndingGenerationBM25Task.norm(entry['target'])
25 |
26 |
27 | class RocEndingGenerationBM25Task:
28 | name = 'roc_ending_generation'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_RocEnding")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 |
43 | def get_corpus(self):
44 | if self.corpus is None:
45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
46 | return self.corpus
47 |
48 | @classmethod
49 | def norm(cls, text):
50 | # outputs a list
51 | return word_tokenize(text)
52 |
--------------------------------------------------------------------------------
/dataflow/core/turn_prediction.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | from dataclasses import dataclass
4 | from typing import Optional
5 |
6 | from dataflow.core.dialogue import (
7 | Dialogue,
8 | ProgramExecutionOracle,
9 | TurnId,
10 | UserUtterance,
11 | )
12 |
13 |
14 | @dataclass(frozen=True, eq=True, repr=True)
15 | class UtteranceWithContext:
16 | """
17 | A user utterance, with the dialogue history leading up to it.
18 | This is the input to the lispress prediction task.
19 | """
20 |
21 | datum_id: TurnId
22 | user_utterance: UserUtterance
23 | context: Dialogue
24 |
25 |
26 | @dataclass(frozen=True, eq=True, repr=True)
27 | class TurnPrediction:
28 | """
29 | A model prediction of the `lispress` for a single Turn.
30 | This is the output of the lispress prediction task.
31 | """
32 |
33 | datum_id: TurnId
34 | user_utterance: str # redundant. just to make these files easier to read
35 | lispress: str
36 |
37 |
38 | @dataclass(frozen=True, eq=True, repr=True)
39 | class TurnAnswer:
40 | """
41 | A model prediction of the `lispress` for a single Turn.
42 | This is the output of the lispress prediction task.
43 | """
44 |
45 | datum_id: TurnId
46 | user_utterance: str # redundant. just to make these files easier to read
47 | lispress: str
48 | program_execution_oracle: Optional[ProgramExecutionOracle]
49 |
50 |
51 | def missing_prediction(datum_id: TurnId) -> TurnPrediction:
52 | """
53 | A padding `TurnPrediction` that is used when a turn with
54 | `datum_id` is missing from a predictions file.
55 | """
56 | return TurnPrediction(
57 | datum_id=datum_id, user_utterance="", lispress="",
58 | )
59 |
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/e2e.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
2 | from src.utils.dataset_utils import load_train_dataset
3 |
4 | import json, os
5 | from src.utils.tokenizer_utils import get_length
6 |
7 |
8 | def set_length(example, idx,**kwargs):
9 | tokenizer = kwargs['tokenizer']
10 | question_prefix = "Table: "
11 | answer_prefix = "Sentence: "
12 | q_field = question_prefix + example['question']
13 | a_field = answer_prefix + example['target']
14 | prompt_qa = f"{q_field}\t{a_field}"
15 | example['prompt_qa'] = prompt_qa
16 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
17 | return example
18 |
19 |
20 | class E2eInferenceTask:
21 | name = "e2e"
22 |
23 | def __init__(self, prompt_file, tokenizer, ds_size=None):
24 | self.prompt_file = prompt_file
25 | with open(self.prompt_file) as f:
26 | self.prompts = json.load(f)
27 | dataset = load_dataset("KaiLv/UDR_E2E")
28 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False)
29 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer})
30 | self.training_dataset = list(self.hf_dataset)
31 | self.postfix = 'Sentence: '
32 |
33 | @classmethod
34 | def postproccess(cls, string):
35 | return string
36 |
37 | def get_fields(self, entry):
38 | question = entry['question']
39 | answer = entry['target'] if "target" in entry else entry['answers'][0]
40 | idx_list = [p['id'] for p in entry['ctxs']]
41 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
42 | return "Table: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list
43 |
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/go.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 | import json
7 | from src.utils.tokenizer_utils import get_length
8 |
9 |
10 | def set_length(example, idx,**kwargs):
11 | tokenizer = kwargs['tokenizer']
12 | question_prefix = "Code: "
13 | answer_prefix = "Comment: "
14 | q_field = question_prefix + example['question']
15 | a_field = answer_prefix + example['target']
16 | prompt_qa = f"{q_field}\t{a_field}"
17 | example['prompt_qa'] = prompt_qa
18 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
19 | return example
20 |
21 |
22 | class GoInferenceTask:
23 | name = "go"
24 |
25 | def __init__(self, prompt_file, tokenizer, ds_size=None):
26 | self.prompt_file = prompt_file
27 | with open(self.prompt_file) as f:
28 | self.prompts = json.load(f)
29 | dataset = load_dataset("KaiLv/UDR_Go")
30 |
31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False)
32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer})
33 | self.training_dataset = list(self.hf_dataset)
34 | self.postfix = 'Comment: '
35 |
36 | @classmethod
37 | def postproccess(cls, string):
38 | return string
39 |
40 | def get_fields(self, entry):
41 | question = entry['question']
42 | answer = entry['target'] if "target" in entry else entry['answers'][0]
43 | idx_list = [p['id'] for p in entry['ctxs']]
44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
45 | return "Code: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list
46 |
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/pubmed.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 | import json
7 | from src.utils.tokenizer_utils import get_length
8 |
9 |
10 | def set_length(example, idx,**kwargs):
11 | tokenizer = kwargs['tokenizer']
12 | question_prefix = ""
13 | answer_prefix = "TL;DR: "
14 | q_field = question_prefix + example['question']
15 | a_field = answer_prefix + example['target']
16 | prompt_qa = f"{q_field}\t{a_field}"
17 | example['prompt_qa'] = prompt_qa
18 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
19 | return example
20 |
21 |
22 | class PubmedInferenceTask:
23 | name = "pubmed"
24 |
25 | def __init__(self, prompt_file, tokenizer, ds_size=None):
26 | self.prompt_file = prompt_file
27 | with open(self.prompt_file) as f:
28 | self.prompts = json.load(f)
29 | dataset = load_dataset("KaiLv/UDR_PubMed")
30 |
31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False)
32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer})
33 | self.training_dataset = list(self.hf_dataset)
34 | self.postfix = 'TL;DR: '
35 |
36 | @classmethod
37 | def postproccess(cls, string):
38 | return string
39 |
40 | def get_fields(self, entry):
41 | question = entry['question']
42 | answer = entry['target'] if "target" in entry else entry['answers'][0]
43 | idx_list = [p['id'] for p in entry['ctxs']]
44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
45 | return "" + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list
46 |
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/reddit.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 | import json
7 | from src.utils.tokenizer_utils import get_length
8 |
9 |
10 | def set_length(example, idx,**kwargs):
11 | tokenizer = kwargs['tokenizer']
12 | question_prefix = ""
13 | answer_prefix = "TL;DR: "
14 | q_field = question_prefix + example['question']
15 | a_field = answer_prefix + example['target']
16 | prompt_qa = f"{q_field}\t{a_field}"
17 | example['prompt_qa'] = prompt_qa
18 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
19 | return example
20 |
21 |
22 | class RedditInferenceTask:
23 | name = "reddit"
24 |
25 | def __init__(self, prompt_file, tokenizer, ds_size=None):
26 | self.prompt_file = prompt_file
27 | with open(self.prompt_file) as f:
28 | self.prompts = json.load(f)
29 | dataset = load_dataset("KaiLv/UDR_Reddit")
30 |
31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False)
32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer})
33 | self.training_dataset = list(self.hf_dataset)
34 | self.postfix = 'TL;DR: '
35 |
36 | @classmethod
37 | def postproccess(cls, string):
38 | return string
39 |
40 | def get_fields(self, entry):
41 | question = entry['question']
42 | answer = entry['target'] if "target" in entry else entry['answers'][0]
43 | idx_list = [p['id'] for p in entry['ctxs']]
44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
45 | return "" + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list
46 |
--------------------------------------------------------------------------------
/qdecomp_with_dependency_graphs/scripts/eval/eval_copy_files.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from shutil import copyfile, copytree, ignore_patterns
3 | import argparse
4 | import os
5 | import errno
6 |
7 | def copy(exp_path: str, dest_path: str):
8 | patterns: list[str] = ["**/evals", "**/plots", "**/config.json", "**/metrics.json"]
9 | exclude_patterns: list[str] = ["*_preds.json", "*_summary.tsv"]
10 | experiments = [str(p.parent) for p in Path(exp_path).glob("**/evals/")]
11 |
12 | for exp in experiments:
13 | for pattern in patterns:
14 | exclude = [p for ex_patt in exclude_patterns for p in Path(exp).glob(ex_patt)]
15 | pathlist = [p for p in Path(exp).glob(pattern) if p not in exclude]
16 | for path in pathlist:
17 | path_in_str = str(path)
18 | d=os.path.join(dest_path, path_in_str)
19 | os.makedirs(os.path.dirname(d), exist_ok=True)
20 | print("{} -> {}".format(path_in_str, d))
21 | try:
22 | copytree(path_in_str, d, ignore=ignore_patterns(*exclude_patterns))
23 | except OSError as exc: # python >2.5
24 | if exc.errno == errno.ENOTDIR:
25 | copyfile(path_in_str, d)
26 | else:
27 | raise
28 |
29 |
30 | if __name__ == '__main__':
31 | parser = argparse.ArgumentParser(description="copy aside evaluations files")
32 | parser.add_argument('--exp_dir', type=str, help='path to experiments directory')
33 | parser.add_argument('--dest_dir', type=str, help='path to destination directory')
34 | args = parser.parse_args()
35 | assert os.path.exists(args.exp_dir)
36 | assert args.exp_dir != args.dest_dir
37 |
38 | copy(args.exp_dir, args.dest_dir)
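
For orientation, a hypothetical run of the copier above; the directory names are placeholders, and in practice the script is invoked with --exp_dir/--dest_dir as the argparse block shows:

    # Given an experiments tree like (illustrative only):
    #   experiments/run1/evals/report.json          -> copied
    #   experiments/run1/evals/dev_summary.tsv      -> skipped via ignore_patterns
    #   experiments/run1/plots/loss.png             -> copied
    #   experiments/run1/config.json, metrics.json  -> copytree hits ENOTDIR, copyfile fallback
    copy("experiments", "experiments_backup")       # matched paths are recreated under experiments_backup/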
--------------------------------------------------------------------------------
/src/dataset_readers/bm25_tasks/reddit.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
3 | import json, os
4 | from src.utils.app import App
5 | from nltk.tokenize import word_tokenize
6 | from src.utils.dataset_utils import load_train_dataset
7 |
8 | field_getter = App()
9 |
10 |
11 | @field_getter.add("q")
12 | def get_question(entry):
13 | # Unlike mtop and similar tasks, the kp20k question is already a list, so no norm is needed
14 | return RedditBM25Task.norm(entry['question'])
15 |
16 |
17 | @field_getter.add("qa")
18 | def get_qa(entry):
19 | return RedditBM25Task.norm(f"{entry['question']} {entry['target']}")
20 |
21 |
22 | @field_getter.add("a")
23 | def get_decomp(entry):
24 | return RedditBM25Task.norm(entry['target'])
25 |
26 |
27 | class RedditBM25Task:
28 | name = 'reddit'
29 |
30 | def __init__(self, dataset_split, setup_type, ds_size=None):
31 | self.setup_type = setup_type
32 | self.get_field = field_getter.functions[self.setup_type]
33 | self.dataset_split = dataset_split
34 | dataset = load_dataset("KaiLv/UDR_Reddit")
35 | print(dataset)
36 | self.train_dataset = load_train_dataset(dataset, size=ds_size)
37 | if self.dataset_split == "train":
38 | self.dataset = self.train_dataset
39 | else:
40 | self.dataset = list(dataset[self.dataset_split])
41 | self.corpus = None
42 | self.instruction = "Represent the reddit example for retrieving duplicate examples; Input: "
43 |
44 | def get_corpus(self):
45 | if self.corpus is None:
46 | self.corpus = [self.get_field(entry) for entry in self.train_dataset]
47 | return self.corpus
48 |
49 | @classmethod
50 | def norm(cls, text):
51 | # returns a list (of word tokens)
52 | return word_tokenize(text)
53 |
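
Because norm returns word-token lists, the corpus from get_corpus drops straight into an Okapi-BM25 scorer. The sketch below uses the third-party rank_bm25 package purely as an illustration; it is an assumption, not necessarily the retrieval backend this reader is paired with:

    from rank_bm25 import BM25Okapi
    from src.dataset_readers.bm25_tasks.reddit import RedditBM25Task

    task = RedditBM25Task(dataset_split="train", setup_type="q")
    corpus = task.get_corpus()                  # list of word-token lists (see norm)
    bm25 = BM25Okapi(corpus)

    # Query text is made up; word_tokenize needs the nltk 'punkt' data to be installed.
    query = RedditBM25Task.norm("TIFU by replying all to the whole company")
    scores = bm25.get_scores(query)             # one BM25 score per training example
    top_ids = scores.argsort()[::-1][:5]        # indices of the five closest demonstrations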
--------------------------------------------------------------------------------
/src/dataset_readers/inference_tasks/php.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
4 | from src.utils.dataset_utils import load_train_dataset
5 |
6 | import json
7 | from src.utils.tokenizer_utils import get_length
8 |
9 |
10 | def set_length(example, idx,**kwargs):
11 | tokenizer = kwargs['tokenizer']
12 | question_prefix = "Code: "
13 | answer_prefix = "Comment: "
14 | q_field = question_prefix + example['question']
15 | a_field = answer_prefix + example['target']
16 | prompt_qa = f"{q_field}\t{a_field}"
17 | example['prompt_qa'] = prompt_qa
18 | example['prompt_len'] = get_length(tokenizer,prompt_qa)
19 | return example
20 |
21 |
22 | class PhpInferenceTask:
23 | name = "php"
24 |
25 | def __init__(self, prompt_file, tokenizer, ds_size=None):
26 | self.prompt_file = prompt_file
27 | with open(self.prompt_file) as f:
28 | self.prompts = json.load(f)
29 | dataset = load_dataset("KaiLv/UDR_PHP")
30 |
31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False)
32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer})
33 | self.training_dataset = list(self.hf_dataset)
34 | self.postfix = 'Comment: '
35 |
36 | @classmethod
37 | def postproccess(cls, string):
38 | return string
39 |
40 | def get_fields(self, entry):
41 | question = entry['question']
42 | answer = entry['target'] if "target" in entry else entry['answers'][0]
43 | idx_list = [p['id'] for p in entry['ctxs']]
44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']])
45 | return "Code: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list
46 |
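
The prompt_len field these readers attach to every demonstration points at token budgeting downstream. A hedged sketch of how such a budget could be applied to the get_fields output; the 1800-token limit and the greedy packing order are illustrative assumptions, not this repo's code:

    def pack_demonstrations(demo_qas, demo_lens, budget=1800):
        """Greedily keep retrieved demonstrations while their token lengths fit the budget."""
        kept, used = [], 0
        for qa, length in zip(demo_qas, demo_lens):
            if used + length > budget:
                break
            kept.append(qa)
            used += length
        return "\n".join(kept)

    # e.g. question, answer, demo_qas, demo_lens, ids = task.get_fields(entry)
    # prompt = pack_demonstrations(demo_qas, demo_lens) + "\n" + question + "\t" + task.postfix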
--------------------------------------------------------------------------------