├── DPR ├── CHANGELOG.md ├── dpr │ ├── __init__.py │ ├── data │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── conf_utils.py │ └── .DS_Store ├── .DS_Store ├── conf │ ├── .DS_Store │ ├── ctx_sources │ │ └── default_sources.yaml │ ├── train │ │ ├── extractive_reader_default.yaml │ │ ├── biencoder_nq.yaml │ │ ├── biencoder_default.yaml │ │ └── biencoder_local.yaml │ ├── encoder │ │ └── hf_bert.yaml │ ├── datasets │ │ └── retriever_default.yaml │ ├── biencoder_train_cfg.yaml │ └── gen_embs.yaml ├── req.txt ├── edit.txt ├── run.sh ├── eval.sh ├── setup.py └── CONTRIBUTING.md ├── dataflow ├── py.typed ├── analysis │ └── __init__.py ├── .DS_Store ├── __init__.py ├── core │ ├── __init__.py │ ├── constants.py │ ├── utterance_tokenizer.py │ ├── utterance_utils.py │ ├── prediction_report.py │ └── turn_prediction.py ├── multiwoz │ ├── __init__.py │ └── trade_dst │ │ ├── __init__.py │ │ └── mapping.pair ├── leaderboard │ └── __init__.py └── onmt_helpers │ └── __init__.py ├── src ├── __init__.py ├── data │ └── datasets │ │ ├── mtop.py.lock │ │ ├── smcalflow.py.lock │ │ ├── totto.py │ │ └── __init__.py ├── .DS_Store ├── dataset_readers │ ├── .DS_Store │ ├── bm25_tasks │ │ ├── __init__.py │ │ ├── php.py │ │ ├── dart.py │ │ ├── java.py │ │ ├── pubmed.py │ │ ├── python.py │ │ ├── e2e.py │ │ ├── mtop.py │ │ ├── go.py │ │ ├── roc_story_generation.py │ │ ├── smcalflow.py │ │ ├── roc_ending_generation.py │ │ └── reddit.py │ ├── scorer_tasks │ │ ├── __init__.py │ │ ├── e2e.py │ │ ├── go.py │ │ ├── java.py │ │ ├── pubmed.py │ │ ├── reddit.py │ │ ├── dart.py │ │ ├── php.py │ │ ├── python.py │ │ ├── cnndailymail.py │ │ ├── copa.py │ │ ├── cr.py │ │ ├── cs_valid.py │ │ ├── mr.py │ │ ├── cs_explan.py │ │ ├── rte.py │ │ ├── cola.py │ │ ├── cosmos_qa.py │ │ ├── mnli.py │ │ ├── snli.py │ │ ├── subj.py │ │ ├── trec.py │ │ ├── sst2.py │ │ ├── sst5.py │ │ ├── agnews.py │ │ ├── amazon.py │ │ ├── yahoo.py │ │ ├── roc_ending_generation.py │ │ ├── roc_story_generation.py │ │ ├── dbpedia.py │ │ ├── yelp_full.py │ │ ├── mtop.py │ │ ├── common_gen.py │ │ ├── break.py │ │ ├── smcalflow.py │ │ └── wikiauto.py │ └── inference_tasks │ │ ├── __init__.py │ │ ├── mtop.py │ │ ├── e2e.py │ │ ├── go.py │ │ ├── pubmed.py │ │ ├── reddit.py │ │ └── php.py ├── utils │ ├── tokenizer_utils.py │ ├── app.py │ ├── dataset_utils.py │ ├── log_utils.py │ └── cache_util.py └── models │ ├── instructor_embedder.py │ ├── model.py │ └── embedder.py ├── easy-elasticsearch ├── easy_elasticsearch │ ├── __init__.py │ └── examples │ │ ├── __init__.py │ │ └── download_and_run.sh └── setup.py ├── qdecomp_with_dependency_graphs ├── qdecomp_nlp │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── samplers │ │ │ └── __init__.py │ │ ├── tokenizers │ │ │ └── __init__.py │ │ ├── dataset_readers │ │ │ ├── __init__.py │ │ │ └── util.py │ │ └── token_indexers │ │ │ └── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── hybrid │ │ │ └── __init__.py │ │ ├── seq2seq │ │ │ ├── __init__.py │ │ │ ├── simple_seq2seq_custom.py │ │ │ └── custom_copynet_seq2seq_for_soft_rat.py │ │ └── dependencies_graph │ │ │ └── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── token_embedders │ │ │ └── __init__.py │ │ └── seq2seq_encoders │ │ │ └── __init__.py │ ├── training │ │ ├── __init__.py │ │ ├── metrics │ │ │ └── __init__.py │ │ └── learning_rate_schedulers │ │ │ └── __init__.py │ ├── predictors │ │ ├── __init__.py │ │ ├── seq2seq │ │ │ ├── __init__.py │ │ │ └── simple_seq2seq_dynamic_predictor.py │ │ └── dependencies_graph │ │ │ └── __init__.py │ └── .DS_Store ├── 
.gitignore ├── .DS_Store ├── scripts │ ├── .DS_Store │ ├── qdmr_to_logical_form │ │ └── utils_.py │ ├── data_processing │ │ └── add_extra_tokens.py │ ├── tune │ │ └── studies │ │ │ ├── biaffine-graph-parser--transformer-encoder.py │ │ │ └── operators-aware-biaffine-graph-parser--transformer-encoder.py │ ├── utils │ │ └── change_config.py │ └── eval │ │ └── eval_copy_files.py ├── dependencies_graph │ ├── .DS_Store │ ├── extractors │ │ ├── spans_dependencies_extractors │ │ │ ├── __init__.py │ │ │ └── base_spans_dependencies_extractor.py │ │ ├── tokens_dependencies_to_qdmr_extractors │ │ │ ├── converters │ │ │ │ ├── __init__.py │ │ │ │ └── base_spans_dep_to_qdmr_converter.py │ │ │ ├── __init__.py │ │ │ └── base_tokens_dep_to_qdmr_extractor.py │ │ ├── tokens_dependencies_extractors │ │ │ ├── __init__.py │ │ │ ├── collapsers │ │ │ │ ├── __init__.py │ │ │ │ ├── to_dependency_type_collapser.py │ │ │ │ ├── base_collapser.py │ │ │ │ ├── to_sequential_ids_collapser.py │ │ │ │ └── add_operator_properties_collapser.py │ │ │ └── base_tokens_dependencies_extractor.py │ │ ├── __init__.py │ │ ├── steps_dependencies_extractors │ │ │ ├── __init__.py │ │ │ └── base_steps_dependencies_extractor.py │ │ └── steps_spans_extractors │ │ │ ├── __init__.py │ │ │ ├── aligners │ │ │ └── base_aligner.py │ │ │ ├── base_steps_spans_extractor.py │ │ │ └── from_file_steps_spans_extractor.py │ ├── data_types │ │ ├── qdmr_operation.py │ │ └── __init__.py │ ├── config │ │ └── configuration_loader.py │ └── operators_sequence.py ├── utils │ ├── timeout_test.py │ └── timeout.py ├── requirements_core.txt └── debug.py ├── configs ├── random_finder.yaml ├── bm25_finder.yaml ├── create_index.yaml ├── knn_finder.yaml ├── scorer.yaml ├── client.yaml ├── api_scorer.yaml └── inference.yaml ├── break_evaluator ├── .DS_Store ├── tmp │ ├── results │ │ ├── metrics.json │ │ ├── decomp_summary.txt │ │ └── question_decomp_summary.txt │ └── .DS_Store ├── example_test_predictions │ └── .DS_Store ├── requirements.txt ├── utils │ ├── timeout_test.py │ ├── timeout.py │ └── graph.py ├── allennlp_preds_format.py ├── Dockerfile └── evaluate.yaml ├── Channel_LM_Prompting ├── img │ ├── teaser.png │ ├── tuning.png │ ├── data_download.png │ └── demonstration.png └── .gitignore ├── semantic_parsing_with_constrained_lm ├── .DS_Store ├── src │ └── semantic_parsing_with_constrained_lm │ │ ├── domains │ │ ├── calflow │ │ │ └── grammar │ │ │ │ ├── start.scfg │ │ │ │ ├── entities.scfg │ │ │ │ ├── enum_wrappers.scfg │ │ │ │ └── quoted.scfg │ │ └── __init__.py │ │ ├── __init__.py │ │ ├── configs │ │ ├── __init__.py │ │ ├── lib │ │ │ └── __init__.py │ │ └── smpa_20210929_zeroshot.py │ │ ├── earley │ │ └── __init__.py │ │ ├── scfg │ │ ├── __init__.py │ │ ├── parser │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── types.py │ │ └── string_utils.py │ │ ├── scripts │ │ └── __init__.py │ │ ├── async_tools │ │ └── __init__.py │ │ ├── finetune │ │ └── __init__.py │ │ ├── paths.py │ │ ├── util │ │ ├── types.py │ │ └── missing_sentinel.py │ │ ├── cache.py │ │ ├── datum.py │ │ └── trie_partial_parse.py ├── tests │ └── semantic_parsing_with_constrained_lm │ │ ├── __init__.py │ │ ├── scfg │ │ ├── __init__.py │ │ ├── test_read_grammar.py │ │ └── test_string_utils.py │ │ ├── domains │ │ └── __init__.py │ │ ├── earley │ │ ├── __init__.py │ │ ├── test_input.py │ │ └── test_agenda.py │ │ └── async_tools │ │ └── __init__.py ├── third_party │ └── break-evaluator │ │ ├── tmp │ │ └── results │ │ │ └── metrics.json │ │ ├── requirements.txt │ │ ├── utils │ │ ├── timeout_test.py │ │ ├── 
timeout.py │ │ └── graph.py │ │ ├── allennlp_preds_format.py │ │ ├── Dockerfile │ │ ├── evaluate.yaml │ │ ├── pyproject.toml │ │ └── LICENSE ├── .gitignore ├── SUPPORT.md ├── CODE_OF_CONDUCT.md ├── pyproject.toml ├── LICENSE └── NOTICE.md ├── .idea └── deployment.xml ├── scripts ├── find_bm25.sh └── score_bm25.sh └── find_random.py /DPR/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataflow/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DPR/dpr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DPR/dpr/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DPR/dpr/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataflow/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/datasets/mtop.py.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/data/datasets/smcalflow.py.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /easy-elasticsearch/easy_elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /easy-elasticsearch/easy_elasticsearch/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/training/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DPR/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/.DS_Store -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/training/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/src/.DS_Store -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/dataset_readers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/token_indexers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/modules/token_embedders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DPR/conf/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/conf/.DS_Store -------------------------------------------------------------------------------- /DPR/dpr/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/DPR/dpr/.DS_Store 
-------------------------------------------------------------------------------- /dataflow/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/dataflow/.DS_Store -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/dependencies_graph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/modules/seq2seq_encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/random_finder.yaml: -------------------------------------------------------------------------------- 1 | output_path: ??? 2 | dataset_split: ??? 3 | task_name: ??? -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/dependencies_graph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/training/learning_rate_schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /break_evaluator/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/.DS_Store -------------------------------------------------------------------------------- /break_evaluator/tmp/results/metrics.json: -------------------------------------------------------------------------------- 1 | {"ged": 0.3659574013246697, "normalized_exact_match": 0.15} -------------------------------------------------------------------------------- /dataflow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | datasets/ 4 | misc/ 5 | *.pyc 6 | -------------------------------------------------------------------------------- /break_evaluator/tmp/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/tmp/.DS_Store -------------------------------------------------------------------------------- /dataflow/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | -------------------------------------------------------------------------------- /src/dataset_readers/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/src/dataset_readers/.DS_Store -------------------------------------------------------------------------------- /dataflow/multiwoz/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | -------------------------------------------------------------------------------- /Channel_LM_Prompting/img/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/teaser.png -------------------------------------------------------------------------------- /Channel_LM_Prompting/img/tuning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/tuning.png -------------------------------------------------------------------------------- /dataflow/leaderboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | -------------------------------------------------------------------------------- /dataflow/onmt_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | -------------------------------------------------------------------------------- /dataflow/multiwoz/trade_dst/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/.DS_Store -------------------------------------------------------------------------------- /Channel_LM_Prompting/img/data_download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/data_download.png -------------------------------------------------------------------------------- /Channel_LM_Prompting/img/demonstration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/Channel_LM_Prompting/img/demonstration.png -------------------------------------------------------------------------------- /DPR/req.txt: -------------------------------------------------------------------------------- 1 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz -------------------------------------------------------------------------------- /Channel_LM_Prompting/.gitignore: -------------------------------------------------------------------------------- 1 | original 2 | data 3 | out 4 | *.err 5 | *.out 6 | *.txt 7 | __pycache__ 8 | Makefile 9 | tmp* 10 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/scripts/.DS_Store -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/semantic_parsing_with_constrained_lm/.DS_Store -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/start.scfg: -------------------------------------------------------------------------------- 1 | start -> !" " unit, unit 2 | -------------------------------------------------------------------------------- /break_evaluator/example_test_predictions/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/break_evaluator/example_test_predictions/.DS_Store -------------------------------------------------------------------------------- /configs/bm25_finder.yaml: -------------------------------------------------------------------------------- 1 | output_path: ??? 2 | dataset_split: ??? 3 | setup_type: ??? 4 | task_name: ??? 
5 | L: 50 6 | score: False 7 | reindexing: True -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/qdecomp_nlp/.DS_Store -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KaiLv69/UDR/HEAD/qdecomp_with_dependency_graphs/dependencies_graph/.DS_Store -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/earley/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/async_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/domains/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | -------------------------------------------------------------------------------- /DPR/edit.txt: -------------------------------------------------------------------------------- 1 | DPR/conf/datasets/encoder_train_default.yaml 2 | DPR/dpr/data/download_data.py 3 | DPR/conf/datasets/retriever_default.yaml 4 | DPR/conf/ctx_sources/default_sources.yaml 5 | 6 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/async_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/tmp/results/metrics.json: -------------------------------------------------------------------------------- 1 | {"exact_match": 0.24242424242424243, "sari": 0.7061778423719823, "ged": 0.4089606835211786, "normalized_exact_match": 0.32323232323232326} -------------------------------------------------------------------------------- /src/utils/tokenizer_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def get_length(tokenizer, text): 4 | tokenized_example = tokenizer.encode_plus(text,truncation=False,return_tensors='pt') 5 | return int(tokenized_example.input_ids.squeeze().shape[0]) -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/spans_dependencies_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_spans_dependencies_extractor import BaseSpansDependenciesExtractor 2 | from .merge_spans_dependencies_exatractor import MergeSpansDependenciesExtractor -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/paths.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from pathlib import Path 5 | 6 | DOMAINS_DIR = Path(__file__).resolve().parent / "domains" 7 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_spans_dep_to_qdmr_converter import BaseSpansDepToQdmrConverter 2 | from .rule_based_spans_dep_to_qdmr_converter import RuleBasedSpansDepToQdmrConverter -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_tokens_dependencies_extractor import BaseTokensDependenciesExtractor 2 | from .tokens_dependencies_extractor import TokensDependenciesExtractor 3 | 4 | from .collapsers import * 5 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .steps_spans_extractors import * 2 | from .steps_dependencies_extractors import * 3 | from .spans_dependencies_extractors import * 4 | from .tokens_dependencies_extractors import * 5 | from .tokens_dependencies_to_qdmr_extractors import * 6 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_tokens_dep_to_qdmr_extractor import BaseTokensDependenciesToQDMRExtractor 2 | from .spans_based_tokens_dep_to_qdmr_extractor import SpansBasedTokensDependenciesToQDMRExtractor 3 | from 
.converters import * 4 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | def is_skippable(string: str): 5 | """A string is skippable if it's empty or begins with a '#'""" 6 | return not string or string[0] == "#" 7 | -------------------------------------------------------------------------------- /DPR/run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES="0,1" python train_dense_encoder.py train_datasets=[grailqa_train] dev_datasets=[grailqa_dev] train=biencoder_local output_dir=/media/disk1/ohadr/dropout0.15 2 | CUDA_VISIBLE_DEVICES="4" python train_dense_encoder.py train_datasets=[break_train_qd] train=biencoder_local output_dir=/media/disk1/ohadr/break_qd -------------------------------------------------------------------------------- /break_evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | # python 3.7.6 2 | 3 | edit-distance==1.0.4 4 | editdistance==0.5.3 5 | matplotlib==3.1.2 6 | networkx==2.4 7 | neuralcoref==4.0 8 | overrides==2.8.0 9 | pandas==0.25.3 10 | lxml==4.5.0 11 | progressbar==2.5 12 | scipy==1.4.1 13 | spacy==2.1.9 14 | 15 | 16 | # python -m spacy download en_core_web_sm -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .venv/ 3 | .pipx/ 4 | 5 | logs/ 6 | trained_models/ 7 | 8 | src/semantic_parsing_with_constrained_lm/domains/calflow/data/*.jsonl 9 | src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/grammar.scfg 10 | src/semantic_parsing_with_constrained_lm/domains/overnight/data/ -------------------------------------------------------------------------------- /src/utils/app.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class App: 4 | def __init__(self): 5 | self.functions = {} 6 | def add(self, key): 7 | def adder(func): 8 | self.functions[key] = func 9 | return func 10 | return adder 11 | def __getitem__(self, __name: str) : 12 | return self.functions[__name] 13 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_dependencies_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_steps_dependencies_extractor import BaseStepsDependenciesExtractor 2 | from .logical_form_based_steps_dependencies_extractor import LogicalFormBasedStepsDependenciesExtractor 3 | from .pattern_based_steps_dependencies_extractor import PatternBasedStepsDependenciesExtractor -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/util/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os # pylint: disable=unused-import 5 | from typing import Union 6 | 7 | # This can be used to annotate arguments that are supposed to be file paths. 
8 | StrPath = Union[str, "os.PathLike[str]"] 9 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | # python 3.7.6 2 | 3 | edit-distance==1.0.4 4 | editdistance==0.5.3 5 | matplotlib==3.1.2 6 | networkx==2.4 7 | neuralcoref==4.0 8 | overrides==2.8.0 9 | pandas==0.25.3 10 | lxml==4.5.0 11 | progressbar==2.5 12 | scipy==1.4.1 13 | spacy==2.1.9 14 | 15 | 16 | # python -m spacy download en_core_web_sm -------------------------------------------------------------------------------- /DPR/conf/ctx_sources/default_sources.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | dpr_wiki: 4 | _target_: dpr.data.retriever_data.CsvCtxSrc 5 | file: data.wikipedia_split.psgs_w100 6 | id_prefix: 'wiki:' 7 | dpr_grail: 8 | _target_: dpr.data.retriever_data.CsvCtxSrc 9 | file: data.wikipedia_split.entities 10 | dpr_epr: 11 | _target_: dpr.data.retriever_data.EPRCtxSrc 12 | setup_type: ??? 13 | task_name: ??? 14 | 15 | 16 | # id_prefix: 'grail:' -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | ## Microsoft Support Policy 10 | 11 | Support for this project is limited to the resources listed above. 12 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/configs/smpa_20210929_zeroshot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # SMPA with no training examples. Things to vary: 5 | # - Normalize over valid tokens 6 | # - Reward per token 7 | # - Length normalization 8 | 9 | 10 | # - Context? 11 | # - Length normalization 12 | # - Normalize over valid tokens 13 | # - EOS penalty 14 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_steps_spans_extractor import BaseSpansExtractor 2 | from .from_file_steps_spans_extractor import FromFileSpansExtractor 3 | from .variations_based_steps_spans_extractor import VariationsBasedSpansExtractor 4 | 5 | from .aligners.base_aligner import BaseAligner 6 | from .aligners.ILP_based_aligner import ILPAligner 7 | from .aligners.rule_based_aligner import RuleBasedAligner -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/parser/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from typing import Tuple 5 | 6 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.parser.token import SCFGToken 7 | 8 | Nonterminal = str 9 | # An Alias is just another name for a nonterminal. 10 | Alias = str 11 | 12 | 13 | Expansion = Tuple[SCFGToken, ...] 14 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/data_types/qdmr_operation.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class QDMROperation(str, Enum): 5 | FIND, SELECT, FILTER, PROJECT, AGGREGATE, GROUP, SUPERLATIVE, COMPARATIVE, UNION, INTERSECTION, DISCARD, SORT, \ 6 | BOOLEAN, ARITHMETIC, COMPARISON, NONE = \ 7 | 'find', 'select', 'filter', 'project', 'aggregate', 'group', 'superlative', 'comparative', 'union', \ 8 | 'intersection', 'discard', 'sort', 'boolean', 'arithmetic', 'comparison', 'None' -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/entities.scfg: -------------------------------------------------------------------------------- 1 | personname -> quoted, " #(PersonName " quoted ")" 2 | string -> quoted, " #(String " quoted ")" 3 | respondcomment -> quoted, " #(RespondComment " quoted ")" 4 | locationkeyphrase -> quoted, " #(LocationKeyphrase " quoted ")" 5 | path -> quoted, " #(Path " quoted ")" 6 | 7 | list_path_ -> !"(empty list)", " #(List[Path] [])" 8 | list_recipient_ -> !"(empty recipient list)", " #(List[Recipient] [])" 9 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/data_types/__init__.py: -------------------------------------------------------------------------------- 1 | from .qdmr_operation import QDMROperation 2 | 3 | from .steps_dependencies_graph import \ 4 | StepsDependencies, DependencyType, StepData, StepDependencyData 5 | 6 | from .steps_spans import StepsSpans, Span 7 | 8 | from .spans_dependencies_graph import SpansData, SpanDependencyData, SpansDependencies 9 | 10 | from .tokens_dependencies_graph import \ 11 | TokenData, TokenDependencyData, TokenDependencyType, TokensDependencies 12 | -------------------------------------------------------------------------------- /break_evaluator/utils/timeout_test.py: -------------------------------------------------------------------------------- 1 | 2 | from time import sleep 3 | 4 | from break_evaluator.utils.timeout import exit_after 5 | 6 | 7 | @exit_after(5) 8 | def countdown(n): 9 | print('countdown started', flush=True) 10 | for i in range(n, -1, -1): 11 | print(i, end=', ', flush=True) 12 | sleep(1) 13 | print('countdown finished') 14 | 15 | 16 | if __name__ == "__main__": 17 | try: 18 | countdown(10) 19 | except KeyboardInterrupt: 20 | print('timeout!') 21 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/enum_wrappers.scfg: -------------------------------------------------------------------------------- 1 | holiday -> holiday_entity, " #(Holiday \"" holiday_entity "\")" 2 | placefeature -> place_feature_entity, " #(PlaceFeature \"" place_feature_entity "\")" 3 | weatherquantifier -> weather_quantifier_entity, " #(WeatherQuantifier \"" weather_quantifier_entity "\")" 4 | responsestatustype -> 
response_entity, " #(ResponseStatusType \"" response_entity "\")" 5 | number -> number_entity, " #(Number" number_entity ")" 6 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/utils/timeout_test.py: -------------------------------------------------------------------------------- 1 | 2 | from time import sleep 3 | 4 | from qdecomp_with_dependency_graphs.utils.timeout import exit_after 5 | 6 | 7 | @exit_after(5) 8 | def countdown(n): 9 | print('countdown started', flush=True) 10 | for i in range(n, -1, -1): 11 | print(i, end=', ', flush=True) 12 | sleep(1) 13 | print('countdown finished') 14 | 15 | 16 | if __name__ == "__main__": 17 | try: 18 | countdown(10) 19 | except KeyboardInterrupt: 20 | print('timeout!') 21 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/timeout_test.py: -------------------------------------------------------------------------------- 1 | 2 | from time import sleep 3 | 4 | from utils.timeout import exit_after 5 | 6 | 7 | @exit_after(5) 8 | def countdown(n): 9 | print('countdown started', flush=True) 10 | for i in range(n, -1, -1): 11 | print(i, end=', ', flush=True) 12 | sleep(1) 13 | print('countdown finished') 14 | 15 | 16 | if __name__ == "__main__": 17 | try: 18 | countdown(10) 19 | except KeyboardInterrupt: 20 | print('timeout!') 21 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/spans_dependencies_extractors/base_spans_dependencies_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | from typing import List, Tuple 3 | 4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies 5 | 6 | 7 | class BaseSpansDependenciesExtractor(ABC): 8 | @abstractmethod 9 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None, 10 | debug: dict = None) -> SpansDependencies: 11 | raise NotImplementedError() -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_dependencies_extractors/base_steps_dependencies_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Tuple 3 | 4 | from 
qdecomp_with_dependency_graphs.dependencies_graph.data_types import StepsDependencies 5 | 6 | 7 | class BaseStepsDependenciesExtractor(ABC): 8 | @abstractmethod 9 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None, 10 | debug: dict = None) -> StepsDependencies: 11 | raise NotImplementedError() -------------------------------------------------------------------------------- /src/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def load_train_dataset(dataset,size=None,listify=True): 4 | if size is not None: 5 | p = size 6 | data = dataset['train'] 7 | total_size = len(data) 8 | 9 | rand = random.Random(x=int(p*total_size)) 10 | index_list = list(range(total_size)) 11 | rand.shuffle(index_list) 12 | x = data.select(index_list[:int(p*total_size)]) 13 | 14 | 15 | else: 16 | x = dataset['train'] 17 | if listify: 18 | return list(x) 19 | else: 20 | return x -------------------------------------------------------------------------------- /configs/create_index.yaml: -------------------------------------------------------------------------------- 1 | cuda_device: ??? 2 | output_file: ??? 3 | setup_type: ??? 4 | dataset_split: ??? 5 | task_name: ??? 6 | batch_size: 50 7 | model_name: 'sentence-transformers/paraphrase-mpnet-base-v2' 8 | instruction: False 9 | dataset_reader: 10 | _target_: src.dataset_readers.indexer_dsr.IndexerDatasetReader 11 | task_name: ${task_name} 12 | setup_type: ${setup_type} 13 | dataset_split: ${dataset_split} 14 | model_name: ${model_name} 15 | instruction: ${instruction} 16 | model: 17 | _target_: src.models.embedder.IndexEmbedder 18 | model_name: ${model_name} 19 | -------------------------------------------------------------------------------- /src/models/instructor_embedder.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from typing import Dict 3 | import torch 4 | 5 | 6 | class IndexEmbedder(torch.nn.Module): 7 | def __init__(self, model_name) -> None: 8 | super().__init__() 9 | self.embedder = SentenceTransformer(model_name) 10 | 11 | def forward(self, instruction, enc_text, **kwargs) -> Dict[str, torch.Tensor]: 12 | input = [[i, e, 0] for i, e in zip(instruction, enc_text)] 13 | enc_emb = self.embedder.encode(input, show_progress_bar=False) 14 | return enc_emb -------------------------------------------------------------------------------- /DPR/conf/train/extractive_reader_default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | eval_step: 2000 4 | batch_size: 16 5 | dev_batch_size: 72 6 | adam_eps: 1e-8 7 | adam_betas: (0.9, 0.999) 8 | max_grad_norm: 1.0 9 | log_batch_step: 100 10 | train_rolling_loss_step: 100 11 | weight_decay: 0.0 12 | learning_rate: 1e-5 13 | 14 | # Linear warmup over warmup_steps. 15 | warmup_steps: 0 16 | 17 | # Number of updates steps to accumulate before performing a backward/update pass. 18 | gradient_accumulation_steps: 1 19 | 20 | # Total number of training epochs to perform. 
21 | num_train_epochs: 100000 22 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/requirements_core.txt: -------------------------------------------------------------------------------- 1 | # python 3.8.5 2 | 3 | allennlp==2.0.1 4 | allennlp-models==2.0.1 5 | dash==1.17.0 # optuna visualization 6 | edit-distance==1.0.4 7 | inflect==4.1.0 8 | lxml==4.5.2 9 | matplotlib==3.3.2 10 | more-itertools==8.5.0 11 | networkx==2.5 12 | nltk==3.5 13 | neuralcoref==4.0 14 | optuna==2.3.0 15 | ortools==8.0.8283 16 | pandas==1.1.3 17 | progressbar==2.5 18 | psutil==5.8.0 19 | tensorboard==2.3.0 20 | torch==1.7.1 21 | transformers==4.2.2 22 | 23 | # python -m spacy download en_core_web_sm 24 | # python -c "import nltk; nltk.download('wordnet')" -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/converters/base_spans_dep_to_qdmr_converter.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import networkx as nx 4 | 5 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies 6 | from qdecomp_with_dependency_graphs.evaluation.decomposition import Decomposition 7 | 8 | 9 | class BaseSpansDepToQdmrConverter(ABC): 10 | @abstractmethod 11 | def convert(self, spans_dependencies: SpansDependencies) -> Decomposition: 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /configs/knn_finder.yaml: -------------------------------------------------------------------------------- 1 | index_path: ??? 2 | output_path: ??? 3 | dataset_split: ??? 4 | setup_type: ??? 5 | task_name: ??? 6 | model_name: 'sentence-transformers/paraphrase-mpnet-base-v2' 7 | cuda_device: ??? 8 | instruction: False 9 | batch_size: 50 10 | dataset_reader: 11 | _target_: src.dataset_readers.indexer_dsr.IndexerDatasetReader 12 | task_name: ${task_name} 13 | setup_type: ${setup_type} 14 | dataset_split: ${dataset_split} 15 | model_name: ${model_name} 16 | instruction: ${instruction} 17 | model: 18 | _target_: src.models.embedder.IndexEmbedder 19 | model_name: ${model_name} 20 | 21 | -------------------------------------------------------------------------------- /configs/scorer.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 1 2 | model_name: "EleutherAI/gpt-neo-2.7B" 3 | # model_name: "EleutherAI/gpt-neo-125M" 4 | output_file: ??? 5 | example_file: ??? 6 | setup_type: ??? 7 | task_name: ??? 
8 | sort: True 9 | 10 | dataset_reader: 11 | _target_: src.dataset_readers.scorer_dsr.ScorerDatasetReader 12 | example_file: ${example_file} 13 | task_name: ${task_name} 14 | model_name: ${model_name} 15 | setup_type: ${setup_type} 16 | model: 17 | _target_: transformers.AutoModelForCausalLM.from_pretrained 18 | pretrained_model_name_or_path: ${model_name} 19 | local_files_only: True 20 | 21 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_to_qdmr_extractors/base_tokens_dep_to_qdmr_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Tuple 3 | 4 | from qdecomp_with_dependency_graphs.evaluation.decomposition import Decomposition 5 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import TokensDependencies 6 | 7 | 8 | class BaseTokensDependenciesToQDMRExtractor(ABC): 9 | @abstractmethod 10 | def extract(self, tokens_dependencies: TokensDependencies, debug: dict = None) -> Decomposition: 11 | raise NotImplementedError() -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/aligners/base_aligner.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | from typing import List, Tuple, Set 4 | 5 | import spacy 6 | from spacy.tokens.doc import Doc 7 | 8 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans 9 | 10 | 11 | class BaseAligner(ABC): 12 | def align(self, question: Doc, steps: List[Doc], steps_operators: List[QDMROperation], 13 | index_to_steps: List[Set[Tuple[int, int]]]) -> List[Set[Tuple[int, int]]]: 14 | raise NotImplementedError() 15 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | from typing import Optional 6 | 7 | 8 | class CacheClient(ABC): 9 | async def __aenter__(self): 10 | pass 11 | 12 | async def __aexit__(self, exc_type, exc_value, traceback): 13 | pass 14 | 15 | @abstractmethod 16 | async def get(self, args: dict) -> Optional[dict]: 17 | pass 18 | 19 | @abstractmethod 20 | async def upload(self, args: dict, result: dict) -> None: 21 | pass 22 | -------------------------------------------------------------------------------- /configs/client.yaml: -------------------------------------------------------------------------------- 1 | # cwd: ??? 2 | 3 | # model_name: 'google/t5-v1_1-xl' 4 | model_name: "EleutherAI/gpt-neo-2.7B" 5 | # model_name: "EleutherAI/gpt-neo-125M" 6 | engine: "ada" 7 | output_file: ??? 8 | batch_size: 5 9 | # length_file: ??? 10 | prompt_file: ??? 11 | max_length: 2048 12 | num_prompts: -1 13 | task_name: ??? 
14 | 15 | dataset_reader: 16 | _target_: src.dataset_readers.few_shot_dsr.FewShotDatasetReader 17 | model_name: ${model_name} 18 | task_name: ${task_name} 19 | # _target_: src.dataset_readers.tasks.break_task.BreakTask 20 | prompt_file: ${prompt_file} 21 | num_prompts: ${num_prompts} 22 | # length_file: ${length_file} 23 | -------------------------------------------------------------------------------- /easy-elasticsearch/easy_elasticsearch/examples/download_and_run.sh: -------------------------------------------------------------------------------- 1 | #### Downloading #### 2 | ES=./elasticsearch-7.9.1/bin/elasticsearch 3 | if test -f "$ES"; then 4 | echo "$ES exists. Using the existent one" 5 | else 6 | echo "$ES does not exist. Downloading a new one" 7 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.1-linux-x86_64.tar.gz 8 | tar -xf elasticsearch-7.9.1-linux-x86_64.tar.gz 9 | fi 10 | 11 | #### Starting the ES service #### 12 | nohup ./elasticsearch-7.9.1/bin/elasticsearch > elasticsearch.log & 13 | 14 | #### Run the example #### 15 | python -m easy_elasticsearch.examples.quora --mode existing 16 | -------------------------------------------------------------------------------- /configs/api_scorer.yaml: -------------------------------------------------------------------------------- 1 | # cwd: ??? 2 | 3 | # model_name: 'google/t5-v1_1-xl' 4 | model_name: "EleutherAI/gpt-neo-2.7B" 5 | # model_name: "EleutherAI/gpt-neo-125M" 6 | engine: "ada" 7 | output_file: ??? 8 | batch_size: 5 9 | # length_file: ??? 10 | example_file: ??? 11 | setup_type: qa 12 | max_length: 2048 13 | task_name: ??? 14 | 15 | dataset_reader: 16 | _target_: src.dataset_readers.scorer_dsr.ScorerDatasetReader 17 | model_name: ${model_name} 18 | task_name: ${task_name} 19 | # _target_: src.dataset_readers.tasks.break_task.BreakTask 20 | # prompt_file: ${prompt_file} 21 | setup_type: ${setup_type} 22 | example_file: ${example_file} 23 | # length_file: ${length_file} 24 | -------------------------------------------------------------------------------- /break_evaluator/allennlp_preds_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | def main(): 5 | # todo: @@SEP@@ to ; , @@#@@ to # 6 | predictions_file = "old_data_dev_low_level_preds.json" 7 | traget_file= predictions_file.replace('.json', '.csv') 8 | with open(predictions_file, "r") as fd: 9 | preds = [json.loads(line) for line in fd.readlines()] 10 | preds = [re.sub(r'@@(\d+)@@', '#\g<1>', re.sub('@@SEP@@',';', ' '.join(p['predicted_tokens'][0]))) for p in preds] 11 | preds.insert(0,'prediction') 12 | preds = [f'"{p}"\n' for p in preds] 13 | with open(traget_file, "wt") as fd: 14 | fd.writelines(preds) 15 | 16 | 17 | if __name__ == '__main__': 18 | main() -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/datum.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from dataclasses import dataclass 5 | from typing import Optional, TypeVar 6 | 7 | 8 | @dataclass(frozen=True, eq=True) 9 | class Datum: 10 | dialogue_id: Optional[str] 11 | turn_part_index: Optional[int] 12 | agent_context: Optional[str] 13 | natural: str 14 | 15 | 16 | @dataclass(frozen=True, eq=True) 17 | class FullDatum(Datum): 18 | canonical: str 19 | 20 | 21 | FullDatumSub = TypeVar("FullDatumSub", bound=FullDatum, contravariant=True) 22 | DatumSub = TypeVar("DatumSub", bound=Datum, contravariant=True) 23 | -------------------------------------------------------------------------------- /dataflow/core/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | 5 | class SpecialStrings: 6 | """Special strings in stringified turn parts. 7 | """ 8 | 9 | # an empty value (we need it since some library doesn't like an empty string) 10 | NULL = "__NULL" 11 | # indicates there is a break between the two utterance segments 12 | BREAK = "__BREAK" 13 | # indicates the user is the speaker for the following utterance 14 | SPEAKER_USER = "__User" 15 | # indicates the agent is the speaker for the following utterance 16 | SPEAKER_AGENT = "__Agent" 17 | # start of a program 18 | START_OF_PROGRAM = "__StartOfProgram" 19 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/scfg/string_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from typing import Iterable 5 | 6 | 7 | def detokenize(tokens: Iterable[str], with_treebank: bool = True) -> str: 8 | """ 9 | Given a list of tokens, join them together into a string. 10 | with_treebank = True is typically used when rendering utterances, so we don't need to deal with things like 11 | "andrew's" 12 | with_treebank = False is typically for rendering express. 
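    Examples, mirroring the accompanying test_string_utils.py tests:
        detokenize(["find", "Event", "time", "."]) == "find Event time ."
        detokenize(["f", "(", "x", ",", "y", ")"], with_treebank=False) == "f(x,y)"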
13 | """ 14 | if with_treebank: 15 | return " ".join(tokens).replace(" ", " ") 16 | 17 | return "".join(tokens) 18 | -------------------------------------------------------------------------------- /src/models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | 5 | def no_init(loading_code): 6 | def dummy(self): 7 | return 8 | 9 | modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm] 10 | original = {} 11 | for mod in modules: 12 | original[mod] = mod.reset_parameters 13 | mod.reset_parameters = dummy 14 | 15 | result = loading_code() 16 | for mod in modules: 17 | mod.reset_parameters = original[mod] 18 | 19 | return result 20 | 21 | 22 | def get_model(**kwargs): 23 | return no_init(lambda: AutoModelForCausalLM.from_pretrained(**kwargs, local_files_only=True)) -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/debug.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import shutil 3 | 4 | # past command as script parameter 5 | from allennlp.commands import main 6 | 7 | sys.argv=sys.argv[1:] # remove script name 8 | 9 | serialization_dir = "tmp/debugger_train" 10 | 11 | if "train" in sys.argv: 12 | sys.argv.extend(["-s", serialization_dir]) 13 | 14 | # Training will fail if the serialization directory already 15 | # has stuff in it. If you are running the same training loop 16 | # over and over again for debugging purposes, it will. 17 | # Hence we wipe it out in advance. 18 | # BE VERY CAREFUL NOT TO DO THIS FOR ACTUAL TRAINING! 19 | shutil.rmtree(serialization_dir, ignore_errors=True) 20 | 21 | main() -------------------------------------------------------------------------------- /DPR/conf/train/biencoder_nq.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | batch_size: 16 4 | dev_batch_size: 64 5 | adam_eps: 1e-8 6 | adam_betas: (0.9, 0.999) 7 | max_grad_norm: 2.0 8 | log_batch_step: 100 9 | train_rolling_loss_step: 100 10 | weight_decay: 0.0 11 | learning_rate: 2e-5 12 | 13 | # Linear warmup over warmup_steps. 14 | warmup_steps: 1237 15 | 16 | # Number of updates steps to accumulate before performing a backward/update pass. 17 | gradient_accumulation_steps: 1 18 | 19 | # Total number of training epochs to perform. 20 | num_train_epochs: 40 21 | eval_per_epoch: 1 22 | hard_negatives: 1 23 | other_negatives: 0 24 | val_av_rank_hard_neg: 30 25 | val_av_rank_other_neg: 30 26 | val_av_rank_bsz: 128 27 | val_av_rank_max_qs: 10000 -------------------------------------------------------------------------------- /DPR/conf/train/biencoder_default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | batch_size: 2 4 | dev_batch_size: 4 5 | adam_eps: 1e-8 6 | adam_betas: (0.9, 0.999) 7 | max_grad_norm: 1.0 8 | log_batch_step: 100 9 | train_rolling_loss_step: 100 10 | weight_decay: 0.0 11 | learning_rate: 1e-5 12 | 13 | # Linear warmup over warmup_steps. 14 | warmup_steps: 100 15 | 16 | # Number of updates steps to accumulate before performing a backward/update pass. 17 | gradient_accumulation_steps: 1 18 | 19 | # Total number of training epochs to perform. 
20 | num_train_epochs: 40 21 | eval_per_epoch: 1 22 | hard_negatives: 1 23 | other_negatives: 0 24 | val_av_rank_hard_neg: 30 25 | val_av_rank_other_neg: 30 26 | val_av_rank_bsz: 128 27 | val_av_rank_max_qs: 10000 -------------------------------------------------------------------------------- /break_evaluator/Dockerfile: -------------------------------------------------------------------------------- 1 | # Evaluator for Break dataset on beaker 2 | 3 | FROM python:3.7.6-slim-buster 4 | 5 | ENV PYTHONPATH . 6 | 7 | # set the working directory 8 | 9 | WORKDIR /break-evaluator 10 | 11 | 12 | # install python packages 13 | 14 | ADD ./requirements.txt . 15 | 16 | RUN pip3.7 install -r requirements.txt 17 | RUN python3.7 -m spacy download en_core_web_sm 18 | 19 | 20 | # add in the readme and evaluation scripts 21 | 22 | ADD README.md . 23 | ADD allennlp_preds_format.py . 24 | COPY evaluation ./evaluation 25 | COPY scripts ./scripts 26 | COPY utils ./utils 27 | 28 | RUN mkdir /results 29 | 30 | 31 | # define the default command 32 | # in this case a linux shell where we can run the eval script 33 | CMD ["/bin/bash"] 34 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/domains/calflow/grammar/quoted.scfg: -------------------------------------------------------------------------------- 1 | # Quoted strings *do* begin with a space in this grammar. 2 | # For example, `create event with " Rose"`. 3 | # The space has to be a regex, b/c it gets consumed by CopyTokens, 4 | # and it has to not be inside nonquoteplus, because it doesn't 5 | # appear on the plan side. 6 | quoted -> !"\"" !/ / nonquoteplus !"\"", "\"" nonquoteplus "\"" 7 | 8 | # matches one or more characters that are not double quotes 9 | nonquoteplus -> !/[^"]/ nonquotestar, /[^"]/ nonquotestar 10 | 11 | # matches zero or more characters that are not double quotes 12 | nonquotestar -> !/[^"]/ nonquotestar, /[^"]/ nonquotestar 13 | nonquotestar -> empty, empty 14 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/test_read_grammar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.read_grammar import PreprocessedGrammar 7 | 8 | 9 | def test_from_line_iter(): 10 | with pytest.raises(AssertionError) as excinfo: 11 | PreprocessedGrammar.from_line_iter( 12 | ['describe 2> "describe"', 'describe 2> "describe(" ")"'] 13 | ) 14 | assert "Macro describe cannot be defined more than once" in str(excinfo) 15 | # Doesn't throw. 
16 | PreprocessedGrammar.from_line_iter(['describe 2> "describe"']) 17 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/allennlp_preds_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | def main(): 5 | # todo: @@SEP@@ to ; , @@#@@ to # 6 | predictions_file = "old_data_dev_low_level_preds.json" 7 | traget_file= predictions_file.replace('.json', '.csv') 8 | with open(predictions_file, "r") as fd: 9 | preds = [json.loads(line) for line in fd.readlines()] 10 | preds = [re.sub(r'@@(\d+)@@', '#\g<1>', re.sub('@@SEP@@',';', ' '.join(p['predicted_tokens'][0]))) for p in preds] 11 | preds.insert(0,'prediction') 12 | preds = [f'"{p}"\n' for p in preds] 13 | with open(traget_file, "wt") as fd: 14 | fd.writelines(preds) 15 | 16 | 17 | if __name__ == '__main__': 18 | main() -------------------------------------------------------------------------------- /break_evaluator/evaluate.yaml: -------------------------------------------------------------------------------- 1 | 2 | description: Run the evaluator for the Break dataset. 3 | tasks: 4 | - spec: 5 | blueprint: $BREAK_EVALUATOR 6 | resultPath: /results 7 | args: 8 | - PYTHONPATH="." 9 | - python3.7 10 | - scripts/evaluate_predictions.py 11 | - --dataset_file=data/labels.csv 12 | - --preds_file=data/predictions.csv 13 | - --no_cache 14 | - --output_file_base=/results/results 15 | - --metrics 16 | - ged_scores exact_match sari normalized_exact_match 17 | datasetMounts: 18 | - datasetId: $BREAK_PREDICTIONS 19 | containerPath: /data/predictions 20 | - datasetId: $BREAK_LABELS 21 | containerPath: /data/labels.csv -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/util/missing_sentinel.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | class MissingSentinel: 5 | """One instance of this is created as MISSING_SENTINEL below. 6 | 7 | That instance is used to indicate that a variable lacks a value, and nothing else. 8 | 9 | Usually None is used for this purpose, but sometimes None is in the valid 10 | set of values and cannot be used to mean that a value is missing. 
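    For example (an illustrative signature, not taken from this codebase), a parameter
    declared as `value: Union[int, None, MissingSentinel] = MISSING_SENTINEL` lets the
    callee distinguish an explicitly supplied None from a value that was never provided.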
11 | 12 | This is very similar to dataclasses.MISSING, but that value has a private type.""" 13 | 14 | def __repr__(self) -> str: 15 | return "" 16 | 17 | 18 | MISSING_SENTINEL = MissingSentinel() 19 | -------------------------------------------------------------------------------- /src/data/datasets/totto.py: -------------------------------------------------------------------------------- 1 | import json 2 | import datasets 3 | 4 | class ToTToDataset: 5 | def __init__(self): 6 | _URL = "https://storage.googleapis.com/totto-public/totto_data.zip" 7 | dl_manager = datasets.utils.download_manager.DownloadManager() 8 | self.cache_path = dl_manager.download_and_extract(_URL) 9 | self.splits = {} 10 | for split_name in ["train","dev"]: 11 | with open(f"{self.cache_path}/totto_data/totto_{split_name}_data.jsonl", 'r') as f: 12 | proccessed_dataset = [] 13 | for example in f: 14 | dict_example = json.loads(example) 15 | proccessed_dataset.append(dict_example) 16 | self.splits[split_name] = proccessed_dataset -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/test_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.earley.input import SigmaStarTriePosition 5 | 6 | 7 | def test_sigmastar(): 8 | p = SigmaStarTriePosition[str]() 9 | (a_1,) = p.scan("a") 10 | (a_2,) = p.scan("a") 11 | assert id(a_1) == id(a_2), "scans should be cached and reused" 12 | 13 | (as_1,) = a_1.scan("s") 14 | (asd,) = as_1.scan("d") 15 | (asdf,) = asd.scan("f") 16 | assert asdf.last() == "f" 17 | assert asdf.prefix() == ["a", "s", "d", "f"] 18 | 19 | (asde,) = asd.scan("e") 20 | assert asde.prefix() == ["a", "s", "d", "e"] 21 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/Dockerfile: -------------------------------------------------------------------------------- 1 | # Evaluator for Break dataset on beaker 2 | 3 | FROM python:3.7.6-slim-buster 4 | 5 | ENV PYTHONPATH . 6 | 7 | # set the working directory 8 | 9 | WORKDIR /break-evaluator 10 | 11 | 12 | # install python packages 13 | 14 | ADD ./requirements.txt . 15 | 16 | RUN pip3.7 install -r requirements.txt 17 | RUN python3.7 -m spacy download en_core_web_sm 18 | 19 | 20 | # add in the readme and evaluation scripts 21 | 22 | ADD README.md . 23 | ADD allennlp_preds_format.py . 24 | COPY evaluation ./evaluation 25 | COPY scripts ./scripts 26 | COPY utils ./utils 27 | 28 | RUN mkdir /results 29 | 30 | 31 | # define the default command 32 | # in this case a linux shell where we can run the eval script 33 | CMD ["/bin/bash"] 34 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/evaluate.yaml: -------------------------------------------------------------------------------- 1 | 2 | description: Run the evaluator for the Break dataset. 3 | tasks: 4 | - spec: 5 | blueprint: $BREAK_EVALUATOR 6 | resultPath: /results 7 | args: 8 | - PYTHONPATH="." 
9 | - python3.7 10 | - scripts/evaluate_predictions.py 11 | - --dataset_file=data/labels.csv 12 | - --preds_file=data/predictions.csv 13 | - --no_cache 14 | - --output_file_base=/results/results 15 | - --metrics 16 | - ged_scores exact_match sari normalized_exact_match 17 | datasetMounts: 18 | - datasetId: $BREAK_PREDICTIONS 19 | containerPath: /data/predictions 20 | - datasetId: $BREAK_LABELS 21 | containerPath: /data/labels.csv -------------------------------------------------------------------------------- /DPR/conf/encoder/hf_bert.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | # model type. One of [hf_bert, pytext_bert, fairseq_roberta] 4 | encoder_model_type: hf_bert 5 | 6 | # HuggingFace's config name for model initialization 7 | pretrained_model_cfg: bert-base-uncased 8 | #pretrained_model_cfg: Luyu/co-condenser-marco 9 | 10 | # Some encoders need to be initialized from a file 11 | pretrained_file: 12 | 13 | # Extra linear layer on top of standard bert/roberta encoder 14 | projection_dim: 0 15 | 16 | # Max length of the encoder input sequence 17 | sequence_length: 256 18 | 19 | dropout: 0.1 20 | 21 | # whether to fix (don't update) context encoder during training or not 22 | fix_ctx_encoder: False 23 | 24 | # if False, the model won't load pre-trained BERT weights 25 | pretrained: True 26 | 27 | gradient_checkpointing: False -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_collapser import BaseCollapser 2 | from .join_collapser import JoinCollapser 3 | from .concat_collapser import ConcatCollapser 4 | from .missing_resources_collapser import MissingResourcesCollapser 5 | from .last_step_collapser import LastStepCollapser 6 | from .to_dependency_type_collapser import ToDependencyTypeCollapser 7 | from .single_to_multiple_steps_pre_collapser import PreSingleToMultipleStepsCollapser 8 | from .not_aligned_dum_collapser import NotAlignedDumCollapser 9 | from .single_to_multiple_steps_dup_collapser import DupSingleToMultipleStepsCollapser 10 | from .add_operator_properties_collapser import AddOperatorsPropertiesCollapser 11 | from .to_sequential_ids_collapser import ToSequentialIdsCollapser -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/to_dependency_type_collapser.py: -------------------------------------------------------------------------------- 1 | from overrides import overrides 2 | 3 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import DependencyType, SpansDependencies 4 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser 5 | 6 | 7 | class ToDependencyTypeCollapser(BaseCollapser): 8 | @overrides 9 | def collapse(self, spans_dependencies: SpansDependencies, decomposition: str= None) -> None: 10 | pass 11 | 12 | @overrides 13 | def unwind(self, spans_dependencies: SpansDependencies) -> None: 14 | for _, _, data in spans_dependencies.dependencies(): 15 | data.dep_type = DependencyType(data.dep_type) 16 | -------------------------------------------------------------------------------- /src/models/embedder.py: 
-------------------------------------------------------------------------------- 1 | from transformers import AutoModel 2 | from typing import Dict 3 | import torch 4 | 5 | def mean_pooling(model_output, attention_mask): 6 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 7 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 8 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 9 | 10 | class IndexEmbedder(torch.nn.Module): 11 | def __init__(self, model_name) -> None: 12 | super().__init__() 13 | self.embedder = AutoModel.from_pretrained(model_name) 14 | 15 | def forward(self, input_ids, attention_mask,**kwargs) -> Dict[str, torch.Tensor]: 16 | enc_emb = self.embedder(input_ids) 17 | return mean_pooling(enc_emb, attention_mask) -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/scfg/test_string_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.parser.utils import is_skippable 5 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.scfg.string_utils import detokenize 6 | 7 | 8 | def test_is_comment(): 9 | assert is_skippable("#hi") 10 | assert is_skippable("") 11 | assert not is_skippable("hi") 12 | 13 | 14 | def test_detokenize(): 15 | assert ( 16 | detokenize(["find", "Event", "time", ".", "results", "chris", "'s", "car"]) 17 | == "find Event time . results chris 's car" 18 | ) 19 | 20 | assert detokenize(["f", "(", "x", ",", "y", ")"], with_treebank=False) == "f(x,y)" 21 | -------------------------------------------------------------------------------- /DPR/conf/train/biencoder_local.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | # batch_size: 40 4 | batch_size: 120 5 | dev_batch_size: 32 6 | adam_eps: 1e-8 7 | adam_betas: (0.9, 0.999) 8 | max_grad_norm: 2.0 9 | log_batch_step: 1 10 | train_rolling_loss_step: 100 11 | weight_decay: 0.0 12 | # learning_rate: 2e-5 13 | # learning_rate: 0.000213 14 | learning_rate: 0.00013416407864998739 15 | # learning_rate: 0.0001065 16 | 17 | # encoder: 18 | # dropout: 0.15 19 | 20 | # Linear warmup over warmup_steps. 21 | warmup_steps: 1237 22 | 23 | # Number of updates steps to accumulate before performing a backward/update pass. 24 | gradient_accumulation_steps: 1 25 | 26 | # Total number of training epochs to perform. 27 | num_train_epochs: 30 28 | eval_per_epoch: 1 29 | hard_negatives: 1 30 | other_negatives: 0 31 | val_av_rank_hard_neg: 30 32 | val_av_rank_other_neg: 30 33 | val_av_rank_bsz: 128 34 | val_av_rank_max_qs: 10000 35 | -------------------------------------------------------------------------------- /src/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import logging 3 | 4 | @contextmanager 5 | def all_logging_disabled(highest_level=logging.CRITICAL): 6 | """ 7 | A context manager that will prevent any logging messages 8 | triggered during the body from being processed. 9 | :param highest_level: the maximum logging level in use. 
10 | This would only need to be changed if a custom level greater than CRITICAL 11 | is defined. 12 | """ 13 | # two kind-of hacks here: 14 | # * can't get the highest logging level in effect => delegate to the user 15 | # * can't get the current module-level override => use an undocumented 16 | # (but non-private!) interface 17 | 18 | previous_level = logging.root.manager.disable 19 | 20 | logging.disable(highest_level) 21 | 22 | try: 23 | yield 24 | finally: 25 | logging.disable(previous_level) -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/qdmr_to_logical_form/utils_.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | DELIMITER = ';' 4 | REF = '#' 5 | 6 | 7 | def parse_decomposition(qdmr): 8 | """Parses the decomposition into an ordered list of steps 9 | 10 | Parameters 11 | ---------- 12 | qdmr : str 13 | String representation of the QDMR 14 | 15 | Returns 16 | ------- 17 | list 18 | returns ordered list of qdmr steps 19 | """ 20 | # parse commas as separate tokens 21 | qdmr = qdmr.replace(",", " , ") 22 | crude_steps = qdmr.split(DELIMITER) 23 | steps = [] 24 | for i in range(len(crude_steps)): 25 | step = crude_steps[i] 26 | tokens = step.split() 27 | step = "" 28 | # remove 'return' prefix 29 | for tok in tokens[1:]: 30 | step += tok.strip() + " " 31 | step = step.strip() 32 | steps += [step] 33 | return steps -------------------------------------------------------------------------------- /src/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import types 2 | import pathlib 3 | from os.path import dirname, isfile, join 4 | import os 5 | import glob 6 | import json 7 | 8 | 9 | modules = {} 10 | modules_list = glob.glob(join(dirname(__file__), "*.py")) 11 | for path in modules_list: 12 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('__main__.py'): 13 | mod_name = pathlib.Path(path).name[:-3] 14 | module = types.ModuleType(mod_name) 15 | with open(path) as f: 16 | module_str = f.read() 17 | exec(module_str, module.__dict__) 18 | modules[mod_name] = module 19 | 20 | dataset_dict = {} 21 | for module_name, module in modules.items(): 22 | for el in dir(module): 23 | if el.endswith("Dataset"): 24 | obj = module.__dict__[el] 25 | dataset_dict[module_name] = obj 26 | 27 | def get_dataset(name): 28 | return dataset_dict[name]() 29 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/base_collapser.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Tuple 3 | 4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies, DependencyType 5 | 6 | 7 | class BaseCollapser(ABC): 8 | def __init__(self): 9 | self.additional_tokens = [] 10 | 11 | """ 12 | Deal with empty spans in SpansDependencies graph 13 | """ 14 | @abstractmethod 15 | def collapse(self, spans_dependencies: SpansDependencies, decomposition: str) -> None: 16 | raise NotImplementedError() 17 | 18 | @abstractmethod 19 | def unwind(self, spans_dependencies: SpansDependencies) -> None: 20 | raise NotImplementedError() 21 | 22 | @staticmethod 23 | def _get_operator(x: str): 24 | return DependencyType(x).get_operator() if DependencyType.has_value(x) else x 
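

# Illustrative sketch only (not part of the original repository): a minimal concrete
# collapser showing the collapse/unwind contract defined above. Real implementations,
# such as ToDependencyTypeCollapser or AddOperatorsPropertiesCollapser elsewhere in
# this codebase, mutate the spans-dependencies graph in place.
class _NoOpCollapser(BaseCollapser):
    def collapse(self, spans_dependencies: SpansDependencies, decomposition: str = None) -> None:
        # Nothing to collapse in this toy example.
        pass

    def unwind(self, spans_dependencies: SpansDependencies) -> None:
        # Nothing to restore.
        pass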
-------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/data_processing/add_extra_tokens.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from os import path 4 | 5 | from qdecomp_with_dependency_graphs.dependencies_graph.create_dependencies_graphs import get_extra_tokens 6 | 7 | 8 | def main(root_dir: str): 9 | files = Path(root_dir).rglob('*_seq2seq.csv') 10 | extra_tokens, _ = get_extra_tokens() 11 | extra_tokens = ' '.join(extra_tokens) 12 | for fp in files: 13 | fp = str(fp) 14 | try: 15 | print(f'process {fp}...') 16 | df = pd.read_csv(fp) 17 | df['question_text'] = df['question_text'].apply(lambda x: f'{x} {extra_tokens}') 18 | dst_fp = path.splitext(fp)[0]+'__extra_tok.csv' 19 | df.to_csv(dst_fp, index=False) 20 | except Exception as ex: 21 | print(f'ERROR: {ex}') 22 | 23 | if __name__ == '__main__': 24 | main(root_dir= 'datasets/Break/QDMR/') -------------------------------------------------------------------------------- /break_evaluator/utils/timeout.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | 4 | import sys 5 | import threading 6 | 7 | 8 | try: 9 | import thread 10 | except ImportError: 11 | import _thread as thread 12 | 13 | 14 | def quit_function(fn_name): 15 | print('{0} took too long'.format(fn_name), file=sys.stderr) 16 | sys.stderr.flush() 17 | # raises KeyboardInterrupt 18 | thread.interrupt_main() 19 | 20 | 21 | def exit_after(s): 22 | """ 23 | use as decorator to exit process if 24 | function takes longer than s seconds 25 | """ 26 | def outer(fn): 27 | def inner(*args, **kwargs): 28 | timer = threading.Timer(s, quit_function, args=[fn.__name__]) 29 | timer.start() 30 | try: 31 | result = fn(*args, **kwargs) 32 | finally: 33 | timer.cancel() 34 | return result 35 | return inner 36 | return outer 37 | 38 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/tests/semantic_parsing_with_constrained_lm/earley/test_agenda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.earley.agenda import Agenda, Meta 5 | 6 | 7 | def test_push_pop(): 8 | a = Agenda() 9 | z = Meta.zero() 10 | assert a.push(3, z) 11 | assert a.push(5, z) 12 | # duplicate should be ignored 13 | assert not a.push(3, z) 14 | assert a.popped == [] 15 | assert a.remaining == [3, 5] 16 | assert a.pop() == 3 17 | assert a.popped == [3] 18 | assert a.remaining == [5] 19 | # duplicate should be ignored 20 | assert not a.push(3, z) 21 | assert a.push(7, z) 22 | 23 | assert a.popped == [3] 24 | assert a.remaining == [5, 7] 25 | 26 | def it(): 27 | while a: 28 | yield a.pop() 29 | 30 | assert list(it()) == [5, 7] 31 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/utils/timeout.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | 4 | import sys 5 | import threading 6 | 7 | 8 | try: 9 | import thread 10 | except ImportError: 11 | import _thread as thread 12 | 13 | 14 | def quit_function(fn_name): 15 | print('{0} took too long'.format(fn_name), file=sys.stderr) 16 | sys.stderr.flush() 17 | # raises KeyboardInterrupt 18 | thread.interrupt_main() 19 | 20 | 21 | def exit_after(s): 22 | """ 23 | use as decorator to exit process if 24 | function takes longer than s seconds 25 | """ 26 | def outer(fn): 27 | def inner(*args, **kwargs): 28 | timer = threading.Timer(s, quit_function, args=[fn.__name__]) 29 | timer.start() 30 | try: 31 | result = fn(*args, **kwargs) 32 | finally: 33 | timer.cancel() 34 | return result 35 | return inner 36 | return outer 37 | 38 | -------------------------------------------------------------------------------- /break_evaluator/tmp/results/decomp_summary.txt: -------------------------------------------------------------------------------- 1 | overall scores: 2 | ged score: mean 0.371 max 0.998 min 0.000 3 | normalized_exact_match score: mean 0.280 max 1.000 min 0.000 4 | skipped 9 examples when computing ged. 
5 | ged normalized_exact_match 6 | dataset 7 | ATIS 0.209 0.417 8 | CLEVR 0.560 0.167 9 | COMQA 0.422 0.111 10 | CWQ 0.375 0.000 11 | DROP 0.237 0.333 12 | GEO 0.000 1.000 13 | NLVR2 0.454 0.190 14 | SPIDER 0.239 0.467 15 | ged normalized_exact_match 16 | num_steps 17 | 2 0.282 0.500 18 | 3 0.274 0.350 19 | 4 0.334 0.300 20 | 5 0.425 0.235 21 | 6 0.425 0.231 22 | 7 0.549 0.100 23 | 8 NaN 0.000 24 | 9 NaN 0.000 25 | 10 0.398 0.000 26 | 11 0.839 0.000 27 | 12 0.000 0.333 28 | 13 0.453 0.500 29 | 20 NaN 0.000 30 | {'GED': 0.3709896995734663, 'norm_EM': 0.28} 31 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/timeout.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | 4 | import sys 5 | import threading 6 | 7 | 8 | try: 9 | import thread 10 | except ImportError: 11 | import _thread as thread 12 | 13 | 14 | def quit_function(fn_name): 15 | print('{0} took too long'.format(fn_name), file=sys.stderr) 16 | sys.stderr.flush() 17 | # raises KeyboardInterrupt 18 | thread.interrupt_main() 19 | 20 | 21 | def exit_after(s): 22 | """ 23 | use as decorator to exit process if 24 | function takes longer than s seconds 25 | """ 26 | def outer(fn): 27 | def inner(*args, **kwargs): 28 | timer = threading.Timer(s, quit_function, args=[fn.__name__]) 29 | timer.start() 30 | try: 31 | result = fn(*args, **kwargs) 32 | finally: 33 | timer.cancel() 34 | return result 35 | return inner 36 | return outer 37 | 38 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/base_tokens_dependencies_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import TokensDependencies, SpansDependencies 5 | 6 | 7 | class BaseTokensDependenciesExtractor(ABC): 8 | @abstractmethod 9 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None, 10 | debug: dict = None) -> TokensDependencies: 11 | raise NotImplementedError() 12 | 13 | def get_extra_tokens(self) -> List[str]: 14 | return [] 15 | 16 | def to_spans_dependencies(self, tokens_dependencies: TokensDependencies, 17 | debug: dict = None) -> SpansDependencies: 18 | # spans dependencies graph 19 | spans_dependencies: SpansDependencies = tokens_dependencies.to_spans_dependencies() 20 | return spans_dependencies 21 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import types 2 | import pathlib 3 | from os.path import dirname, isfile, join 4 | import os 5 | import glob 6 | import json 7 | 8 | modules = {} 9 | modules_list = glob.glob(join(dirname(__file__), "*.py")) 10 | for path in modules_list: 11 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'): 12 | mod_name = pathlib.Path(path).name[:-3] 13 | module = types.ModuleType(mod_name) 14 | with open(path) as f: 15 | module_str = f.read() 16 | exec(module_str, module.__dict__) 17 | modules[mod_name] = module 18 | 19 | task_list = {} 20 | for module_name, module in modules.items(): 21 | for el in 
dir(module): 22 | if el.endswith("BM25Task"): 23 | obj = module.__dict__[el] 24 | task_list[obj.name] = obj 25 | 26 | 27 | class BM25Task: 28 | def __init__(self) -> None: 29 | pass 30 | @classmethod 31 | def from_name(cls,name): 32 | return task_list[name] 33 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import types 2 | import pathlib 3 | from os.path import dirname, isfile, join 4 | import os 5 | import glob 6 | import json 7 | 8 | 9 | modules = {} 10 | modules_list = glob.glob(join(dirname(__file__), "*.py")) 11 | for path in modules_list: 12 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'): 13 | mod_name = pathlib.Path(path).name[:-3] 14 | module = types.ModuleType(mod_name) 15 | with open(path) as f: 16 | module_str = f.read() 17 | exec(module_str, module.__dict__) 18 | modules[mod_name] = module 19 | 20 | task_list = {} 21 | for module_name, module in modules.items(): 22 | for el in dir(module): 23 | if el.endswith("ScorerTask"): 24 | obj = module.__dict__[el] 25 | task_list[obj.name] = obj 26 | 27 | 28 | class ScorerTask: 29 | def __init__(self) -> None: 30 | pass 31 | @classmethod 32 | def from_name(cls,name): 33 | return task_list[name] 34 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "semantic_parsing_with_constrained_lm" 3 | version = "0.1.0" 4 | description = "Tools and instructions for reproducing the experiments in the paper Constrained Language Models Yield Few-Shot Semantic Parsers (EMNLP 2021)." 
5 | authors = ["Microsoft Semantic Machines "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.7" 9 | matplotlib = "^3.1.0" 10 | torch = "1.6.0" 11 | pydantic = "^1.4" 12 | lark-parser = "^0.8.2" 13 | requests = "^2.20.1" 14 | cached-property = "^1.5.1" 15 | typer = "^0.3.0" 16 | jsons = "^0.10.1" 17 | more_itertools = "^8.2.0" 18 | transformers = "4.5.0" 19 | httpx = {version = "^0.16.1", extras = ["http2"]} 20 | datasets = "1.1.3" 21 | appdirs = "^1.4.4" 22 | sm-dataflow = {git = "https://github.com/microsoft/task_oriented_dialogue_as_dataflow_synthesis.git"} 23 | blobfile = "^1.2.5" 24 | 25 | [tool.poetry.dev-dependencies] 26 | pytest = "^4.3.1" 27 | black = "19.10b0" 28 | isort = "4.3.21" 29 | mypy = "0.782" 30 | pylint = "2.6.0" 31 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import types 2 | import pathlib 3 | from os.path import dirname, isfile, join 4 | import os 5 | import glob 6 | import json 7 | 8 | modules = {} 9 | modules_list = glob.glob(join(dirname(__file__), "*.py")) 10 | for path in modules_list: 11 | if isfile(path) and not path.endswith('__init__.py') and not path.endswith('task_.py'): 12 | mod_name = pathlib.Path(path).name[:-3] 13 | module = types.ModuleType(mod_name) 14 | with open(path) as f: 15 | module_str = f.read() 16 | exec(module_str, module.__dict__) 17 | modules[mod_name] = module 18 | 19 | task_list = {} 20 | for module_name, module in modules.items(): 21 | for el in dir(module): 22 | if el.endswith("InferenceTask"): 23 | obj = module.__dict__[el] 24 | task_list[obj.name] = obj 25 | 26 | 27 | class InferenceTask: 28 | def __init__(self) -> None: 29 | pass 30 | @classmethod 31 | def from_name(cls,name): 32 | return task_list[name] 33 | -------------------------------------------------------------------------------- /DPR/eval.sh: -------------------------------------------------------------------------------- 1 | 2 | python generate_dense_embeddings.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 \ 3 | ctx_src=dpr_grail shard_id=0 num_shards=1 out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc 4 | 5 | python dense_retriever.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 qa_dataset=grailqa_train ctx_datatsets=[dpr_grail] \ 6 | encoded_ctx_files=["/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc_*"] out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/dpr_pred_train_c29_lr1.json 7 | python dense_retriever.py model_file=/media/disk1/ohadr/lr1e-5/dpr_biencoder.29 qa_dataset=grailqa_dev ctx_datatsets=[dpr_grail] \ 8 | encoded_ctx_files=["/mnt/netapp7/ohadr/GrailSmBop/DPR/entities_c29_lr1_enc_*"] out_file=/mnt/netapp7/ohadr/GrailSmBop/DPR/dpr_pred_dev_c29_lr1.json 9 | 10 | 11 | python dpr/data/download_data.py --resource data.retriever.qas.trivia-dev 12 | [optional --output_dir {your location}] 13 | 14 | 15 | (grail) ohadr@pc-jonathan-g01:~/GrailSmBop/DPR$ head dpr_pred_dev.json -n 1000 |grep --color=always -e "^" -e true 16 | -------------------------------------------------------------------------------- /easy-elasticsearch/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | readme = fh.read() 6 | 7 | setup( 8 | name="easy-elasticsearch", 9 | version="0.0.9", 10 | author="Kexin Wang", 11 | 
author_email="kexin.wang.2049@gmail.com", 12 | description="An easy-to-use Elasticsearch BM25 interface", 13 | long_description=readme, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/kwang2049/easy-elasticsearch", 16 | project_urls={ 17 | "Bug Tracker": "https://github.com/kwang2049/easy-elasticsearch/issues", 18 | }, 19 | packages=find_packages(), 20 | classifiers=[ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Operating System :: OS Independent", 24 | ], 25 | python_requires=">=3.6", 26 | install_requires=[ 27 | "elasticsearch>=7.9.1", # BeIR requires es==7.9.1 28 | "tqdm", 29 | "requests", 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /configs/inference.yaml: -------------------------------------------------------------------------------- 1 | # cwd: ??? 2 | batch_size: 1 3 | 4 | # model_name: 'google/t5-v1_1-xl' 5 | model_name: "EleutherAI/gpt-neo-2.7B" 6 | #model_name: "EleutherAI/gpt-j-6B" 7 | # model_name: "EleutherAI/gpt-neo-125M" 8 | output_file: ??? 9 | # length_file: ??? 10 | prompt_file: ??? 11 | max_length: 2048 12 | num_prompts: -1 13 | data_num: -1 14 | task_name: ??? 15 | gen: True 16 | order: ascending 17 | #template_idx: ??? 18 | # model_name: 'google/t5-v1_1-small' 19 | dataset_reader: 20 | _target_: src.dataset_readers.few_shot_dsr.FewShotDatasetReader 21 | model_name: ${model_name} 22 | task_name: ${task_name} 23 | # _target_: src.dataset_readers.tasks.break_task.BreakTask 24 | prompt_file: ${prompt_file} 25 | # length_file: ${length_file} 26 | num_prompts: ${num_prompts} 27 | gen: ${gen} 28 | data_num: ${data_num} 29 | order: ${order} 30 | # template_idx: ${template_idx} 31 | 32 | model: 33 | _target_: src.models.model.get_model 34 | # _target_: transformers.AutoModelForCausalLM.from_pretrained 35 | pretrained_model_name_or_path: ${model_name} 36 | 37 | 38 | -------------------------------------------------------------------------------- /break_evaluator/tmp/results/question_decomp_summary.txt: -------------------------------------------------------------------------------- 1 | overall scores: 2 | ged score: mean 0.333 max 1.056 min 0.000 3 | normalized_exact_match score: mean 0.256 max 1.000 min 0.000 4 | skipped 71 examples when computing ged. 
5 | ged normalized_exact_match 6 | dataset 7 | ACADEMIC 0.288 0.400 8 | ATIS 0.233 0.304 9 | CLEVR 0.368 0.311 10 | COMQA 0.218 0.419 11 | CWQ 0.393 0.113 12 | DROP 0.340 0.213 13 | GEO 0.311 0.250 14 | NLVR2 0.350 0.277 15 | SPIDER 0.334 0.158 16 | ged normalized_exact_match 17 | num_steps 18 | 1 0.875 0.000 19 | 2 0.250 0.424 20 | 3 0.273 0.341 21 | 4 0.336 0.204 22 | 5 0.338 0.232 23 | 6 0.432 0.120 24 | 7 0.339 0.257 25 | 8 0.529 0.147 26 | 9 0.437 0.250 27 | 10 0.416 0.273 28 | 11 0.535 0.188 29 | 12 0.166 0.267 30 | 13 0.840 0.000 31 | 14 0.709 0.000 32 | 15 0.000 1.000 33 | 16 NaN 0.000 34 | 18 0.872 0.000 35 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "break-evaluator" 3 | version = "0.1.0" 4 | description = "Forked from https://github.com/allenai/break-evaluator" 5 | authors = [] 6 | packages = [ 7 | { include = "evaluation" }, 8 | { include = "scripts" }, 9 | { include = "utils" }, 10 | ] 11 | 12 | [tool.poetry.dependencies] 13 | python = "~3.7.0" 14 | edit-distance = "1.0.4" 15 | editdistance = "0.5.3" 16 | matplotlib = "3.1.2" 17 | networkx = "2.4" 18 | neuralcoref = "4.0" 19 | overrides = "2.8.0" 20 | pandas = "0.25.3" 21 | lxml = "4.5.0" 22 | progressbar = "2.5" 23 | scipy = "1.4.1" 24 | spacy = "2.1.9" 25 | en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz"} 26 | 27 | [tool.poetry.dev-dependencies] 28 | 29 | [tool.poetry.scripts] 30 | break_evaluate_predictions = "scripts.evaluate_predictions:real_main" 31 | break_qdmr_to_program = "scripts.qdmr_to_program:main" 32 | 33 | [build-system] 34 | requires = ["poetry-core>=1.0.0"] 35 | build-backend = "poetry.core.masonry.api" 36 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/to_sequential_ids_collapser.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from overrides import overrides 3 | 4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies 5 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser 6 | 7 | 8 | class ToSequentialIdsCollapser(BaseCollapser): 9 | @overrides 10 | def collapse(self, spans_dependencies: SpansDependencies, decomposition: str= None) -> None: 11 | pass 12 | 13 | @overrides 14 | def unwind(self, spans_dependencies: SpansDependencies) -> None: 15 | dependencies_graph = spans_dependencies._dependencies_graph 16 | # fix steps ids 17 | relabel_map = {n_id: i for (n_id, i) in 18 | zip(sorted(dependencies_graph.nodes()), range(1, dependencies_graph.number_of_nodes() + 1)) 19 | if n_id != i} 20 | if relabel_map: 21 | nx.relabel_nodes(dependencies_graph, relabel_map, copy=False) 22 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/src/semantic_parsing_with_constrained_lm/trie_partial_parse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
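# Illustrative usage sketch (not in the original source). TriePartialParse constrains
# decoding to the token-id sequences stored in a Trie[int]: allowed_next() returns the
# token ids that may follow the current prefix plus a completion flag, and append(token)
# returns a new parse extended by one token, e.g. (hypothetical trie object):
#   parse = TriePartialParse(trie)
#   allowed_ids, is_complete = parse.allowed_next()
#   parse = parse.append(int(allowed_ids[0]))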
3 | 4 | import dataclasses 5 | from dataclasses import dataclass 6 | from typing import Optional, Tuple 7 | 8 | import torch 9 | 10 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.util.trie import Trie 11 | from semantic_parsing_with_constrained_lm.src.semantic_parsing_with_constrained_lm.search import PartialParse 12 | 13 | 14 | @dataclass 15 | class TriePartialParse(PartialParse): 16 | trie: Trie[int] 17 | tokens: Tuple[int, ...] = () 18 | 19 | def allowed_next( 20 | self, ordered_ids: Optional[torch.Tensor] = None, top_k: Optional[int] = None 21 | ) -> Tuple[torch.Tensor, bool]: 22 | allowed, is_complete = self.trie.prefix_next(self.tokens) 23 | return torch.tensor(sorted(allowed), dtype=torch.long), is_complete 24 | 25 | def append(self, token: int) -> "PartialParse": 26 | """Return a new PartialParse creatoted by appending this token.""" 27 | return dataclasses.replace(self, tokens=self.tokens + (token,)) 28 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/tokens_dependencies_extractors/collapsers/add_operator_properties_collapser.py: -------------------------------------------------------------------------------- 1 | from overrides import overrides 2 | import re 3 | 4 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import SpansDependencies 5 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.tokens_dependencies_extractors.collapsers.base_collapser import BaseCollapser 6 | 7 | 8 | class AddOperatorsPropertiesCollapser(BaseCollapser): 9 | @overrides 10 | def collapse(self, spans_dependencies: SpansDependencies, decomposition: str= None) -> None: 11 | for _, _, data in spans_dependencies.dependencies(): 12 | if data.properties: 13 | data.dep_type = f'{data.dep_type}[{",".join(data.properties)}]' 14 | 15 | @overrides 16 | def unwind(self, spans_dependencies: SpansDependencies) -> None: 17 | for _, _, data in spans_dependencies.dependencies(): 18 | regx = re.search(r'(.*)\[(.+)\]', data.dep_type) 19 | if regx: 20 | dep, prop = regx.groups() 21 | data.dep_type = dep 22 | data.properties = prop.split(",") 23 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/simple_seq2seq_custom.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | from overrides import overrides 3 | 4 | import torch 5 | from allennlp.data import TextFieldTensors 6 | from allennlp.models import Model 7 | from allennlp.nn import InitializerApplicator 8 | from allennlp_models.generation.models import SimpleSeq2Seq 9 | 10 | 11 | @Model.register("simple_seq2seq_custom") 12 | class SimpleSeq2SeqCustom(SimpleSeq2Seq): 13 | def __init__(self, 14 | initializer: InitializerApplicator = InitializerApplicator(), 15 | **kwargs): 16 | super().__init__(**kwargs) 17 | initializer(self) 18 | 19 | @overrides 20 | def forward( 21 | self, 22 | source_tokens: TextFieldTensors, 23 | target_tokens: TextFieldTensors = None, 24 | metadata: List[Dict[str, Any]] = None, 25 | **kwargs # skip extra fields 26 | ) -> Dict[str, torch.Tensor]: 27 | output_dict = super().forward(source_tokens=source_tokens, target_tokens=target_tokens) 28 | if metadata: 29 | output_dict['metadata'] = metadata 30 | return output_dict 31 | 32 | -------------------------------------------------------------------------------- 
/semantic_parsing_with_constrained_lm/third_party/break-evaluator/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 AI2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/e2e.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re, os 3 | import json 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class E2eScorerTask: 8 | name = "e2e" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_E2E") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "Sentence: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Table: " 24 | answer_prefix = "Sentence: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/go.py: -------------------------------------------------------------------------------- 1 | 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import re 4 | import json, os 5 | from src.utils.dataset_utils import load_train_dataset 6 | 7 | 8 | class GoScorerTask: 9 | name = "go" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_Go") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Comment: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "Code: " 25 | 
answer_prefix = "Comment: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['target'] 29 | test_decomposition = entry['test_target'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/java.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class JavaScorerTask: 8 | name = "java" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_Java") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "Comment: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Code: " 24 | answer_prefix = "Comment: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/pubmed.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class PubmedScorerTask: 8 | name = "pubmed" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_PubMed") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "TL;DR: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "" 24 | answer_prefix = "TL;DR: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/reddit.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class RedditScorerTask: 8 | name = "reddit" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_Reddit") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, 
size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "TL;DR: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "" 24 | answer_prefix = "TL;DR: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/dart.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re, os 3 | import json 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class DartScorerTask: 8 | name = "dart" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_DART") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "Sentence: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Table: " 24 | answer_prefix = "Sentence: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/php.py: -------------------------------------------------------------------------------- 1 | 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import re 4 | import json, os 5 | from src.utils.dataset_utils import load_train_dataset 6 | 7 | 8 | class PhpScorerTask: 9 | name = "php" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_PHP") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Comment: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "Code: " 25 | answer_prefix = "Comment: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['target'] 29 | test_decomposition = entry['test_target'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/python.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import 
load_train_dataset 5 | 6 | 7 | class PythonScorerTask: 8 | name = "python" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_Python") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "Comment: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Code: " 24 | answer_prefix = "Comment: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/predictors/seq2seq/simple_seq2seq_dynamic_predictor.py: -------------------------------------------------------------------------------- 1 | """ 2 | based on allennlp_models/generation/predictors/seq2seq.py 3 | tag: v1.1.0 4 | """ 5 | 6 | from overrides import overrides 7 | 8 | from allennlp.common.util import JsonDict 9 | from allennlp.data import Instance 10 | from allennlp.predictors.predictor import Predictor 11 | 12 | 13 | @Predictor.register('seq2seq_dynamic') 14 | class Seq2SeqDynamicPredictor(Predictor): 15 | """ 16 | Predictor for sequence to sequence models, including 17 | [`SimpleSeq2SeqDynamic`](../models/seq2seq/simple_seq2seq_dynamic.md) and 18 | [`CopyNetSeq2SeqDynamic`](../models/seq2seq/copynet_seq2seq_dynamic.md). 19 | """ 20 | 21 | def predict(self, source: str, allowed_tokens: str) -> JsonDict: 22 | return self.predict_json({"source": source, "allowed_tokens": allowed_tokens}) 23 | 24 | @overrides 25 | def _json_to_instance(self, json_dict: JsonDict) -> Instance: 26 | """ 27 | Expects JSON that looks like `{"source": "..."}`. 28 | """ 29 | source = json_dict["source"] 30 | allowed_tokens = json_dict["allowed_tokens"] 31 | return self._dataset_reader.text_to_instance(source, allowed_tokens) 32 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cnndailymail.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class CNNDailyMailScorerTask: 8 | name = "cnndailymail" 9 | question_field = "article" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_CNNDailyMail") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "TL;DR: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "" 24 | answer_prefix = "TL;DR: " 25 | test_question = question_prefix + entry['test_article'] 26 | question = question_prefix + entry['article'] 27 | decomposition = answer_prefix + entry['highlights'] 28 | test_decomposition = entry['test_highlights'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/copa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class CopaScorerTask: 9 | name = "copa" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_COPA") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Answer: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "Answer: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['label'] 29 | test_decomposition = entry['test_label'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cr.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class CRScorerTask: 9 | name = "cr" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = 
load_dataset("KaiLv/UDR_CR") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('cr', 1, entry['label']) 29 | test_decomposition = get_one_prompt('cr', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cs_valid.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class CSValidScorerTask: 9 | name = "cs_valid" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_ComV") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Answer: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "Answer: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['label'] 29 | test_decomposition = entry['test_label'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/mr.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class MRScorerTask: 9 | name = "mr" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_MR") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('mr', 1, entry['label']) 29 | test_decomposition = get_one_prompt('mr', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cs_explan.py: 
-------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class CSExplanScorerTask: 9 | name = "cs_explan" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_ComE") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Answer: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "Answer: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['label'] 29 | test_decomposition = entry['test_label'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/rte.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class RTEScorerTask: 9 | name = "rte" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_RTE") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('rte', 0, entry['label']) 29 | test_decomposition = get_one_prompt('rte', 0, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cola.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class ColaScorerTask: 9 | name = "cola" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_COLA") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | 
question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('cola', 1, entry['label']) 29 | test_decomposition = get_one_prompt('cola', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/cosmos_qa.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class CosmosQaScorerTask: 9 | name = "cosmos_qa" 10 | question_field = "question" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_CosmosQA") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "Answer: " 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "Answer: " 26 | test_question = question_prefix + entry['test_question'] 27 | question = question_prefix + entry['question'] 28 | decomposition = answer_prefix + entry['label'] 29 | test_decomposition = entry['test_label'] 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/mnli.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class MNLIScorerTask: 9 | name = "mnli" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_MNLI") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('mnli', 0, entry['label']) 29 | test_decomposition = get_one_prompt('mnli', 0, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/snli.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class SNLIScorerTask: 9 | name 
= "snli" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_SNLI") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('snli', 0, entry['label']) 29 | test_decomposition = get_one_prompt('snli', 0, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/subj.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class SubjScorerTask: 9 | name = "subj" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_Subj") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('subj', 2, entry['label']) 29 | test_decomposition = get_one_prompt('subj', 2, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/trec.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class TrecScorerTask: 9 | name = "trec" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_TREC") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('trec', 3, entry['label']) 29 | test_decomposition = get_one_prompt('trec', 3, entry['test_label']) 30 | return question, decomposition, test_question, 
test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/sst2.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class SST2ScorerTask: 9 | name = "sst2" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_SST-2") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('sst2', 1, entry['label']) 29 | test_decomposition = get_one_prompt('sst2', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/sst5.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class SST5ScorerTask: 9 | name = "sst5" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_SST-5") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('sst5', 1, entry['label']) 29 | test_decomposition = get_one_prompt('sst5', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /DPR/conf/datasets/retriever_default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | qa_epr: 3 | _target_: dpr.data.retriever_data.EPRQASrc 4 | dataset_split: ??? 5 | task_name: ??? 
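# Note: '???' is OmegaConf's mandatory-missing marker, so reading qa_epr.dataset_split or
# qa_epr.task_name without an override raises omegaconf.MissingMandatoryValue.
# A hypothetical Hydra command-line override might look like the sketch below; the
# entry-point script name and the qa_dataset key are assumptions about how this config
# group is composed, not taken from this file:
#   python dense_retriever.py qa_dataset=qa_epr \
#     datasets.qa_epr.dataset_split=test datasets.qa_epr.task_name=mtop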
6 | 7 | nq_test: 8 | _target_: dpr.data.retriever_data.CsvQASrc 9 | file: data.retriever.qas.nq-test 10 | 11 | nq_train: 12 | _target_: dpr.data.retriever_data.CsvQASrc 13 | file: data.retriever.qas.nq-train 14 | 15 | nq_dev: 16 | _target_: dpr.data.retriever_data.CsvQASrc 17 | file: data.retriever.qas.nq-dev 18 | 19 | trivia_test: 20 | _target_: dpr.data.retriever_data.CsvQASrc 21 | file: data.retriever.qas.trivia-test 22 | 23 | trivia_train: 24 | _target_: dpr.data.retriever_data.CsvQASrc 25 | file: data.retriever.qas.trivia-train 26 | 27 | trivia_dev: 28 | _target_: dpr.data.retriever_data.CsvQASrc 29 | file: data.retriever.qas.trivia-dev 30 | 31 | grailqa_dev: 32 | _target_: dpr.data.retriever_data.CsvQASrc 33 | file: data.retriever.qas.grailqa-dev 34 | 35 | grailqa_train: 36 | _target_: dpr.data.retriever_data.CsvQASrc 37 | file: data.retriever.qas.grailqa-train 38 | 39 | webq_test: 40 | _target_: dpr.data.retriever_data.CsvQASrc 41 | file: data.retriever.qas.webq-test 42 | 43 | curatedtrec_test: 44 | _target_: dpr.data.retriever_data.CsvQASrc 45 | file: data.retriever.qas.curatedtrec-test 46 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/agnews.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class AgnewsScorerTask: 9 | name = "agnews" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_AGNews") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('agnews', 0, entry['label']) 29 | test_decomposition = get_one_prompt('agnews', 0, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/amazon.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class AmazonScorerTask: 9 | name = "amazon" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_Amazon") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + 
entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('amazon', 1, entry['label']) 29 | test_decomposition = get_one_prompt('amazon', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/yahoo.py: -------------------------------------------------------------------------------- 1 | 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import re 4 | import json, os 5 | from src.utils.dataset_utils import load_train_dataset 6 | from Channel_LM_Prompting.util import get_one_prompt 7 | 8 | 9 | class YahooScorerTask: 10 | name = "yahoo" 11 | question_field = "sentence" 12 | prompt_field = "ctxs" 13 | 14 | def __init__(self, example_file, ds_size=None) -> None: 15 | dataset = load_dataset("KaiLv/UDR_Yahoo") 16 | 17 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 18 | self.training_dataset = list(enumerate(self.hf_dataset)) 19 | self.example_file = example_file 20 | with open(self.example_file) as f: 21 | self.data = json.load(f) 22 | self.postfix = "" 23 | 24 | def get_fields(self, entry, index=-1): 25 | question_prefix = "" 26 | answer_prefix = "" 27 | test_question = question_prefix + entry['test_sentence'] 28 | question = question_prefix + entry['sentence'] 29 | decomposition = get_one_prompt('yahoo', 0, entry['label']) 30 | test_decomposition = get_one_prompt('yahoo', 0, entry['test_label']) 31 | return question, decomposition, test_question, test_decomposition 32 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/roc_ending_generation.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class RocEndingGenerationScorerTask: 8 | name = "roc_ending_generation" 9 | question_field = "question" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_RocEnding") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "End of the story: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Beginning of the story: " 24 | answer_prefix = "End of the story: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/roc_story_generation.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class RocStoryGenerationScorerTask: 8 | name = "roc_story_generation" 9 | question_field = "question" 10 | 
prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_RocStory") 14 | 15 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 16 | self.training_dataset = list(enumerate(self.hf_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | self.postfix = "Rest of the story: " 21 | 22 | def get_fields(self, entry, index=-1): 23 | question_prefix = "Beginning of the story: " 24 | answer_prefix = "Rest of the story: " 25 | test_question = question_prefix + entry['test_question'] 26 | question = question_prefix + entry['question'] 27 | decomposition = answer_prefix + entry['target'] 28 | test_decomposition = entry['test_target'] 29 | return question, decomposition, test_question, test_decomposition 30 | -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/NOTICE.md: -------------------------------------------------------------------------------- 1 | # NOTICES 2 | 3 | This repository incorporates material as listed below or described in the code. 4 | 5 | ## break-evaluator 6 | 7 | MIT License 8 | 9 | Copyright (c) 2021 AI2 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 
28 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/dbpedia.py: -------------------------------------------------------------------------------- 1 | 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import re 4 | import json, os 5 | from src.utils.dataset_utils import load_train_dataset 6 | from Channel_LM_Prompting.util import get_one_prompt 7 | 8 | 9 | class DbpediaScorerTask: 10 | name = "dbpedia" 11 | question_field = "sentence" 12 | prompt_field = "ctxs" 13 | 14 | def __init__(self, example_file, ds_size=None) -> None: 15 | dataset = load_dataset("KaiLv/UDR_DBPedia") 16 | 17 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 18 | self.training_dataset = list(enumerate(self.hf_dataset)) 19 | self.example_file = example_file 20 | with open(self.example_file) as f: 21 | self.data = json.load(f) 22 | self.postfix = "" 23 | 24 | def get_fields(self, entry, index=-1): 25 | question_prefix = "" 26 | answer_prefix = "" 27 | test_question = question_prefix + entry['test_sentence'] 28 | question = question_prefix + entry['sentence'] 29 | decomposition = get_one_prompt('dbpedia', 0, entry['label']) 30 | test_decomposition = get_one_prompt('dbpedia', 0, entry['test_label']) 31 | return question, decomposition, test_question, test_decomposition 32 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/yelp_full.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json, os 4 | from src.utils.dataset_utils import load_train_dataset 5 | from Channel_LM_Prompting.util import get_one_prompt 6 | 7 | 8 | class YelpFullScorerTask: 9 | name = "yelp_full" 10 | question_field = "sentence" 11 | prompt_field = "ctxs" 12 | 13 | def __init__(self, example_file, ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_Yelp") 15 | 16 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 17 | self.training_dataset = list(enumerate(self.hf_dataset)) 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | self.postfix = "" 22 | 23 | def get_fields(self, entry, index=-1): 24 | question_prefix = "" 25 | answer_prefix = "" 26 | test_question = question_prefix + entry['test_sentence'] 27 | question = question_prefix + entry['sentence'] 28 | decomposition = get_one_prompt('yelp_full', 1, entry['label']) 29 | test_decomposition = get_one_prompt('yelp_full', 1, entry['test_label']) 30 | return question, decomposition, test_question, test_decomposition 31 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/mtop.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_from_disk 2 | from src.utils.dataset_utils import load_train_dataset 3 | 4 | import re, os 5 | import json 6 | 7 | 8 | class MtopScorerTask: 9 | name = "mtop" 10 | prompt_field = "ctxs" 11 | question_field = "question" 12 | def __init__(self,example_file,ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_MTOP") 14 | self.hf_dataset = load_train_dataset(dataset,size=ds_size) 15 | self.training_dataset = list(enumerate(self.hf_dataset)) 16 | 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = 
json.load(f) 20 | 21 | 22 | def get_fields(self, entry,index=-1): 23 | test_question = entry['test_question'] 24 | question = entry['question'] 25 | logical_form = entry['logical_form'] 26 | test_logical_form = entry['test_logical_form'] 27 | return question,logical_form,test_question,test_logical_form 28 | 29 | 30 | @classmethod 31 | def remove_double_space(cls,string): 32 | return re.sub("[ ]{2,}", " ", string) 33 | @classmethod 34 | def reformat(cls,text): 35 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))]) 36 | -------------------------------------------------------------------------------- /DPR/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from setuptools import setup 9 | 10 | with open("README.md") as f: 11 | readme = f.read() 12 | 13 | setup( 14 | name="dpr", 15 | version="1.0.0", 16 | description="Facebook AI Research Open Domain Q&A Toolkit", 17 | url="https://github.com/facebookresearch/DPR/", 18 | classifiers=[ 19 | "Intended Audience :: Science/Research", 20 | "License :: OSI Approved :: MIT License", 21 | "Programming Language :: Python :: 3.6", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | ], 24 | long_description=readme, 25 | long_description_content_type="text/markdown", 26 | setup_requires=[ 27 | "setuptools>=18.0", 28 | ], 29 | install_requires=[ 30 | "faiss-cpu>=1.6.1", 31 | "jsonlines", 32 | "filelock", 33 | "numpy", 34 | "regex", 35 | "torch>=1.5.0", 36 | "transformers>=3.0.0,<3.1.0", 37 | "tqdm>=4.27", 38 | "wget", 39 | "spacy>=2.1.8", 40 | "hydra-core>=1.0.0", 41 | "omegaconf>=2.0.1", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /DPR/conf/biencoder_train_cfg.yaml: -------------------------------------------------------------------------------- 1 | 2 | # configuration groups 3 | defaults: 4 | - encoder: hf_bert 5 | - train: biencoder_default 6 | - datasets: encoder_train_default 7 | 8 | train_datasets: 9 | dev_datasets: 10 | output_dir: 11 | train_sampling_rates: 12 | loss_scale_factors: 13 | loss_type: 14 | rank_loss_factor: 15 | rank_loss_top_sample: 16 | 17 | # Whether to lower case the input text. Set True for uncased models, False for the cased ones. 18 | do_lower_case: True 19 | 20 | fix_ctx_encoder: False 21 | val_av_rank_start_epoch: 30 22 | seed: 12345 23 | checkpoint_file_name: dpr_biencoder 24 | 25 | # A trained bi-encoder checkpoint file to initialize the model 26 | model_file: 27 | 28 | # TODO: move to a conf group 29 | # local_rank for distributed training on gpus 30 | local_rank: -1 31 | global_loss_buf_sz: 592000 32 | device: 33 | distributed_world_size: 34 | distributed_port: 35 | no_cuda: False 36 | n_gpu: 37 | fp16: False 38 | 39 | # For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
40 | # "See details at https://nvidia.github.io/apex/amp.html 41 | fp16_opt_level: O1 42 | 43 | # tokens which won't be slit by tokenizer 44 | special_tokens: 45 | 46 | ignore_checkpoint_offset: False 47 | ignore_checkpoint_optimizer: False 48 | 49 | # set to >1 to enable multiple query encoders 50 | multi_q_encoder: False 51 | -------------------------------------------------------------------------------- /DPR/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DPR 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | TBD 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `master`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | ## Coding Style 29 | * 2 spaces for indentation rather than tabs 30 | * 120 character line length 31 | * ... 32 | 33 | ## License 34 | By contributing to Facebook AI Research Dense Passage Retriever toolkit, you agree that your contributions will be licensed 35 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /DPR/conf/gen_embs.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - encoder: hf_bert 3 | - ctx_sources: default_sources 4 | 5 | # A trained bi-encoder checkpoint file to initialize the model 6 | model_file: 7 | 8 | # Name of the all-passages resource 9 | ctx_src: 10 | 11 | # which (ctx or query) encoder to be used for embedding generation 12 | encoder_type: ctx 13 | 14 | # output .tsv file path to write results to 15 | out_file: 16 | 17 | # Whether to lower case the input text. Set True for uncased models, False for the cased ones. 18 | do_lower_case: True 19 | 20 | # Number(0-based) of data shard to process 21 | shard_id: 0 22 | 23 | # Total amount of data shards 24 | num_shards: 1 25 | 26 | # Batch size for the passage encoder forward pass (works in DataParallel mode) 27 | batch_size: 512 28 | 29 | tables_as_passages: False 30 | 31 | # tokens which won't be slit by tokenizer 32 | special_tokens: 33 | 34 | tables_chunk_sz: 100 35 | 36 | # TODO 37 | tables_split_type: type1 38 | 39 | 40 | # TODO: move to a conf group 41 | # local_rank for distributed training on gpus 42 | local_rank: -1 43 | device: 44 | distributed_world_size: 45 | distributed_port: 46 | no_cuda: False 47 | n_gpu: 48 | fp16: False 49 | 50 | # For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
51 | # "See details at https://nvidia.github.io/apex/amp.html 52 | fp16_opt_level: O1 -------------------------------------------------------------------------------- /DPR/dpr/utils/conf_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import hydra 4 | from omegaconf import DictConfig 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class BiencoderDatasetsCfg(object): 10 | def __init__(self, cfg: DictConfig): 11 | # print('cfg in BiencoderDatasetsCfg') 12 | # for k,v in cfg.items(): 13 | # logger.info('{}:{}'.format(k,v)) 14 | # exit() 15 | datasets = cfg.datasets 16 | self.train_datasets_names = cfg.train_datasets 17 | logger.info("train_datasets: %s", self.train_datasets_names) 18 | # print(datasets) 19 | if self.train_datasets_names: 20 | self.train_datasets = [ 21 | ] 22 | for ds_name in self.train_datasets_names: 23 | datasets[ds_name]['loss_type'] = cfg.loss_type 24 | tmp_dataset = hydra.utils.instantiate(datasets[ds_name]) 25 | self.train_datasets.append(tmp_dataset) 26 | 27 | else: 28 | self.train_datasets = [] 29 | if cfg.dev_datasets: 30 | self.dev_datasets_names = cfg.dev_datasets 31 | logger.info("dev_datasets: %s", self.dev_datasets_names) 32 | self.dev_datasets = [ 33 | hydra.utils.instantiate(datasets[ds_name]) 34 | for ds_name in self.dev_datasets_names 35 | ] 36 | self.sampling_rates = cfg.train_sampling_rates 37 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/tune/studies/biaffine-graph-parser--transformer-encoder.py: -------------------------------------------------------------------------------- 1 | import optuna 2 | import os 3 | 4 | from optuna.study import StudyDirection 5 | 6 | 7 | def get_experiment(): 8 | config_file = "scripts/tune/experiments/biaffine-graph-parser--transformer-encoder.jsonnet" 9 | metrics = "best_validation_logical_form_em" 10 | direction = StudyDirection.MAXIMIZE 11 | return config_file, metrics, direction 12 | 13 | 14 | def get_constants(): 15 | return [ 16 | ("transformer_model", "bert-base-uncased"), 17 | ("max_length", 128), 18 | ("transformer_dim", 768), 19 | 20 | ("arc_tags_only", "false"), 21 | ("multi_label", "false"), 22 | 23 | ("pos_embedding_dim", 100), 24 | ("tag_representation_dim", 100) 25 | ] 26 | 27 | 28 | def set_parameters(trial: optuna.Trial): 29 | # hyper parameters 30 | trial.suggest_float("input_dropout", 0.0, 0.8, step=0.1) 31 | trial.suggest_float("dropout", 0.0, 0.8, step=0.1) 32 | trial.suggest_int("arc_representation_dim", 300, 700, step=100) 33 | trial.suggest_int("arc_num_layers", 1, 3) 34 | trial.suggest_int("tag_num_layers", 1, 3) 35 | 36 | trial.suggest_categorical("lr", [1e-4, 1e-3, 1e-2, 1e-1]) 37 | # trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5]) 38 | trial.suggest_categorical("transformer_lr", [3e-5, 5e-5, 7e-5]) 39 | trial.suggest_categorical("seed", [24, 42, 64]) 40 | -------------------------------------------------------------------------------- /scripts/find_bm25.sh: -------------------------------------------------------------------------------- 1 | # before run this script, you should: 2 | # 1. download elasticsearch-7.9.1 3 | # 2. 
run ES_JAVA_OPTS="-Xms31g -Xmx31g" ./elasticsearch-7.9.1/bin/elasticsearch to start elasticsearch 4 | datasets_full=("agnews" "amazon" "break" "cola" "common_gen" \ 5 | "copa" "cosmos_qa" "cr" "cs_explan" "cs_valid" "dart" "dbpedia" \ 6 | "e2e" "mr" "mtop" "pubmed" "reddit" "roc_ending_generation" "roc_story_generation" \ 7 | "rte" "smcalflow" "sst2" "sst5" "subj" "trec" "yahoo" "yelp_full") 8 | datasets_sampled=("cnndailymail" "go" "java" "mnli" "php" "python" "snli" "wikiauto") 9 | 10 | if [ ! -d "$PWD/data_bm25" ]; then 11 | mkdir "$PWD/data_bm25" 12 | fi 13 | 14 | for train_set in "train" "debug"; do 15 | if [ "$train_set" == "train" ]; then 16 | datasets=("${datasets_full[@]}") 17 | else 18 | datasets=("${datasets_sampled[@]}") 19 | fi 20 | 21 | for dataset in "${datasets[@]}"; do 22 | find_bm25_py_output_path="$PWD/data_bm25/${dataset}_${train_set}.json" 23 | echo -e "\n\n-find_bm25 ${dataset}-\n\n" 24 | if [ ! -f "${find_bm25_py_output_path}" ]; then 25 | HYDRA_FULL_ERROR=1 \ 26 | python find_bm25_es.py \ 27 | output_path="$find_bm25_py_output_path" \ 28 | dataset_split=${train_set} \ 29 | setup_type="a" \ 30 | task_name=${dataset} \ 31 | +ds_size=null \ 32 | L=50 \ 33 | hydra.run.dir="$PWD/exps/find_bm25/${dataset}/logs" 34 | fi 35 | done 36 | done -------------------------------------------------------------------------------- /scripts/score_bm25.sh: -------------------------------------------------------------------------------- 1 | main_process_port=$((RANDOM % 5001 + 25000)) 2 | cvd=0,1,2,3,4,5,6,7 3 | num_gpus=8 4 | 5 | datasets_full=("agnews" "amazon" "break" "cola" "common_gen" \ 6 | "copa" "cosmos_qa" "cr" "cs_explan" "cs_valid" "dart" "dbpedia" \ 7 | "e2e" "mr" "mtop" "pubmed" "reddit" "roc_ending_generation" "roc_story_generation" \ 8 | "rte" "smcalflow" "sst2" "sst5" "subj" "trec" "yahoo" "yelp_full") 9 | datasets_sampled=("cnndailymail" "go" "java" "mnli" "php" "python" "snli" "wikiauto") 10 | 11 | if [ ! -d "$PWD/data_score" ]; then 12 | mkdir "$PWD/data_score" 13 | fi 14 | 15 | for train_set in "train" "debug"; do 16 | if [ "$train_set" == "train" ]; then 17 | datasets=("${datasets_full[@]}") 18 | else 19 | datasets=("${datasets_sampled[@]}") 20 | fi 21 | 22 | for dataset in "${datasets[@]}"; do 23 | echo -e "\n\n-------score ${dataset}-------\n\n" 24 | scorer_py_output_path="$PWD/data_score/${dataset}_bm25.json" 25 | if [ ! 
-f "$scorer_py_output_path" ]; then 26 | CUDA_VISIBLE_DEVICES=$cvd \ 27 | accelerate launch --num_processes $num_gpus --main_process_port ${main_process_port} --multi_gpu \ 28 | scorer.py \ 29 | example_file="$PWD/data_bm25/${dataset}_${train_set}.json" \ 30 | setup_type=qa \ 31 | output_file="$scorer_py_output_path" \ 32 | batch_size=20 \ 33 | +task_name=$dataset +dataset_reader.ds_size=null \ 34 | hydra.run.dir="$PWD/exps/score_bm25/${dataset}/logs" 35 | fi 36 | done 37 | done -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/common_gen.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class CommonGenScorerTask: 8 | name = "common_gen" 9 | question_field = "joined_concepts" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | dataset = load_dataset("KaiLv/UDR_CommonGen") 14 | 15 | # if 'q' in example_file.split('/')[-1]: 16 | # self.hf_dataset = dataset['train_dedup'] 17 | # elif 'a' in example_file.split('/')[-1]: 18 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 19 | 20 | self.training_dataset = list(enumerate(self.hf_dataset)) 21 | self.example_file = example_file 22 | with open(self.example_file) as f: 23 | self.data = json.load(f) 24 | self.postfix = "Generated sentence: " 25 | 26 | def get_fields(self, entry, index=-1): 27 | question_prefix = "Generate a sentence using these concepts: " 28 | answer_prefix = "Generated sentence: " 29 | test_question = question_prefix + entry['test_joined_concepts'] 30 | question = question_prefix + entry['joined_concepts'] 31 | decomposition = answer_prefix + entry['target'] 32 | test_decomposition = entry['test_target'] 33 | return question, decomposition, test_question, test_decomposition 34 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/models/seq2seq/custom_copynet_seq2seq_for_soft_rat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based on custom_copynet_seq2seq.py 3 | """ 4 | import logging 5 | from typing import Dict 6 | 7 | from overrides import overrides 8 | import torch 9 | 10 | from allennlp.models.model import Model 11 | from allennlp.nn import util 12 | 13 | from qdecomp_nlp.models.seq2seq.custom_copynet_seq2seq import CustomCopyNetSeq2Seq 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @Model.register("custom_copynet_seq2seq_for_soft_rat") 19 | class CustomCopyNetForRatSeq2Seq(CustomCopyNetSeq2Seq): 20 | @overrides 21 | def _encode(self, source_tokens: Dict[str, torch.Tensor], relations_probs: torch.Tensor) -> Dict[str, torch.Tensor]: 22 | """ 23 | Encode source input sentences. 
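In addition to ``source_tokens``, this override receives ``relations_probs`` (presumably soft relation scores between pairs of source tokens for the relation-aware encoder) and forwards it to ``self._encoder`` together with a pairwise ``relations_mask`` built as the outer product of ``source_mask`` with itself.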
24 | """ 25 | # shape: (batch_size, source_sequence_length, encoder_input_dim) 26 | embedded_input = self._source_embedder(source_tokens) 27 | # shape: (batch_size, source_sequence_length) 28 | source_mask = util.get_text_field_mask(source_tokens) 29 | # shape: (batch_size, source_sequence_length, source_sequence_length) 30 | relations_mask = source_mask.unsqueeze(-1)*source_mask.unsqueeze(-2) 31 | # shape: (batch_size, source_sequence_length, encoder_output_dim) 32 | encoder_outputs = self._encoder(embedded_input, relations_probs, relations_mask) 33 | return {"source_mask": source_mask, "encoder_outputs": encoder_outputs} 34 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/break.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_from_disk 2 | import re, os 3 | import json 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class BreakScorerTask: 8 | name = "break" 9 | question_field = "question_text" 10 | dataset_name = "break_data" 11 | split = "QDMR" 12 | prompt_field = "near_examples" 13 | def __init__(self,example_file,ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_BREAK") 15 | self.orig_training_dataset = load_train_dataset(dataset,size=ds_size) 16 | self.training_dataset = list(enumerate(self.orig_training_dataset)) 17 | self.example_file = example_file 18 | with open(self.example_file) as f: 19 | self.data = json.load(f) 20 | 21 | 22 | def get_fields(self, entry,index=-1): 23 | test_question = self.remove_double_space(entry['test_question_text']) 24 | question = self.remove_double_space(entry['question_text']) 25 | decomposition = self.remove_double_space(self.reformat(entry['decomposition'])) 26 | test_decomposition = self.remove_double_space(self.reformat(entry['test_decomposition'])) 27 | return question,decomposition,test_question,test_decomposition 28 | 29 | @classmethod 30 | def remove_double_space(cls,string): 31 | return re.sub("[ ]{2,}", " ", string) 32 | @classmethod 33 | def reformat(cls,text): 34 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))]) 35 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/base_steps_spans_extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | from typing import List, Tuple 4 | 5 | import spacy 6 | from spacy.tokens.doc import Doc 7 | 8 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans 9 | 10 | 11 | class BaseSpansExtractor(ABC): 12 | def __init__(self, tokens_parser=None): 13 | self._parser = tokens_parser or spacy.load("en_core_web_sm") 14 | 15 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None, 16 | debug: dict = None) -> StepsSpans: 17 | def format(text: str): 18 | return re.sub(r'\s+', ' ', text) 19 | 20 | question_tokens = self._parser(format(question)) 21 | decomposition = re.sub(r'#(\d+)', '@@\g<1>@@', decomposition) 22 | steps_tokens = [self._parser(' '.join(format(x).split(' ')[1:])) for x in decomposition.split(';')] 23 | steps_operators = operators and [QDMROperation(x) for x in operators] 24 | return self._extract(question_id=question_id, question_tokens=question_tokens, 25 | steps_tokens=steps_tokens, 
steps_operators=steps_operators) 26 | 27 | @abstractmethod 28 | def _extract(self, question_id: str, question_tokens: Doc, steps_tokens: List[Doc], steps_operators: List[QDMROperation] = None, 29 | debug: dict = None) -> StepsSpans: 30 | raise NotImplementedError() -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/qdecomp_nlp/data/dataset_readers/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, List 2 | import csv 3 | 4 | from allennlp.common.checks import ConfigurationError 5 | from allennlp.common.file_utils import cached_path 6 | from allennlp.data.fields import TextField, MetadataField, NamespaceSwappingField 7 | from allennlp.data.instance import Instance 8 | 9 | 10 | def read_break_data(file_path: str, delimiter: str, 11 | text_to_instance: Callable[..., Instance], 12 | args_columns: List[str], 13 | metadata_columns: List[str] = ['question_id'], 14 | quoting: int = csv.QUOTE_MINIMAL): 15 | with open(cached_path(file_path), "r") as data_file: 16 | lines = csv.reader(data_file, delimiter=delimiter, quoting=quoting) 17 | header = next(lines, None) 18 | header_to_index = {x: i for i, x in enumerate(header)} 19 | for line_num, row in enumerate(lines): 20 | if len(row) != len(header): 21 | raise ConfigurationError( 22 | "Invalid line format: %s (line number %d)" % (row, line_num + 1) 23 | ) 24 | 25 | instance = text_to_instance(*[row[header_to_index[x]] for x in args_columns if x in header_to_index]) 26 | metadata = {x: row[header_to_index[x]] for x in metadata_columns} 27 | if 'metadata' in instance: 28 | metadata.update(instance['metadata']) 29 | instance.add_field('metadata', MetadataField(metadata)) 30 | yield instance 31 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/tune/studies/operators-aware-biaffine-graph-parser--transformer-encoder.py: -------------------------------------------------------------------------------- 1 | import optuna 2 | import os 3 | 4 | from optuna.study import StudyDirection 5 | 6 | 7 | def get_experiment(): 8 | config_file = "scripts/tune/experiments/operators-aware-biaffine-graph-parser--transformer-encoder.jsonnet" 9 | metrics = "best_validation_logical_form_em" 10 | direction = StudyDirection.MAXIMIZE 11 | return config_file, metrics, direction 12 | 13 | 14 | def get_constants(): 15 | return [ 16 | ("transformer_model", "bert-base-uncased"), 17 | ("max_length", 128), 18 | ("transformer_dim", 768), 19 | 20 | ("decode_strategy", "operators_mask"), 21 | 22 | ("pos_embedding_dim", 100), 23 | ] 24 | 25 | 26 | def set_parameters(trial: optuna.Trial): 27 | # hyper parameters 28 | trial.suggest_float("input_dropout", 0.0, 0.8, step=0.1) 29 | trial.suggest_float("dropout", 0.0, 0.8, step=0.1) 30 | trial.suggest_int("operator_representation_dim", 100, 700, step=100) 31 | trial.suggest_int("tag_representation_dim", 100, 700, step=100) 32 | trial.suggest_int("operator_ff_num_layers", 1, 3) 33 | trial.suggest_int("tag_ff_num_layers", 1, 3) 34 | trial.suggest_int("operator_embeddings_dim", 0, 300, step=100) 35 | 36 | trial.suggest_categorical("lr", [1e-4, 1e-3, 1e-2, 1e-1]) 37 | # trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5]) 38 | trial.suggest_categorical("transformer_lr", [2e-5, 3e-5, 5e-5, 7e-5]) 39 | trial.suggest_categorical("seed", [24, 42, 64]) 40 | -------------------------------------------------------------------------------- 
/qdecomp_with_dependency_graphs/dependencies_graph/config/configuration_loader.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List 2 | import os 3 | from dataclasses import dataclass 4 | from ast import literal_eval 5 | 6 | from qdecomp_with_dependency_graphs.dependencies_graph.evaluation.spans_dependencies_to_logical_form_tokens import SpansDepToQDMRStepTokensConverter 7 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors import BaseTokensDependenciesExtractor, BaseSpansExtractor, \ 8 | BaseTokensDependenciesToQDMRExtractor, BaseCollapser 9 | 10 | 11 | @dataclass 12 | class Configuration: 13 | spans_extractor: BaseSpansExtractor = None 14 | tokens_dependencies_extractor: BaseTokensDependenciesExtractor = None 15 | tokens_dependencies_to_qdmr_extractor: BaseTokensDependenciesToQDMRExtractor = None 16 | spans_dependencies_to_logical_form_converter: SpansDepToQDMRStepTokensConverter = None 17 | 18 | 19 | config: Configuration = Configuration() 20 | _config_str = None 21 | def load(config_file: str): 22 | global config, _config_str 23 | with open(config_file, 'rt') as fp: 24 | _config_str = fp.read() 25 | _locals = {} 26 | exec(_config_str, globals(), _locals) 27 | for attr, value in config.__dict__.items(): 28 | config.__setattr__(attr, _locals[attr]) 29 | 30 | conf = os.environ.get('DEP_CONF', 'default') 31 | load(f'dependencies_graph/config/config_{conf}.py') 32 | 33 | 34 | def save(dir_path: str): 35 | path = os.path.join(dir_path, 'dependencies_graph_config.py') 36 | with open(path, 'wt') as fp: 37 | fp.write(_config_str) 38 | -------------------------------------------------------------------------------- /src/utils/cache_util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | class BufferedJsonWriter(object): 4 | def __init__(self, file_name, buffer_size=50): 5 | self.file_path = file_name 6 | self.buffer = [] 7 | self.buffer_size = buffer_size 8 | 9 | def __enter__(self): 10 | return self 11 | 12 | def __exit__(self, type, value, traceback): 13 | self.write_buffer() 14 | 15 | def write(self, obj=None): 16 | if obj is not None: 17 | self.buffer.append(obj) 18 | if len(self.buffer)>=self.buffer_size: 19 | self.write_buffer() 20 | 21 | def write_buffer(self): 22 | with open(self.file_path, "a") as data_file: 23 | data_file.write(json.dumps(self.buffer)) 24 | data_file.write("\n") 25 | self.buffer = [] 26 | 27 | class BufferedJsonReader(object): 28 | def __init__(self, file_name): 29 | self.file_path = file_name 30 | 31 | def __enter__(self): 32 | return self 33 | 34 | def __exit__(self, type, value, traceback): 35 | pass 36 | 37 | def __itr__(self): 38 | with open(self.file_path, "r") as data_file: 39 | for line in data_file: 40 | yield from json.loads(line) 41 | 42 | def read(self): 43 | return list(self.__itr__()) 44 | 45 | 46 | 47 | def get_cache_path(dataset): 48 | cache_files = dataset.cache_files 49 | if isinstance(cache_files,dict): 50 | cache_files = next(iter(cache_files.values())) 51 | return pathlib.Path(cache_files[0]['filename']).parent -------------------------------------------------------------------------------- /break_evaluator/utils/graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from queue import Queue, deque 3 | 4 | 5 | def has_cycle(graph: nx.DiGraph): 6 | try: 7 | nx.find_cycle(graph, orientation='original') 8 | return True 9 | except: 
10 | return False 11 | 12 | 13 | def get_graph_levels(graph: nx.DiGraph): 14 | """ 15 | Find graph level for each node 16 | level[node] := 0 if the node has no successors 17 | level[node] := max[over successors s](level[s])+1 18 | :param graph: directed graph with no cycles 19 | :return: (nodes_level, levels) tuple where: 20 | nodes_level: dictionary of <node_id>:<level> 21 | levels: dictionary of <level>:[<node_id>,...] 22 | """ 23 | updated_nodes = Queue() 24 | 25 | # first layer 26 | leafs = [n_id for n_id in graph.nodes if not any(graph.successors(n_id))] 27 | nodes_levels = {n_id: 0 for n_id in leafs} 28 | updated_nodes.queue = deque(leafs) 29 | 30 | # update predecessors 31 | while not updated_nodes.empty(): 32 | n_id = updated_nodes.get() 33 | low_bound = nodes_levels[n_id] + 1 34 | if low_bound > graph.number_of_nodes(): 35 | raise ValueError("Cyclic graphs are not allowed") 36 | for s_id in graph.predecessors(n_id): 37 | if nodes_levels.get(s_id, -1) < low_bound: 38 | nodes_levels[s_id] = low_bound 39 | updated_nodes.put(s_id) 40 | levels = {} 41 | for n_id, l in nodes_levels.items(): 42 | levels[l] = levels.get(l, []) + [n_id] 43 | 44 | return nodes_levels, levels 45 | -------------------------------------------------------------------------------- /find_random.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import hydra.utils as hu 3 | 4 | import tqdm 5 | import numpy as np 6 | import json 7 | # from src.utils.app import App 8 | from src.dataset_readers.bm25_tasks import BM25Task 9 | from dataclasses import dataclass 10 | import random 11 | 12 | 13 | 14 | class RandomFinder: 15 | def __init__(self,cfg) -> None: 16 | self.output_path = cfg.output_path 17 | self.task_name = cfg.task_name 18 | # assert cfg.dataset_split in ["train","validation","test"] 19 | self.is_train = cfg.dataset_split=="train" 20 | self.setup_type = "a" 21 | 22 | self.task = BM25Task.from_name(cfg.task_name)(cfg.dataset_split, 23 | self.setup_type) 24 | print("started creating the corpus") 25 | self.corpus = self.task.get_corpus() 26 | print("finished creating the corpus") 27 | 28 | 29 | 30 | 31 | 32 | 33 | def find(cfg): 34 | random_finder = RandomFinder(cfg) 35 | data_list = list(random_finder.task.dataset) 36 | idx_list = list(range(len(random_finder.task.get_corpus()))) 37 | 38 | for element in tqdm.tqdm(data_list): 39 | element['ctxs'] = [{"id":int(a)} for a in random.sample(idx_list,k=200)] 40 | return data_list 41 | 42 | 43 | @hydra.main(config_path="configs",config_name="random_finder") 44 | def main(cfg): 45 | print(cfg) 46 | 47 | data_list = find(cfg) 48 | # print(data_list) 49 | with open(cfg.output_path,"w") as f: 50 | json.dump(data_list,f) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() -------------------------------------------------------------------------------- /semantic_parsing_with_constrained_lm/third_party/break-evaluator/utils/graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from queue import Queue, deque 3 | 4 | 5 | def has_cycle(graph: nx.DiGraph): 6 | try: 7 | nx.find_cycle(graph, orientation='original') 8 | return True 9 | except nx.NetworkXNoCycle: 10 | return False 11 | 12 | 13 | def get_graph_levels(graph: nx.DiGraph): 14 | """ 15 | Find graph level for each node 16 | level[node] := 0 if the node has no successors 17 | level[node] := max[over successors s](level[s])+1 18 | :param graph: directed graph with no cycles 19 | :return: (nodes_level, levels) tuple where: 20 | nodes_level: dictionary of : 
21 | levels: dictionary of :[] 22 | """ 23 | updated_nodes = Queue() 24 | 25 | # first layer 26 | leafs = [n_id for n_id in graph.nodes if not any(graph.successors(n_id))] 27 | nodes_levels = {n_id: 0 for n_id in leafs} 28 | updated_nodes.queue = deque(leafs) 29 | 30 | # update predecessors 31 | while not updated_nodes.empty(): 32 | n_id = updated_nodes.get() 33 | low_bound = nodes_levels[n_id] + 1 34 | if low_bound > graph.number_of_nodes(): 35 | raise ValueError("Cyclic graphs are not allowed") 36 | for s_id in graph.predecessors(n_id): 37 | if nodes_levels.get(s_id, -1) < low_bound: 38 | nodes_levels[s_id] = low_bound 39 | updated_nodes.put(s_id) 40 | levels = {} 41 | for n_id, l in nodes_levels.items(): 42 | levels[l] = levels.get(l, []) + [n_id] 43 | 44 | return nodes_levels, levels 45 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/smcalflow.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_from_disk 2 | import re, os 3 | import json 4 | import random 5 | from src.utils.dataset_utils import load_train_dataset 6 | 7 | 8 | 9 | class SmcalflowScorerTask: 10 | name = "smcalflow" 11 | prompt_field = "ctxs" 12 | question_field = "user_utterance" 13 | def __init__(self,example_file,ds_size=None) -> None: 14 | dataset = load_dataset("KaiLv/UDR_SMCalFlow") 15 | # dataset = load_dataset("iohadrubin/smcalflow") 16 | self.hf_dataset = load_train_dataset(dataset,size=ds_size) 17 | # self.hf_dataset = ['train'] 18 | self.example_file = example_file 19 | with open(self.example_file) as f: 20 | self.data = json.load(f) 21 | idx_list = list(range(len(self.data))) 22 | random.Random(42).shuffle(idx_list) 23 | # self.data = [self.data[x] for x in idx_list[:44000]] 24 | print(f"{len(self.data)} examples") 25 | 26 | self.training_dataset = list(enumerate(self.hf_dataset)) 27 | 28 | def get_fields(self, entry,index=-1): 29 | test_question = entry['test_user_utterance'] 30 | question = entry['user_utterance'] 31 | lispress = entry['lispress'] 32 | test_lispress = entry['test_lispress'] 33 | return question,lispress,test_question,test_lispress 34 | 35 | 36 | @classmethod 37 | def remove_double_space(cls,string): 38 | return re.sub("[ ]{2,}", " ", string) 39 | @classmethod 40 | def reformat(cls,text): 41 | return " ".join([f"{i+1}#) {x.strip()}" for i,x in enumerate(text.split(";"))]) 42 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/php.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return PhpBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return PhpBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return PhpBM25Task.norm(entry['target']) 25 | 26 | 27 | class PhpBM25Task: 28 | name = 'php' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = 
field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_PHP") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/mtop.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | from datasets import load_dataset, load_from_disk 3 | from src.utils.dataset_utils import load_train_dataset 4 | 5 | import json 6 | from src.utils.tokenizer_utils import get_length 7 | 8 | 9 | 10 | 11 | def set_length(example, idx,**kwargs): 12 | tokenizer = kwargs['tokenizer'] 13 | q_field = example['question'] 14 | a_field = example['logical_form'] 15 | prompt_qa = f"{q_field}\t{a_field}" 16 | example['prompt_qa'] = prompt_qa 17 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 18 | return example 19 | 20 | class MtopInferenceTask: 21 | name = "mtop" 22 | def __init__(self, prompt_file, tokenizer,ds_size=None): 23 | self.prompt_file = prompt_file 24 | with open(self.prompt_file) as f: 25 | self.prompts = json.load(f) 26 | dataset = load_dataset("KaiLv/UDR_MTOP") 27 | self.hf_dataset = load_train_dataset(dataset,size=ds_size,listify=False) 28 | self.hf_dataset = self.hf_dataset.map(set_length,with_indices=True,fn_kwargs={'tokenizer':tokenizer}) 29 | self.training_dataset = list(self.hf_dataset) 30 | self.postfix = "" 31 | self.prefix = "" 32 | 33 | @classmethod 34 | def postproccess(cls, string): 35 | return string 36 | 37 | def get_fields(self, entry): 38 | answer = entry['logical_form'] if 'logical_form' in entry else entry['answers'][0] 39 | idx_list =[p['id'] for p in entry['ctxs']] 40 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 41 | return entry['question'],answer,prompts['prompt_qa'],prompts['prompt_len'],idx_list 42 | 43 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/dart.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return DartBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return DartBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return DartBM25Task.norm(entry['target']) 25 | 26 | 27 | class DartBM25Task: 28 | name = 'dart' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = 
load_dataset("KaiLv/UDR_DART") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/java.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return JavaBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return JavaBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return JavaBM25Task.norm(entry['target']) 25 | 26 | 27 | class JavaBM25Task: 28 | name = 'java' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_Java") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/scorer_tasks/wikiauto.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | import re 3 | import json 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | 7 | class WikiautoScorerTask: 8 | name = "wikiauto" 9 | question_field = "source" 10 | prompt_field = "ctxs" 11 | 12 | def __init__(self, example_file, ds_size=None) -> None: 13 | # dataset = load_dataset('GEM/wiki_auto_asset_turk', 'train') 14 | # dataset = dataset.filter(lambda x: len(x['target']) < 1000) 15 | # # add idx column 16 | # for split in ['train', 'validation', 'test_asset', 'test_turk', "test_wiki"]: 17 | # ds_id = Dataset.from_dict({"idx": list(range(len(dataset[split])))}) 18 | # dataset[split] = concatenate_datasets([dataset[split], ds_id], axis=1) 19 | dataset = load_dataset("KaiLv/UDR_WikiAuto") 20 | 21 | self.hf_dataset = load_train_dataset(dataset, size=ds_size) 22 | self.training_dataset = list(enumerate(self.hf_dataset)) 23 | self.example_file = example_file 24 | with open(self.example_file) as f: 25 | self.data = json.load(f) 26 | self.postfix = "Simplified 
text: " 27 | 28 | def get_fields(self, entry, index=-1): 29 | question_prefix = "Simplify the text: " 30 | answer_prefix = "Simplified text: " 31 | test_question = question_prefix + entry['test_source'] 32 | question = question_prefix + entry['source'] 33 | decomposition = answer_prefix + entry['target'] 34 | test_decomposition = entry['test_target'] 35 | return question, decomposition, test_question, test_decomposition 36 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/operators_sequence.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from ast import literal_eval 4 | import pandas as pd 5 | 6 | def create_operators_seq(dataset_file: str): 7 | df = pd.read_csv(dataset_file) 8 | df['operators_seq'] = df['operators'].apply(lambda x: ' '.join(literal_eval(x))) 9 | df[['question_text', 'operators_seq']].to_csv(os.path.splitext(dataset_file)[0]+'_operators_seq2.tsv', 10 | sep='\t', header=False, index=False) 11 | 12 | def eval_operators_seq(dataset_file: str, predictions_file:str): 13 | df = pd.read_csv(dataset_file) 14 | preds = [] 15 | with open(predictions_file, 'rt') as f: 16 | for line in f.readlines(): 17 | content = json.loads(line.strip()) 18 | pred = content['predicted_tokens'] 19 | preds.append(pred) 20 | df['gold_operators_seq'] = df['operators'].apply(lambda x: literal_eval(x)) 21 | df['predictions'] = preds 22 | df['exact_match'] = df['gold_operators_seq']==df['predictions'] 23 | base_name = os.path.splitext(predictions_file)[0]+'__eval' 24 | df.to_csv(base_name+'.csv', index=False) 25 | summary = df.mean().round(3).to_dict() 26 | with open(base_name+'_summary.json', 'wt') as f: 27 | json.dump(summary,f, indent=2, sort_keys=True) 28 | print(summary) 29 | 30 | 31 | if __name__ == '__main__': 32 | # create_operators_seq('datasets/Break/QDMR/dev.csv') 33 | eval_operators_seq('datasets/Break/QDMR/dev.csv', 34 | 'tmp/datasets_Break_QDMR/dependencies_graph/operators-seq--seq2seq/operators-seq--seq2seq/eval/datasets_Break_QDMR_dev_operators_seq__preds.json') -------------------------------------------------------------------------------- /dataflow/core/utterance_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | import re 4 | from typing import List 5 | 6 | import spacy 7 | from spacy.language import Language 8 | 9 | from dataflow.core.constants import SpecialStrings 10 | 11 | 12 | def tokenize_datetime(text: str) -> str: 13 | """Tokenizes datetime to make it consistent with the seaweed tokens.""" 14 | # 5.10 => 5 . 10 15 | # 4:00 => 4 : 00 16 | # 5/7 => 5 / 7 17 | # 5\7 => 5 \ 7 18 | # 3-9 => 3 - 9 19 | text = re.sub(r"(\d)([.:/\\-])(\d)", r"\1 \2 \3", text) 20 | 21 | # 4pm => 4 pm 22 | text = re.sub(r"(\d+)([a-zA-Z])", r"\1 \2", text) 23 | 24 | # safe guard to avoid multiple spaces 25 | text = re.sub(r"\s+", " ", text) 26 | return text 27 | 28 | 29 | class UtteranceTokenizer: 30 | """A Spacy-based tokenizer with some heuristics for user utterances.""" 31 | 32 | def __init__(self, spacy_model_name: str = "en_core_web_md") -> None: 33 | self._spacy_nlp: Language = spacy.load(spacy_model_name) 34 | 35 | def tokenize(self, utterance_str: str) -> List[str]: 36 | """Tokenizes the utterance string and returns a list of tokens. 
37 | """ 38 | if not utterance_str: 39 | return [] 40 | 41 | if utterance_str == SpecialStrings.NULL: 42 | # do not tokenize the NULL special string 43 | return [utterance_str] 44 | 45 | tokens: List[str] = sum( 46 | [ 47 | tokenize_datetime(token.text).split(" ") 48 | for token in self._spacy_nlp(utterance_str) 49 | ], 50 | [], 51 | ) 52 | return tokens 53 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/dependencies_graph/extractors/steps_spans_extractors/from_file_steps_spans_extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict, Tuple, List 3 | 4 | from overrides import overrides 5 | from spacy.tokens.doc import Doc 6 | 7 | from qdecomp_with_dependency_graphs.dependencies_graph.data_types import QDMROperation, StepsSpans 8 | from qdecomp_with_dependency_graphs.dependencies_graph.extractors.steps_spans_extractors.base_steps_spans_extractor import BaseSpansExtractor 9 | 10 | 11 | class FromFileSpansExtractor(BaseSpansExtractor): 12 | def __init__(self, *file_path: str): 13 | super().__init__() 14 | self._cache: Dict[str, Tuple[StepsSpans, dict]] = {} 15 | for file in file_path: 16 | with open(file, 'r') as f: 17 | for line in f.readlines(): 18 | content = json.loads(line.strip()) 19 | steps_spans: StepsSpans = StepsSpans.from_dict(content['steps_spans']) 20 | metadata = content['metadata'] 21 | self._cache[metadata['question_id']] = steps_spans, metadata 22 | 23 | @overrides 24 | def extract(self, question_id: str, question: str, decomposition:str, operators: List[str] = None, 25 | debug: dict = None) -> StepsSpans: 26 | steps_spans, metadata = self._cache[question_id] 27 | if debug is not None: debug.update(**metadata) 28 | return steps_spans 29 | 30 | @overrides 31 | def _extract(self, question_id: str, question_tokens: Doc, steps_tokens: List[Doc], steps_operators: List[QDMROperation] = None, 32 | debug: dict = None) -> StepsSpans: 33 | raise NotImplementedError('use extract()') -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/pubmed.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return PubmedBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return PubmedBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return PubmedBM25Task.norm(entry['target']) 25 | 26 | 27 | class PubmedBM25Task: 28 | name = 'pubmed' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_PubMed") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | 
def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/python.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return PythonBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return PythonBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return PythonBM25Task.norm(entry['target']) 25 | 26 | 27 | class PythonBM25Task: 28 | name = 'python' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_Python") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /dataflow/core/utterance_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | import re 4 | from typing import List 5 | 6 | from dataflow.core.constants import SpecialStrings 7 | from dataflow.core.dialogue import AgentUtterance, UserUtterance 8 | from dataflow.core.utterance_tokenizer import UtteranceTokenizer 9 | 10 | 11 | def clean_utterance_text(text: str) -> str: 12 | """Removes line breaking and extra spaces in the user utterance.""" 13 | # sometimes the user utterance contains line breaking and extra spaces 14 | text = re.sub(r"\s+", " ", text) 15 | # sometimes the user utterance has leading/ending spaces 16 | text = text.strip() 17 | return text 18 | 19 | 20 | def build_user_utterance( 21 | text: str, utterance_tokenizer: UtteranceTokenizer 22 | ) -> UserUtterance: 23 | text = clean_utterance_text(text) 24 | if not text: 25 | return UserUtterance( 26 | original_text=SpecialStrings.NULL, tokens=[SpecialStrings.NULL] 27 | ) 28 | return UserUtterance(original_text=text, tokens=utterance_tokenizer.tokenize(text)) 29 | 30 | 31 | def build_agent_utterance( 32 | text: str, utterance_tokenizer: UtteranceTokenizer, described_entities: List[str] 33 | ) -> AgentUtterance: 34 | text = clean_utterance_text(text) 35 | if not text: 36 | return AgentUtterance( 37 | original_text=SpecialStrings.NULL, 38 | tokens=[SpecialStrings.NULL], 39 | described_entities=described_entities, 40 | ) 41 | return AgentUtterance( 42 | original_text=text, 43 | tokens=utterance_tokenizer.tokenize(text), 44 | described_entities=described_entities, 45 | ) 46 | -------------------------------------------------------------------------------- /dataflow/core/prediction_report.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import csv 3 | from typing import Dict, List, Sequence, Union 4 | 5 | import pandas as pd 6 | 7 | 8 | class PredictionReportDatum(abc.ABC): 9 | @abc.abstractmethod 10 | def flatten(self) -> Dict[str, Union[str, int]]: 11 | raise NotImplementedError() 12 | 13 | 14 | def save_prediction_report_tsv( 15 | prediction_report: Sequence[PredictionReportDatum], prediction_report_tsv: str, 16 | ) -> None: 17 | """Converts prediction results into a pandas dataframe and saves it a tsv report file. 
18 | """ 19 | prediction_report_df = pd.DataFrame( 20 | [datum.flatten() for datum in prediction_report] 21 | ) 22 | prediction_report_df.to_csv( 23 | prediction_report_tsv, 24 | sep="\t", 25 | index=False, 26 | encoding="utf-8", 27 | quoting=csv.QUOTE_ALL, 28 | ) 29 | 30 | 31 | def save_prediction_report_txt( 32 | prediction_report: Sequence[PredictionReportDatum], 33 | prediction_report_txt: str, 34 | field_names: List[str], 35 | ) -> None: 36 | """Prints prediction results into an easy-to-read text report file.""" 37 | with open(prediction_report_txt, "w") as fp: 38 | for datum in prediction_report: 39 | fp.write("=" * 16) 40 | fp.write("\n") 41 | 42 | flatten_fields = datum.flatten() 43 | for field_name in field_names: 44 | field_value = flatten_fields[field_name] 45 | # use "hypo" not "prediction" as the name here just to make it visually aligned with "gold" 46 | if field_name == "prediction": 47 | field_name = "hypo" 48 | print(f"{field_name}\t{field_value}", file=fp) 49 | -------------------------------------------------------------------------------- /dataflow/multiwoz/trade_dst/mapping.pair: -------------------------------------------------------------------------------- 1 | it's it is 2 | don't do not 3 | doesn't does not 4 | didn't did not 5 | you'd you would 6 | you're you are 7 | you'll you will 8 | i'm i am 9 | they're they are 10 | that's that is 11 | what's what is 12 | couldn't could not 13 | i've i have 14 | we've we have 15 | can't cannot 16 | i'd i would 17 | i'd i would 18 | aren't are not 19 | isn't is not 20 | wasn't was not 21 | weren't were not 22 | won't will not 23 | there's there is 24 | there're there are 25 | . . . 26 | restaurants restaurant -s 27 | hotels hotel -s 28 | laptops laptop -s 29 | cheaper cheap -er 30 | dinners dinner -s 31 | lunches lunch -s 32 | breakfasts breakfast -s 33 | expensively expensive -ly 34 | moderately moderate -ly 35 | cheaply cheap -ly 36 | prices price -s 37 | places place -s 38 | venues venue -s 39 | ranges range -s 40 | meals meal -s 41 | locations location -s 42 | areas area -s 43 | policies policy -s 44 | children child -s 45 | kids kid -s 46 | kidfriendly kid friendly 47 | cards card -s 48 | upmarket expensive 49 | inpricey cheap 50 | inches inch -s 51 | uses use -s 52 | dimensions dimension -s 53 | driverange drive range 54 | includes include -s 55 | computers computer -s 56 | machines machine -s 57 | families family -s 58 | ratings rating -s 59 | constraints constraint -s 60 | pricerange price range 61 | batteryrating battery rating 62 | requirements requirement -s 63 | drives drive -s 64 | specifications specification -s 65 | weightrange weight range 66 | harddrive hard drive 67 | batterylife battery life 68 | businesses business -s 69 | hours hour -s 70 | one 1 71 | two 2 72 | three 3 73 | four 4 74 | five 5 75 | six 6 76 | seven 7 77 | eight 8 78 | nine 9 79 | ten 10 80 | eleven 11 81 | twelve 12 82 | anywhere any where 83 | good bye goodbye 84 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/utils/change_config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tempfile import TemporaryDirectory 5 | import tarfile 6 | from allennlp.common.params import Params 7 | from pathlib import Path 8 | 9 | from typing import Dict, Any, Union 10 | 11 | 12 | def change_config(model_path:str, overrides: Union[str,Dict[str, Any]]): 13 | with TemporaryDirectory() as tmpdirname: 14 | 
with tarfile.open(model_path, mode='r:gz') as input_tar: 15 | print('Extracting model...') 16 | input_tar.extractall(tmpdirname) 17 | 18 | os.rename(model_path, os.path.join(os.path.dirname(model_path), 'model_bu.tar.gz')) 19 | 20 | # rewrite config 21 | conf_path = os.path.join(tmpdirname, 'config.json') 22 | p = Params.from_file(conf_path, overrides) 23 | p.to_file(conf_path) 24 | 25 | with tarfile.open(model_path, "w:gz") as output_tar: 26 | print('Archiving model...') 27 | output_tar.add(tmpdirname, arcname ="") 28 | 29 | 30 | if __name__ == "__main__": 31 | def run_change_config(args): 32 | assert args.root_dir and args.overrides 33 | models = Path(args.root_dir).rglob('model.tar.gz') 34 | for x in models: 35 | print(x) 36 | change_config(str(x), args.overrides) 37 | 38 | parse = argparse.ArgumentParser() 39 | parse.set_defaults(func=run_change_config) 40 | parse.add_argument("-r", "--root_dir", type=str, help="Source directory with model.tar.gz to modify") 41 | parse.add_argument("-o", "--overrides", type=str, 42 | help='"settings params to override. dictionary, supports nested fieldsby dots') 43 | 44 | args = parse.parse_args() 45 | args.func(args) -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/e2e.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return E2eBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return E2eBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return E2eBM25Task.norm(entry['target']) 25 | 26 | 27 | class E2eBM25Task: 28 | name = 'e2e' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_E2E") 35 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 36 | if self.dataset_split == "train": 37 | self.dataset = self.train_dataset 38 | else: 39 | self.dataset = list(dataset[self.dataset_split]) 40 | self.corpus = None 41 | self.instruction = "Represent the example for retrieving duplicate examples; Input: " 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/mtop.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | 9 | 10 | field_getter = App() 11 | 12 | def norm(text): 13 | return (" ".join(text.split(";"))).split(" ") 14 | 15 | @field_getter.add("q") 16 | def 
get_question(entry): 17 | return MtopBM25Task.norm(entry['question']) 18 | 19 | @field_getter.add("qa") 20 | def get_qa(entry): 21 | return MtopBM25Task.norm(f"{entry['question']} {entry['logical_form']}") 22 | 23 | @field_getter.add("a") 24 | def get_decomp(entry): 25 | # print(entry) 26 | return MtopBM25Task.norm(entry['logical_form']) 27 | 28 | 29 | class MtopBM25Task: 30 | name = "mtop" 31 | def __init__(self, dataset_split, setup_type, ds_size=None): 32 | self.setup_type = setup_type 33 | self.get_field = field_getter.functions[self.setup_type] 34 | self.dataset_split = dataset_split 35 | dataset = load_dataset("KaiLv/UDR_MTOP") 36 | self.train_dataset = load_train_dataset(dataset,size=ds_size) 37 | if self.dataset_split=="train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | 44 | def get_corpus(self): 45 | if self.corpus is None: 46 | self.corpus = [ self.get_field(entry) for entry in self.train_dataset] 47 | return self.corpus 48 | 49 | @classmethod 50 | def norm(cls,text): 51 | # return (" ".join(text.split(";"))).split(" ") 52 | return word_tokenize(text) 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/go.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return GoBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return GoBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return GoBM25Task.norm(entry['target']) 25 | 26 | 27 | class GoBM25Task: 28 | name = 'go' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_Go") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | self.instruction = "Represent the code example for retrieving duplicate examples; Input: " 43 | 44 | def get_corpus(self): 45 | if self.corpus is None: 46 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 47 | return self.corpus 48 | 49 | @classmethod 50 | def norm(cls, text): 51 | # 输出一个list 52 | return word_tokenize(text) 53 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/roc_story_generation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def 
get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return RocStoryGenerationBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return RocStoryGenerationBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return RocStoryGenerationBM25Task.norm(entry['target']) 25 | 26 | 27 | class RocStoryGenerationBM25Task: 28 | name = 'roc_story_generation' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_RocStory") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/smcalflow.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | from datasets import load_dataset, load_from_disk 3 | from src.utils.dataset_utils import load_train_dataset 4 | 5 | import json 6 | from src.utils.app import App 7 | from nltk.tokenize import word_tokenize 8 | 9 | 10 | 11 | field_getter = App() 12 | 13 | 14 | @field_getter.add("q") 15 | def get_question(entry): 16 | return SmcalflowBM25Task.norm(entry['user_utterance']) 17 | 18 | @field_getter.add("qa") 19 | def get_qa(entry): 20 | return SmcalflowBM25Task.norm(f"{entry['user_utterance']} {entry['lispress']}") 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | # print(entry) 25 | return SmcalflowBM25Task.norm(entry['lispress']) 26 | 27 | 28 | class SmcalflowBM25Task: 29 | name = "smcalflow" 30 | def __init__(self, dataset_split, setup_type,ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | # dataset = load_dataset("iohadrubin/smcalflow") 35 | dataset = load_dataset("KaiLv/UDR_SMCalFlow") 36 | self.train_dataset = load_train_dataset(dataset,size=ds_size) 37 | if self.dataset_split=="train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [ self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls,text): 50 | # return (" ".join(text.split(";"))).split(" ") 51 | return word_tokenize(text) 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/roc_ending_generation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter 
= App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return RocEndingGenerationBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return RocEndingGenerationBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return RocEndingGenerationBM25Task.norm(entry['target']) 25 | 26 | 27 | class RocEndingGenerationBM25Task: 28 | name = 'roc_ending_generation' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_RocEnding") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | self.corpus = None 42 | 43 | def get_corpus(self): 44 | if self.corpus is None: 45 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 46 | return self.corpus 47 | 48 | @classmethod 49 | def norm(cls, text): 50 | # 输出一个list 51 | return word_tokenize(text) 52 | -------------------------------------------------------------------------------- /dataflow/core/turn_prediction.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | from dataflow.core.dialogue import ( 7 | Dialogue, 8 | ProgramExecutionOracle, 9 | TurnId, 10 | UserUtterance, 11 | ) 12 | 13 | 14 | @dataclass(frozen=True, eq=True, repr=True) 15 | class UtteranceWithContext: 16 | """ 17 | A user utterance, with the dialogue history leading up to it. 18 | This is the input to the lispress prediction task. 19 | """ 20 | 21 | datum_id: TurnId 22 | user_utterance: UserUtterance 23 | context: Dialogue 24 | 25 | 26 | @dataclass(frozen=True, eq=True, repr=True) 27 | class TurnPrediction: 28 | """ 29 | A model prediction of the `lispress` for a single Turn. 30 | This is the output of the lispress prediction task. 31 | """ 32 | 33 | datum_id: TurnId 34 | user_utterance: str # redundant. just to make these files easier to read 35 | lispress: str 36 | 37 | 38 | @dataclass(frozen=True, eq=True, repr=True) 39 | class TurnAnswer: 40 | """ 41 | A model prediction of the `lispress` for a single Turn. 42 | This is the output of the lispress prediction task. 43 | """ 44 | 45 | datum_id: TurnId 46 | user_utterance: str # redundant. just to make these files easier to read 47 | lispress: str 48 | program_execution_oracle: Optional[ProgramExecutionOracle] 49 | 50 | 51 | def missing_prediction(datum_id: TurnId) -> TurnPrediction: 52 | """ 53 | A padding `TurnPrediction` that is used when a turn with 54 | `datum_id` is missing from a predictions file. 
55 | """ 56 | return TurnPrediction( 57 | datum_id=datum_id, user_utterance="", lispress="", 58 | ) 59 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/e2e.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 2 | from src.utils.dataset_utils import load_train_dataset 3 | 4 | import json, os 5 | from src.utils.tokenizer_utils import get_length 6 | 7 | 8 | def set_length(example, idx,**kwargs): 9 | tokenizer = kwargs['tokenizer'] 10 | question_prefix = "Table: " 11 | answer_prefix = "Sentence: " 12 | q_field = question_prefix + example['question'] 13 | a_field = answer_prefix + example['target'] 14 | prompt_qa = f"{q_field}\t{a_field}" 15 | example['prompt_qa'] = prompt_qa 16 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 17 | return example 18 | 19 | 20 | class E2eInferenceTask: 21 | name = "e2e" 22 | 23 | def __init__(self, prompt_file, tokenizer, ds_size=None): 24 | self.prompt_file = prompt_file 25 | with open(self.prompt_file) as f: 26 | self.prompts = json.load(f) 27 | dataset = load_dataset("KaiLv/UDR_E2E") 28 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False) 29 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer}) 30 | self.training_dataset = list(self.hf_dataset) 31 | self.postfix = 'Sentence: ' 32 | 33 | @classmethod 34 | def postproccess(cls, string): 35 | return string 36 | 37 | def get_fields(self, entry): 38 | question = entry['question'] 39 | answer = entry['target'] if "target" in entry else entry['answers'][0] 40 | idx_list = [p['id'] for p in entry['ctxs']] 41 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 42 | return "Table: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list 43 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/go.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | import json 7 | from src.utils.tokenizer_utils import get_length 8 | 9 | 10 | def set_length(example, idx,**kwargs): 11 | tokenizer = kwargs['tokenizer'] 12 | question_prefix = "Code: " 13 | answer_prefix = "Comment: " 14 | q_field = question_prefix + example['question'] 15 | a_field = answer_prefix + example['target'] 16 | prompt_qa = f"{q_field}\t{a_field}" 17 | example['prompt_qa'] = prompt_qa 18 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 19 | return example 20 | 21 | 22 | class GoInferenceTask: 23 | name = "go" 24 | 25 | def __init__(self, prompt_file, tokenizer, ds_size=None): 26 | self.prompt_file = prompt_file 27 | with open(self.prompt_file) as f: 28 | self.prompts = json.load(f) 29 | dataset = load_dataset("KaiLv/UDR_Go") 30 | 31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False) 32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer}) 33 | self.training_dataset = list(self.hf_dataset) 34 | self.postfix = 'Comment: ' 35 | 36 | @classmethod 37 | def postproccess(cls, string): 38 | return string 39 | 40 | def get_fields(self, entry): 41 | question = entry['question'] 42 | answer = entry['target'] if "target" in entry else 
entry['answers'][0] 43 | idx_list = [p['id'] for p in entry['ctxs']] 44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 45 | return "Code: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list 46 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/pubmed.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | import json 7 | from src.utils.tokenizer_utils import get_length 8 | 9 | 10 | def set_length(example, idx,**kwargs): 11 | tokenizer = kwargs['tokenizer'] 12 | question_prefix = "" 13 | answer_prefix = "TL;DR: " 14 | q_field = question_prefix + example['question'] 15 | a_field = answer_prefix + example['target'] 16 | prompt_qa = f"{q_field}\t{a_field}" 17 | example['prompt_qa'] = prompt_qa 18 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 19 | return example 20 | 21 | 22 | class PubmedInferenceTask: 23 | name = "pubmed" 24 | 25 | def __init__(self, prompt_file, tokenizer, ds_size=None): 26 | self.prompt_file = prompt_file 27 | with open(self.prompt_file) as f: 28 | self.prompts = json.load(f) 29 | dataset = load_dataset("KaiLv/UDR_PubMed") 30 | 31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False) 32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer}) 33 | self.training_dataset = list(self.hf_dataset) 34 | self.postfix = 'TL;DR: ' 35 | 36 | @classmethod 37 | def postproccess(cls, string): 38 | return string 39 | 40 | def get_fields(self, entry): 41 | question = entry['question'] 42 | answer = entry['target'] if "target" in entry else entry['answers'][0] 43 | idx_list = [p['id'] for p in entry['ctxs']] 44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 45 | return "" + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list 46 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/reddit.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | import json 7 | from src.utils.tokenizer_utils import get_length 8 | 9 | 10 | def set_length(example, idx,**kwargs): 11 | tokenizer = kwargs['tokenizer'] 12 | question_prefix = "" 13 | answer_prefix = "TL;DR: " 14 | q_field = question_prefix + example['question'] 15 | a_field = answer_prefix + example['target'] 16 | prompt_qa = f"{q_field}\t{a_field}" 17 | example['prompt_qa'] = prompt_qa 18 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 19 | return example 20 | 21 | 22 | class RedditInferenceTask: 23 | name = "reddit" 24 | 25 | def __init__(self, prompt_file, tokenizer, ds_size=None): 26 | self.prompt_file = prompt_file 27 | with open(self.prompt_file) as f: 28 | self.prompts = json.load(f) 29 | dataset = load_dataset("KaiLv/UDR_Reddit") 30 | 31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False) 32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer}) 33 | self.training_dataset = list(self.hf_dataset) 34 | self.postfix = 'TL;DR: ' 35 | 36 | @classmethod 37 | def postproccess(cls, string): 38 | 
return string 39 | 40 | def get_fields(self, entry): 41 | question = entry['question'] 42 | answer = entry['target'] if "target" in entry else entry['answers'][0] 43 | idx_list = [p['id'] for p in entry['ctxs']] 44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 45 | return "" + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list 46 | -------------------------------------------------------------------------------- /qdecomp_with_dependency_graphs/scripts/eval/eval_copy_files.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from shutil import copyfile, copytree, errno, ignore_patterns 3 | import argparse 4 | import os 5 | 6 | 7 | def copy(exp_path:str, dest_path:str): 8 | patterns: [str] = ["**/evals", "**/plots", "**/config.json", "**/metrics.json"] 9 | exclude_patterns : [str] = ["*_preds.json", "*_summary.tsv"] 10 | experiments = [str(p.parent) for p in Path(exp_path).glob("**/evals/")] 11 | 12 | for exp in experiments: 13 | for pattern in patterns: 14 | exclude = [p for ex_patt in exclude_patterns for p in Path(exp).glob(ex_patt)] 15 | pathlist = [p for p in Path(exp).glob(pattern) if p not in exclude] 16 | for path in pathlist: 17 | path_in_str = str(path) 18 | d=os.path.join(dest_path, path_in_str) 19 | os.makedirs(os.path.dirname(d), exist_ok=True) 20 | print("{} -> {}".format(path_in_str, d)) 21 | try: 22 | copytree(path_in_str, d, ignore=ignore_patterns(*exclude_patterns)) 23 | except OSError as exc: # python >2.5 24 | if exc.errno == errno.ENOTDIR: 25 | copyfile(path_in_str, d) 26 | else: 27 | raise 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser(description="copy aside evaluations files") 32 | parser.add_argument('--exp_dir', type=str, help='path to experiments directory') 33 | parser.add_argument('--dest_dir', type=str, help='path to destination directory') 34 | args = parser.parse_args() 35 | assert os.path.exists(args.exp_dir) 36 | assert args.exp_dir != args.dest_dir 37 | 38 | copy(args.exp_dir, args.dest_dir) -------------------------------------------------------------------------------- /src/dataset_readers/bm25_tasks/reddit.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk 3 | import json, os 4 | from src.utils.app import App 5 | from nltk.tokenize import word_tokenize 6 | from src.utils.dataset_utils import load_train_dataset 7 | 8 | field_getter = App() 9 | 10 | 11 | @field_getter.add("q") 12 | def get_question(entry): 13 | # 与mtop等不同,kp20k的question是一个list,不需要norm 14 | return RedditBM25Task.norm(entry['question']) 15 | 16 | 17 | @field_getter.add("qa") 18 | def get_qa(entry): 19 | return RedditBM25Task.norm(f"{entry['question']} {entry['target']}") 20 | 21 | 22 | @field_getter.add("a") 23 | def get_decomp(entry): 24 | return RedditBM25Task.norm(entry['target']) 25 | 26 | 27 | class RedditBM25Task: 28 | name = 'reddit' 29 | 30 | def __init__(self, dataset_split, setup_type, ds_size=None): 31 | self.setup_type = setup_type 32 | self.get_field = field_getter.functions[self.setup_type] 33 | self.dataset_split = dataset_split 34 | dataset = load_dataset("KaiLv/UDR_Reddit") 35 | print(dataset) 36 | self.train_dataset = load_train_dataset(dataset, size=ds_size) 37 | if self.dataset_split == "train": 38 | self.dataset = self.train_dataset 39 | else: 40 | self.dataset = list(dataset[self.dataset_split]) 41 | 
self.corpus = None 42 | self.instruction = "Represent the reddit example for retrieving duplicate examples; Input: " 43 | 44 | def get_corpus(self): 45 | if self.corpus is None: 46 | self.corpus = [self.get_field(entry) for entry in self.train_dataset] 47 | return self.corpus 48 | 49 | @classmethod 50 | def norm(cls, text): 51 | # outputs a list of tokens 52 | return word_tokenize(text) 53 | -------------------------------------------------------------------------------- /src/dataset_readers/inference_tasks/php.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets 4 | from src.utils.dataset_utils import load_train_dataset 5 | 6 | import json 7 | from src.utils.tokenizer_utils import get_length 8 | 9 | 10 | def set_length(example, idx,**kwargs): 11 | tokenizer = kwargs['tokenizer'] 12 | question_prefix = "Code: " 13 | answer_prefix = "Comment: " 14 | q_field = question_prefix + example['question'] 15 | a_field = answer_prefix + example['target'] 16 | prompt_qa = f"{q_field}\t{a_field}" 17 | example['prompt_qa'] = prompt_qa 18 | example['prompt_len'] = get_length(tokenizer,prompt_qa) 19 | return example 20 | 21 | 22 | class PhpInferenceTask: 23 | name = "php" 24 | 25 | def __init__(self, prompt_file, tokenizer, ds_size=None): 26 | self.prompt_file = prompt_file 27 | with open(self.prompt_file) as f: 28 | self.prompts = json.load(f) 29 | dataset = load_dataset("KaiLv/UDR_PHP") 30 | 31 | self.hf_dataset = load_train_dataset(dataset, size=ds_size, listify=False) 32 | self.hf_dataset = self.hf_dataset.map(set_length, with_indices=True, fn_kwargs={'tokenizer': tokenizer}) 33 | self.training_dataset = list(self.hf_dataset) 34 | self.postfix = 'Comment: ' 35 | 36 | @classmethod 37 | def postproccess(cls, string): 38 | return string 39 | 40 | def get_fields(self, entry): 41 | question = entry['question'] 42 | answer = entry['target'] if "target" in entry else entry['answers'][0] 43 | idx_list = [p['id'] for p in entry['ctxs']] 44 | prompts = self.hf_dataset.select([p['id'] for p in entry['ctxs']]) 45 | return "Code: " + question, answer, prompts['prompt_qa'], prompts['prompt_len'], idx_list 46 | --------------------------------------------------------------------------------
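# Illustrative usage sketch (not a file from this repository): the *InferenceTask
# classes above all expose the same interface -- get_fields(entry) returns the
# formatted test question, the gold answer, the retrieved demonstrations
# ('prompt_qa'), their token lengths ('prompt_len'), and the context ids, while
# `postfix` gives the cue that precedes the model's answer. A typical consumer packs
# as many demonstrations as fit a token budget and prepends them to the test
# question. The greedy, best-first packing below and the budget value are
# assumptions for illustration, not the repository's documented inference code.
from typing import List


def build_prompt(question: str, prompt_qas: List[str], prompt_lens: List[int],
                 postfix: str, question_len: int, max_tokens: int = 1800) -> str:
    budget = max_tokens - question_len
    selected: List[str] = []
    # `ctxs` entries are assumed to be ranked best-first, so take demonstrations
    # in order until the budget is exhausted.
    for qa, length in zip(prompt_qas, prompt_lens):
        if length > budget:
            break
        selected.append(qa)
        budget -= length
    # Place the most relevant demonstration closest to the test question.
    demonstrations = "\n".join(reversed(selected))
    return f"{demonstrations}\n{question}\t{postfix}"


# Example with made-up values:
# build_prompt("Code: echo strlen($s);",
#              ["Code: echo count($a);\tComment: Print the number of elements."],
#              [14], "Comment: ", question_len=10)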