├── AUTHORS
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── language
├── __init__.py
├── asqa
│   ├── README.md
│   ├── convert_to_roberta_format.py
│   ├── eval.sh
│   ├── human_annotation
│   │   ├── analysis.py
│   │   ├── instructions.txt
│   │   ├── preparation.py
│   │   ├── prepare_interface.gs
│   │   ├── ready_for_drive.tsv
│   │   ├── screenshot.png
│   │   └── setup.tsv
│   ├── install.sh
│   ├── requirements.txt
│   └── scoring.py
├── bert_extraction
│   ├── README.md
│   ├── __init__.py
│   ├── steal_bert_classifier
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── data_generation
│   │   │   ├── __init__.py
│   │   │   ├── build_aux_membership.py
│   │   │   ├── build_membership_dataset.py
│   │   │   ├── merge_dataset_pool_active_learning.py
│   │   │   ├── preprocess_edit_distance_one.py
│   │   │   ├── preprocess_random.py
│   │   │   ├── preprocess_thief_dataset.py
│   │   │   └── preprocess_util.py
│   │   ├── embedding_perturbations
│   │   │   ├── __init__.py
│   │   │   ├── discrete_invert_embeddings.py
│   │   │   ├── embedding_util.py
│   │   │   ├── invert_embeddings.py
│   │   │   ├── merge_shards.py
│   │   │   └── mixup_bert_embeddings.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── run_classifier.py
│   │   │   ├── run_classifier_distillation.py
│   │   │   └── run_classifier_membership.py
│   │   ├── scripts
│   │   │   ├── evaluate_agreement.sh
│   │   │   ├── run_extraction_random.sh
│   │   │   ├── run_extraction_watermark_random.sh
│   │   │   ├── run_extraction_watermark_wiki.sh
│   │   │   ├── run_extraction_wiki.sh
│   │   │   ├── run_membership_classification.sh
│   │   │   ├── run_pool_filter.sh
│   │   │   ├── run_query_synthesis.sh
│   │   │   └── train_victim.sh
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── dataset_analysis.py
│   │   │   ├── merge_datasets_simple.py
│   │   │   ├── model_diff.py
│   │   │   ├── model_diff_dataset.py
│   │   │   ├── pairwise_dataset_analysis.py
│   │   │   ├── preprocess_distill_input.py
│   │   │   ├── preprocess_distill_input_watermark.py
│   │   │   ├── verify_watermark.py
│   │   │   └── wiki103_sentencize.py
│   └── steal_bert_qa
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── data_generation
│   │   ├── __init__.py
│   │   ├── build_aux_membership.py
│   │   ├── build_membership_dataset.py
│   │   ├── preprocess_fraction_squad.py
│   │   ├── preprocess_thief_dataset_boolq.py
│   │   ├── preprocess_thief_dataset_squad.py
│   │   ├── preprocess_thief_dataset_squad_custom.py
│   │   ├── preprocess_thief_dev_squad.py
│   │   └── preprocess_util.py
│   │   ├── models
│   │   ├── __init__.py
│   │   ├── run_bert_boolq.py
│   │   ├── run_bert_boolq_distill.py
│   │   ├── run_squad.py
│   │   └── run_squad_membership.py
│   │   ├── scripts
│   │   ├── run_extraction_boolq.sh
│   │   ├── run_extraction_squad.sh
│   │   ├── run_extraction_watermark_squad.sh
│   │   ├── run_filter_victim_squad.sh
│   │   ├── run_membership_squad.sh
│   │   ├── train_victim_boolq.sh
│   │   └── train_victim_squad.sh
│   │   └── utils
│   │   ├── __init__.py
│   │   ├── combine_qa.py
│   │   ├── combine_qa_watermark.py
│   │   ├── evaluate_squad.py
│   │   ├── evaluate_squad_2.py
│   │   ├── evaluate_squad_watermark.py
│   │   ├── filter_queries_victim_agreement.py
│   │   ├── run_bert_boolq_diff.py
│   │   └── wiki103_para_split.py
├── bertology
│   └── frequency_effects
│   │   ├── README.md
│   │   └── data
│   │   ├── README.md
│   │   ├── nouns.tsv
│   │   ├── sentential_contexts.tsv
│   │   └── verbs.tsv
├── boolq
│   ├── README.md
│   ├── __init__.py
│   ├── run_bert_boolq.py
│   ├── run_recurrent_model_boolq.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── best_checkpoint_exporter.py
│   │   ├── ops.py
│   │   ├── ops_test.py
│   │   ├── py_utils.py
│   │   └── tokenization.py
├── canine
│   ├── README.md
│   ├── bert_modeling.py
│   ├── bert_optimization.py
│   ├── config_utils.py
│   ├── config_utils_test.py
│   ├── local_attention.py
│   ├── modeling.py
│   ├── modeling_test.py
│   ├── special_codepoints.py
│   ├── tensor_contracts.py
│   ├── tensor_contracts_test.py
│   └── tydiqa
│   │   ├── README.md
│   │   ├── char_splitter.py
│   │   ├── data.py
│   │   ├── debug.py
│   │   ├── postproc.py
│   │   ├── prepare_tydi_data.py
│   │   ├── preproc.py
│   │   ├── preproc_test.py
│   │   ├── run_tydi.py
│   │   ├── run_tydi_lib.py
│   │   ├── tf_io.py
│   │   ├── tf_io_test.py
│   │   ├── tydi_modeling.py
│   │   └── tydi_tokenization_interface.py
├── capwap
│   ├── README.md
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── captions_dataset.py
│   │   ├── rc_dataset.py
│   │   ├── text_dataset.py
│   │   ├── vqa_dataset.py
│   │   └── wsp_dataset.py
│   ├── download.sh
│   ├── evaluation
│   │   ├── infer_captions.py
│   │   ├── infer_wsp_captions.py
│   │   └── score_captions.py
│   ├── img
│   │   └── capwap.png
│   ├── models
│   │   ├── reinforce_model.py
│   │   └── supervised_model.py
│   ├── preprocessing
│   │   ├── coco_ood_captions.py
│   │   ├── coco_synthetic_qa.py
│   │   ├── coco_text_planner.py
│   │   ├── gqa_qa.py
│   │   ├── text_synthetic_qa.py
│   │   ├── v7w_qa.py
│   │   ├── vizwiz_qa.py
│   │   ├── vqa_qa.py
│   │   └── weakly_supervised.py
│   ├── synthetic
│   │   ├── filter_round_trip.py
│   │   ├── generate_answers.py
│   │   └── generate_questions.py
│   ├── training
│   │   ├── train_reinforce.py
│   │   └── train_supervised.py
│   └── utils
│   │   ├── checkpoint_utils.py
│   │   ├── experiment_utils.py
│   │   ├── image_utils.py
│   │   ├── io_utils.py
│   │   ├── metric_utils.py
│   │   ├── nltk_utils.py
│   │   ├── reward_utils.py
│   │   ├── tensor_utils.py
│   │   ├── text_utils.py
│   │   ├── transformer_utils.py
│   │   └── tsv_to_hdf5.py
├── casper
│   ├── EXPERIMENTS.md
│   ├── README.md
│   ├── augment
│   │   ├── cached_retrieval_to_dataset.py
│   │   ├── cached_retrieval_to_dataset_lib.py
│   │   ├── casper_converters.py
│   │   ├── casper_converters_test.py
│   │   ├── casper_formatters.py
│   │   ├── casper_formatters_test.py
│   │   └── patch_guiding_tag.py
│   ├── evaluate
│   │   ├── evaluate_mtop_predictions.py
│   │   ├── evaluate_retrieval.py
│   │   └── top_metrics.py
│   ├── retrieve
│   │   ├── cache_query_retrievals.py
│   │   ├── query_retrievers.py
│   │   └── query_retrievers_test.py
│   ├── scripts
│   │   ├── domain_bootstrap_cache_retrievals.sh
│   │   ├── domain_bootstrap_gen_datasets.sh
│   │   ├── parse_guiding_gen_datasets.sh
│   │   ├── schema_refactor_gen_dataset.sh
│   │   ├── standard_cache_retrievals.sh
│   │   └── standard_gen_datasets.sh
│   └── utils
│   │   ├── data_types.py
│   │   ├── mtop_tsv_to_jsonl.py
│   │   ├── sample_utils.py
│   │   ├── sample_utils_test.py
│   │   ├── top_constants.py
│   │   ├── top_utils.py
│   │   └── top_utils_test.py
├── common
│   ├── __init__.py
│   ├── inputs
│   │   ├── __init__.py
│   │   ├── char_utils.py
│   │   ├── char_utils_test.py
│   │   ├── dataset_utils.py
│   │   ├── dataset_utils_test.py
│   │   ├── embedding_utils.py
│   │   └── embedding_utils_test.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── affine_transform.py
│   │   ├── affine_transform_test.py
│   │   ├── common_layers.py
│   │   ├── common_layers_test.py
│   │   ├── cudnn_layers.py
│   │   └── cudnn_layers_test.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── experiment_utils.py
│   │   ├── experiment_utils_test.py
│   │   ├── export_utils.py
│   │   ├── export_utils_test.py
│   │   ├── exporters.py
│   │   ├── file_utils.py
│   │   ├── model_utils.py
│   │   ├── nest_utils.py
│   │   ├── tensor_utils.py
│   │   ├── tensor_utils_test.py
│   │   └── tpu_utils.py
├── compgen
│   ├── csl
│   │   ├── README.md
│   │   ├── augment
│   │   │   ├── generate_synthetic_examples.py
│   │   │   ├── generate_synthetic_examples_beam.py
│   │   │   ├── joint_sampler.py
│   │   │   ├── joint_sampler_test.py
│   │   │   ├── merge_tsvs.py
│   │   │   ├── qcfg_sampler.py
│   │   │   ├── qcfg_sampler_test.py
│   │   │   ├── sampler_utils.py
│   │   │   ├── sampler_utils_test.py
│   │   │   └── test_utils.py
│   │   ├── cky
│   │   │   ├── cfg_converter.py
│   │   │   ├── cfg_parser.py
│   │   │   ├── cfg_parser_test.py
│   │   │   ├── cfg_rule.py
│   │   │   ├── cfg_sampler.py
│   │   │   └── cfg_sampler_test.py
│   │   ├── common
│   │   │   ├── beam_utils.py
│   │   │   ├── json_utils.py
│   │   │   ├── txt_utils.py
│   │   │   └── writer_utils.py
│   │   ├── csl_flowchart.jpg
│   │   ├── demo_geoquery.sh
│   │   ├── demo_smcalflow.sh
│   │   ├── induction
│   │   │   ├── action_utils.py
│   │   │   ├── derivation_utils.py
│   │   │   ├── derivation_utils_test.py
│   │   │   ├── greedy_policy.py
│   │   │   ├── induction_utils.py
│   │   │   ├── objective_utils.py
│   │   │   ├── objective_utils_test.py
│   │   │   ├── rule_utils.py
│   │   │   ├── rule_utils_test.py
│   │   │   ├── search_main.py
│   │   │   ├── search_main_beam.py
│   │   │   ├── unification_utils.py
│   │   │   └── unification_utils_test.py
│   │   ├── model
│   │   │   ├── data
│   │   │   │   ├── example_converter.py
│   │   │   │   ├── example_converter_test.py
│   │   │   │   ├── forest_serialization.py
│   │   │   │   ├── parsing_utils.py
│   │   │   │   ├── write_examples.py
│   │   │   │   └── write_examples_beam.py
│   │   │   ├── data_constants.py
│   │   │   ├── inference
│   │   │   │   ├── eval_model.py
│   │   │   │   ├── eval_model_beam.py
│   │   │   │   ├── eval_utils.py
│   │   │   │   ├── get_predictions.py
│   │   │   │   ├── inference_parser.py
│   │   │   │   ├── inference_parser_test.py
│   │   │   │   ├── inference_utils.py
│   │   │   │   └── inference_wrapper.py
│   │   │   ├── test_utils.py
│   │   │   ├── training
│   │   │   │   ├── forest_utils.py
│   │   │   │   ├── forest_utils_test.py
│   │   │   │   ├── input_utils.py
│   │   │   │   ├── train_model.py
│   │   │   │   ├── training_utils.py
│   │   │   │   └── training_utils_test.py
│   │   │   └── weighted_model.py
│   │   ├── qcfg
│   │   │   ├── compute_recall.py
│   │   │   ├── qcfg_file.py
│   │   │   ├── qcfg_parser.py
│   │   │   ├── qcfg_parser_test.py
│   │   │   ├── qcfg_rule.py
│   │   │   ├── qcfg_target_parser.py
│   │   │   └── qcfg_target_parser_test.py
│   │   ├── targets
│   │   │   ├── target_grammar.py
│   │   │   ├── target_grammar_test.py
│   │   │   └── verify_target_grammar.py
│   │   └── tasks
│   │   │   ├── cogs
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config.json
│   │   │   ├── seed_rules.txt
│   │   │   ├── target_cfg.txt
│   │   │   └── tools
│   │   │   │   ├── categorize_errors.py
│   │   │   │   ├── cogs_converter.py
│   │   │   │   ├── cogs_converter_test.py
│   │   │   │   └── preprocess_cogs_data.py
│   │   │   ├── exact_match_utils.py
│   │   │   ├── exact_match_utils_test.py
│   │   │   ├── generate_exact_match_rules.py
│   │   │   ├── geoquery
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config.json
│   │   │   ├── splits
│   │   │   │   ├── length.json
│   │   │   │   ├── template_1.json
│   │   │   │   ├── template_2.json
│   │   │   │   ├── template_3.json
│   │   │   │   ├── tmcd_1.json
│   │   │   │   ├── tmcd_2.json
│   │   │   │   └── tmcd_3.json
│   │   │   └── target_cfg.txt
│   │   │   ├── scan
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config_t2.json
│   │   │   └── model_config_t4.json
│   │   │   └── smcalflow
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── manual_seed_rules.txt
│   │   │   ├── model_config.json
│   │   │   └── tools
│   │   │   ├── filter_examples.py
│   │   │   ├── format_for_t5.py
│   │   │   ├── generate_identity_rules.py
│   │   │   ├── generate_target_cfg.py
│   │   │   ├── merge_dataset.py
│   │   │   ├── restore_oov.py
│   │   │   ├── retokenize_inputs.py
│   │   │   ├── split_examples.py
│   │   │   └── string_utils.py
│   └── nqg
│   │   ├── README.md
│   │   ├── common
│   │   └── cky
│   │   │   ├── cfg_parser.py
│   │   │   ├── cfg_parser_test.py
│   │   │   ├── cfg_rule.py
│   │   │   ├── cky_utils.py
│   │   │   └── trie_utils.py
│   │   ├── model
│   │   ├── induction
│   │   │   ├── codelength_utils.py
│   │   │   ├── codelength_utils_test.py
│   │   │   ├── derivation_utils.py
│   │   │   ├── derivation_utils_test.py
│   │   │   ├── exact_match_utils.py
│   │   │   ├── exact_match_utils_test.py
│   │   │   ├── induce_rules.py
│   │   │   ├── induction_utils.py
│   │   │   ├── rule_utils.py
│   │   │   ├── rule_utils_test.py
│   │   │   ├── split_utils.py
│   │   │   └── split_utils_test.py
│   │   ├── parser
│   │   │   ├── config_utils.py
│   │   │   ├── configs
│   │   │   │   ├── geoquery_config.json
│   │   │   │   ├── geoquery_xl_config.json
│   │   │   │   ├── scan_config.json
│   │   │   │   └── spider_config.json
│   │   │   ├── data
│   │   │   │   ├── data_constants.py
│   │   │   │   ├── example_converter.py
│   │   │   │   ├── example_converter_test.py
│   │   │   │   ├── forest_serialization.py
│   │   │   │   ├── forest_serialization_test.py
│   │   │   │   ├── parsing_utils.py
│   │   │   │   ├── tokenization_utils.py
│   │   │   │   ├── tokenization_utils_test.py
│   │   │   │   └── write_examples.py
│   │   │   ├── inference
│   │   │   │   ├── eval_model.py
│   │   │   │   ├── generate_predictions.py
│   │   │   │   ├── inference_parser.py
│   │   │   │   ├── inference_wrapper.py
│   │   │   │   ├── inference_wrapper_test.py
│   │   │   │   └── targets
│   │   │   │   │   ├── funql.txt
│   │   │   │   │   ├── generate_spider_grammars.py
│   │   │   │   │   ├── target_grammar.py
│   │   │   │   │   └── target_grammar_test.py
│   │   │   ├── nqg_model.py
│   │   │   ├── nqg_model_test.py
│   │   │   ├── test_utils.py
│   │   │   └── training
│   │   │   │   ├── forest_utils.py
│   │   │   │   ├── forest_utils_test.py
│   │   │   │   ├── input_utils.py
│   │   │   │   ├── train_model.py
│   │   │   │   ├── training_utils.py
│   │   │   │   └── training_utils_test.py
│   │   └── qcfg
│   │   │   ├── compute_recall.py
│   │   │   ├── qcfg_file.py
│   │   │   ├── qcfg_parser.py
│   │   │   ├── qcfg_parser_test.py
│   │   │   └── qcfg_rule.py
│   │   └── tasks
│   │   ├── compare_predictions.py
│   │   ├── compare_splits.py
│   │   ├── gen_length_split.py
│   │   ├── gen_random_split.py
│   │   ├── geoquery
│   │   ├── entity_utils.py
│   │   ├── funql_normalization.py
│   │   ├── funql_normalization_test.py
│   │   ├── gen_template_split.py
│   │   ├── gen_tmcd_split.py
│   │   ├── geobase_utils.py
│   │   ├── measure_compound_divergence.py
│   │   ├── measure_unseen_atoms.py
│   │   ├── splits
│   │   │   ├── length_1.json
│   │   │   ├── standard.json
│   │   │   ├── template_1.json
│   │   │   └── tmcd_1.json
│   │   ├── tmcd_utils.py
│   │   ├── tmcd_utils_test.py
│   │   ├── write_dataset.py
│   │   └── xml_file_utils.py
│   │   ├── mcd_utils.py
│   │   ├── mcd_utils_test.py
│   │   ├── scan
│   │   ├── convert_to_tsv.py
│   │   └── join_txt_to_tsv.py
│   │   ├── spider
│   │   ├── append_schema.py
│   │   ├── database_constants.py
│   │   ├── gen_template_split.py
│   │   ├── gen_tmcd_split.py
│   │   ├── generate_gold.py
│   │   ├── measure_compound_divergence.py
│   │   ├── measure_unseen_atoms.py
│   │   ├── nqg_preprocess.py
│   │   ├── nqg_tokenization.py
│   │   ├── print_database_counts.py
│   │   ├── restore_oov.py
│   │   ├── splits
│   │   │   ├── length_1.json
│   │   │   ├── random_1.json
│   │   │   ├── template_1.json
│   │   │   └── tmcd_1.json
│   │   ├── sql_parser.py
│   │   ├── sql_parser_main.py
│   │   ├── sql_parser_test.py
│   │   ├── sql_tokenizer.py
│   │   ├── tmcd_utils.py
│   │   ├── tmcd_utils_test.py
│   │   └── write_dataset.py
│   │   ├── split_dataset.py
│   │   ├── strip_targets.py
│   │   ├── template_utils.py
│   │   └── tsv_utils.py
├── compir
│   ├── README.md
│   ├── dataset_parsers
│   │   ├── cfq_parser.py
│   │   ├── dataset_parser.py
│   │   ├── scan_parser.py
│   │   └── sql_parser.py
│   ├── evaluate
│   │   ├── evaluate_predictions.py
│   │   └── evaluate_predictions_utils.py
│   ├── transform
│   │   ├── apply_transformation.py
│   │   └── apply_transformation_utils.py
│   └── utils
│   │   ├── dataset_parser_utils.py
│   │   └── io_utils.py
├── conpono
│   ├── README.md
│   ├── binary_order
│   │   └── run_binary_coherence.py
│   ├── cpc
│   │   ├── bilin_model_builder.py
│   │   ├── model_builder.py
│   │   ├── preproc
│   │   │   ├── books_preproc_pipeline.py
│   │   │   ├── ccnews_preproc_pipeline.py
│   │   │   ├── preprocessing_utils.py
│   │   │   ├── raw_books_preproc_pipeline.py
│   │   │   └── wiki_preproc_pipeline.py
│   │   ├── run_bilin_cpc.py
│   │   ├── run_cc_cpc.py
│   │   └── run_cpc.py
│   ├── create_pretrain_data
│   │   ├── books_preproc_pipeline.py
│   │   ├── preprocessing_utils.py
│   │   └── wiki_preproc_pipeline.py
│   ├── evals
│   │   ├── classifier_utils.py
│   │   ├── coherence_eval.py
│   │   ├── discriminative_eval.py
│   │   ├── model_builder.py
│   │   ├── race_utils.py
│   │   ├── run_classifier.py
│   │   ├── run_concat_classifier.py
│   │   ├── run_finetune_coherence.py
│   │   ├── run_hellaswag.py
│   │   ├── run_multichoice.py
│   │   ├── run_race_sp_eval_all.py
│   │   ├── run_record.py
│   │   └── run_squad.py
│   └── reconstruct
│   │   ├── model_builder.py
│   │   ├── preprocess.py
│   │   └── run_paragraph_reconstruct.py
├── decontext
│   ├── README.md
│   ├── decontext_util.py
│   ├── decontextualization_demo.ipynb
│   ├── eval.py
│   ├── eval_requirements.txt
│   └── eval_util.py
├── diffqg
│   ├── .gitignore
│   ├── README.md
│   ├── __init__.py
│   ├── annotation.py
│   ├── install_and_test.sh
│   ├── metrics.py
│   ├── requirements.txt
│   └── run_metrics.py
├── emql
│   ├── README.md
│   ├── __init__.py
│   ├── cm_sketch.py
│   ├── cm_sketch_test.py
│   ├── data_loader.py
│   ├── data_loader_test.py
│   ├── eval.py
│   ├── eval_test.py
│   ├── main.py
│   ├── model.py
│   ├── model_test.py
│   ├── module.py
│   ├── module_test.py
│   ├── preprocess
│   │   ├── metaqa_preprocess.py
│   │   └── query2box_preprocess.py
│   ├── util.py
│   └── util_test.py
├── frost
│   ├── LICENSE
│   ├── README.md
│   ├── __init__.py
│   ├── create_frost_finetuning_data.py
│   ├── spacy_frost_annotator_lib.py
│   └── spacy_frost_annotator_lib_test.py
├── fruit
│   ├── README.md
│   ├── README_PIPELINE.md
│   ├── __init__.py
│   ├── beam_pipelines.py
│   ├── beam_pipelines_test.py
│   ├── convert_task_to_jsonl.py
│   ├── metrics.py
│   ├── metrics_test.py
│   ├── postprocessors.py
│   ├── postprocessors_test.py
│   ├── rendering_utils.py
│   ├── rendering_utils_test.py
│   ├── requirements.txt
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── convert_to_jsonl.py
│   │   ├── evaluate_direct_jsonls.py
│   │   ├── evaluate_t5x_jsonl.py
│   │   ├── get_topics.py
│   │   ├── run_convert_to_jsonl.py
│   │   ├── run_filter_for_generation_pipeline.py
│   │   ├── run_process_snapshot_pipeline.py
│   │   ├── run_redirect_table_pipeline.py
│   │   ├── run_to_tfrecords_pipeline.py
│   │   └── sample_data
│   │   │   ├── inputlabels.jsonl
│   │   │   └── pred.jsonl
│   ├── t5x
│   │   └── configs
│   │   │   ├── t5_3b_eval.gin
│   │   │   ├── t5_3b_finetune.gin
│   │   │   ├── t5_base_eval.gin
│   │   │   ├── t5_base_finetune.gin
│   │   │   ├── t5_large_eval.gin
│   │   │   ├── t5_large_finetune.gin
│   │   │   ├── t5_small_eval.gin
│   │   │   └── t5_small_finetune.gin
│   ├── tasks.py
│   ├── testdata
│   │   ├── test.diff.tfrecords
│   │   ├── test_annotations.jsonl
│   │   ├── test_article_pairs.jsonl
│   │   ├── test_redirects.tsv
│   │   ├── test_source_articles.jsonl
│   │   └── test_target_articles.jsonl
│   ├── tf_utils.py
│   ├── tf_utils_test.py
│   ├── wiki_utils.py
│   └── wiki_utils_test.py
├── gscan
│   ├── data
│   │   ├── README.md
│   │   ├── dataset.py
│   │   ├── grammar.py
│   │   ├── main.py
│   │   ├── vocabulary.py
│   │   └── world.py
│   └── xattn_model
│   │   ├── README.md
│   │   ├── configs
│   │   ├── compositional.py
│   │   └── spatial_relation.py
│   │   ├── dataset
│   │   ├── gscan_dataset.py
│   │   ├── input_pipeline.py
│   │   ├── input_pipeline_test.py
│   │   └── preprocess.py
│   │   ├── evaluation.py
│   │   ├── main.py
│   │   ├── model
│   │   ├── decode.py
│   │   ├── layers.py
│   │   ├── layers_test.py
│   │   ├── model_utils.py
│   │   ├── models.py
│   │   └── models_test.py
│   │   ├── predict.py
│   │   ├── predict_test.py
│   │   ├── test_utils.py
│   │   ├── testdata
│   │   ├── dataset.txt
│   │   ├── train.tfrecord
│   │   ├── training_input_vocab.txt
│   │   └── training_target_vocab.txt
│   │   ├── train.py
│   │   ├── train_test.py
│   │   └── train_utils.py
├── labs
│   ├── README
│   ├── __init__.py
│   ├── consistent_zero_shot_nmt
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── bin
│   │   │   ├── __init__.py
│   │   │   ├── t2t_datagen.py
│   │   │   ├── t2t_decoder.py
│   │   │   └── t2t_trainer.py
│   │   ├── data_generators
│   │   │   ├── __init__.py
│   │   │   ├── translate_europarl.py
│   │   │   ├── translate_iwslt17.py
│   │   │   ├── translate_multilingual.py
│   │   │   └── translate_uncorpus.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── agreement.py
│   │   │   ├── basic.py
│   │   │   ├── losses.py
│   │   │   └── losses_test.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── attention_mechanisms.py
│   │   │   ├── attention_wrappers.py
│   │   │   ├── base.py
│   │   │   ├── decoders.py
│   │   │   ├── encoders.py
│   │   │   ├── helpers.py
│   │   │   └── language_models.py
│   │   ├── scripts
│   │   │   ├── __init__.py
│   │   │   ├── datagen_europarl.sh
│   │   │   ├── datagen_iwslt17.sh
│   │   │   ├── datagen_uncorpus.sh
│   │   │   ├── decode_europarl.sh
│   │   │   ├── decode_iwslt17.sh
│   │   │   ├── decode_uncorpus.sh
│   │   │   ├── download_and_preproc_europarl.sh
│   │   │   ├── download_and_preproc_iwslt2017.sh
│   │   │   ├── identify_overlap_europarl.py
│   │   │   ├── identify_overlap_iwslt17.py
│   │   │   ├── parse-args.sh
│   │   │   └── run_nmt_experiment.sh
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── common_utils.py
│   │   │   ├── model_utils.py
│   │   │   └── t2t_tweaks.py
│   ├── drkit
│   │   ├── README.md
│   │   ├── bert_utils.py
│   │   ├── bert_utils_v2.py
│   │   ├── evaluate.py
│   │   ├── hotpotqa
│   │   │   ├── answer_extractor.py
│   │   │   ├── demo.py
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   │   ├── convert_hotpot_to_mrqa.py
│   │   │   │   ├── convert_wikidata_to_mrqa.py
│   │   │   │   ├── create_tfrecords.py
│   │   │   │   ├── link_questions.py
│   │   │   │   └── parse_wiki.py
│   │   │   ├── scripts
│   │   │   │   ├── index_hotpot_corpus.sh
│   │   │   │   ├── run_demo.sh
│   │   │   │   ├── run_hotpotqa_answer.sh
│   │   │   │   ├── run_hotpotqa_finetuning.sh
│   │   │   │   └── run_hotpotqa_pretraining.sh
│   │   │   └── web
│   │   │   │   ├── static
│   │   │   │   └── drkit.css
│   │   │   │   └── templates
│   │   │   │   └── drkit.html
│   │   ├── input_fns.py
│   │   ├── metaqa
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   │   ├── distantly_supervise.py
│   │   │   │   ├── metaqa_preprocess.py
│   │   │   │   └── process_wiki.py
│   │   │   └── scripts
│   │   │   │   ├── index_metaqa_corpus.sh
│   │   │   │   ├── preprocess_data.sh
│   │   │   │   ├── run_metaqa_finetuning.sh
│   │   │   │   └── run_metaqa_pretraining.sh
│   │   ├── model_fns.py
│   │   ├── preprocessing
│   │   │   ├── preprocess_qa.py
│   │   │   └── preprocess_utils.py
│   │   ├── run_dualencoder_lsf.py
│   │   ├── run_dualencoder_qa.py
│   │   ├── run_multihop_follow.py
│   │   ├── search_utils.py
│   │   └── wikidata
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   ├── add_negatives.py
│   │   │   ├── create_3hop_queries.py
│   │   │   ├── create_follow_queries.py
│   │   │   └── distantly_supervise.py
│   │   │   └── scripts
│   │   │   ├── create_multihop_wikidata.sh
│   │   │   ├── index_wikidata_corpus.sh
│   │   │   ├── run_wikidata_finetuning.sh
│   │   │   └── run_wikidata_pretraining.sh
│   ├── exemplar_decoding
│   │   ├── __init__.py
│   │   ├── docs
│   │   │   ├── giga_hyperparameters.txt
│   │   │   └── nyt_hyperparameters.txt
│   │   ├── experiments
│   │   │   ├── __init__.py
│   │   │   ├── predict.py
│   │   │   └── train.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── adam.py
│   │   │   ├── attention.py
│   │   │   ├── baselines.py
│   │   │   ├── common.py
│   │   │   ├── hyperlstm.py
│   │   │   ├── hypernet.py
│   │   │   ├── linear.py
│   │   │   ├── model_function.py
│   │   │   └── output_wrapper.py
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── data.py
│   │   │   ├── rouge_utils.py
│   │   │   ├── tensor_utils.py
│   │   │   └── tensor_utils_test.py
│   └── memory
│   │   ├── README
│   │   ├── __init__.py
│   │   ├── baseline_models.py
│   │   ├── differentiable_plasticity.py
│   │   ├── explicit_mem.py
│   │   ├── model_utils.py
│   │   ├── model_utils_test.py
│   │   ├── run_models.py
│   │   ├── synthetic_dataset.py
│   │   └── synthetic_dataset_test.py
├── massive_translations
│   └── README.md
├── mentionmemory
│   ├── README.md
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── prepare_complexwq.py
│   │   ├── prepare_fever.py
│   │   ├── prepare_hover.py
│   │   ├── prepare_tacred.py
│   │   ├── prepare_webred.py
│   │   └── testdata
│   │   │   └── tacred
│   │   │   └── test_sample.json
│   ├── encoders
│   │   ├── __init__.py
│   │   ├── base_encoder.py
│   │   ├── bert_encoder.py
│   │   ├── eae_encoder.py
│   │   ├── eae_encoder_test.py
│   │   ├── encoder_registry.py
│   │   ├── encoder_registry_test.py
│   │   ├── import_encoders.py
│   │   ├── mauto_encoder.py
│   │   ├── mauto_encoder_test.py
│   │   ├── mention_memory_encoder.py
│   │   ├── mention_memory_encoder_test.py
│   │   ├── readtwice_encoder.py
│   │   └── readtwice_encoder_test.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── jax_runner.py
│   │   └── memory_generation_main.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── attention_test.py
│   │   ├── batch_memory_attention_layer.py
│   │   ├── batch_memory_attention_layer_test.py
│   │   ├── embedding.py
│   │   ├── embedding_test.py
│   │   ├── entity_attention_layer.py
│   │   ├── entity_attention_layer_test.py
│   │   ├── kmeans.py
│   │   ├── kmeans_test.py
│   │   ├── memory_attention_layer.py
│   │   ├── memory_attention_layer_test.py
│   │   ├── memory_extraction_layer.py
│   │   ├── memory_extraction_layer_test.py
│   │   ├── memory_retrieval_layer.py
│   │   ├── mention_losses.py
│   │   ├── mention_losses_test.py
│   │   ├── mlm_layer.py
│   │   ├── mlm_layer_test.py
│   │   ├── mlp.py
│   │   ├── mlp_test.py
│   │   ├── retrieval_update_layers.py
│   │   ├── retrieval_update_layers_test.py
│   │   ├── sparse_topk_similarity_layer.py
│   │   ├── sparse_topk_similarity_layer_test.py
│   │   ├── topk_similarity_layer.py
│   │   ├── topk_similarity_layer_test.py
│   │   ├── transformer.py
│   │   └── transformer_test.py
│   ├── requirements.txt
│   ├── run.sh
│   ├── run_tests.py
│   ├── tasks
│   │   ├── __init__.py
│   │   ├── base_task.py
│   │   ├── downstream_encoder_task.py
│   │   ├── eae_task.py
│   │   ├── eae_task_test.py
│   │   ├── embedding_based_entity_qa_task.py
│   │   ├── entity_qa_task.py
│   │   ├── example_task.py
│   │   ├── example_task_test.py
│   │   ├── import_tasks.py
│   │   ├── mauto_task.py
│   │   ├── mauto_task_test.py
│   │   ├── memory_generation_task.py
│   │   ├── memory_generation_task_test.py
│   │   ├── mention_based_entity_qa_task.py
│   │   ├── mention_based_entity_qa_task_test.py
│   │   ├── mention_classifier_task.py
│   │   ├── mention_encoder_task.py
│   │   ├── mention_memory_task.py
│   │   ├── mention_memory_task_test.py
│   │   ├── readtwice_task.py
│   │   ├── readtwice_task_test.py
│   │   ├── relation_classifier_task.py
│   │   ├── relation_classifier_task_test.py
│   │   ├── task_registry.py
│   │   ├── task_registry_test.py
│   │   ├── testdata
│   │   │   ├── tacred
│   │   │   │   ├── README.md
│   │   │   │   ├── spanbert_tacred_test.txt
│   │   │   │   └── test.gold
│   │   │   └── ultra_fine_entity_typing
│   │   │   │   ├── dev.json
│   │   │   │   └── types.txt
│   │   ├── text_classifier.py
│   │   ├── text_classifier_test.py
│   │   ├── ultra_fine_entity_typing_task.py
│   │   └── ultra_fine_entity_typing_task_test.py
│   ├── training
│   │   ├── __init__.py
│   │   ├── trainer.py
│   │   └── trainer_test.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── checkpoint_utils.py
│   │   ├── checkpoint_utils_test.py
│   │   ├── custom_types.py
│   │   ├── data_utils.py
│   │   ├── data_utils_test.py
│   │   ├── default_values.py
│   │   ├── initializers.py
│   │   ├── jax_utils.py
│   │   ├── jax_utils_test.py
│   │   ├── mention_preprocess_utils.py
│   │   ├── mention_preprocess_utils_test.py
│   │   ├── mention_utils.py
│   │   ├── mention_utils_test.py
│   │   ├── metric_utils.py
│   │   ├── metric_utils_test.py
│   │   ├── optim_utils.py
│   │   ├── optim_utils_test.py
│   │   ├── test_utils.py
│   │   ├── testdata
│   │   ├── eae_paper-00000-of-00001
│   │   └── mtb.v5-00000-of-00001
│   │   └── tokenization_utils.py
├── miqa
│   ├── README.md
│   └── data
│   │   └── metaphor_inference_qa.tsv
├── multiberts
│   ├── 2m_vs_1m.ipynb
│   ├── README.md
│   ├── coref.ipynb
│   ├── multi_vs_original.ipynb
│   └── multibootstrap.py
├── multivec
│   ├── README.md
│   ├── models
│   │   ├── checkpoint_utils.py
│   │   ├── export_to_tfhub.py
│   │   ├── metrics.py
│   │   └── ranking_model_experiment_inbatch.py
│   ├── predict
│   │   ├── encode_blocks.py
│   │   └── retrieval.py
│   ├── preprocessing
│   │   └── create_training_data.py
│   ├── requirements.txt
│   └── utils
│   │   ├── convert_tsv_to_json.py
│   │   ├── data_processor.py
│   │   └── download.sh
├── nqg
│   └── README.md
├── nql
│   ├── demos
│   │   ├── Introduction_to_NQL.ipynb
│   │   ├── NQL_Gridworld_Pathfollowing.ipynb
│   │   ├── data
│   │   │   └── royal92
│   │   │   │   ├── README.md
│   │   │   │   ├── cfacts.kg.tsv
│   │   │   │   ├── fathers.tsv
│   │   │   │   └── royal_family.tsv
│   │   ├── gridworld_scaling
│   │   │   ├── README.txt
│   │   │   ├── __init__.py
│   │   │   ├── figure1.bash
│   │   │   ├── gendata_figure1.bash
│   │   │   ├── plot_figure1.py
│   │   │   └── scaling_eval.py
│   │   ├── metaqa
│   │   │   ├── README.txt
│   │   │   ├── metaqa.py
│   │   │   └── preprocess_data.py
│   │   └── nell995
│   │   │   ├── README.txt
│   │   │   ├── nell995.py
│   │   │   └── preprocess_data.py
│   ├── nql
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── dataset_test.py
│   │   ├── dist.py
│   │   ├── io.py
│   │   ├── io_test.py
│   │   ├── nql_test.py
│   │   ├── nql_test_lib.py
│   │   ├── symbol.py
│   │   ├── symbol_test.py
│   │   ├── util.py
│   │   └── util_test.py
│   └── setup.py
├── orqa
│   ├── README.md
│   ├── __init__.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── ict_dataset.py
│   │   ├── orqa_dataset.py
│   │   └── text_classification_dataset.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── evaluate_predictions.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── ict_experiment.py
│   │   ├── orqa_experiment.py
│   │   └── text_classifier_experiment.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── ict_model.py
│   │   ├── orqa_model.py
│   │   └── text_classifier_model.py
│   ├── ops
│   │   ├── __init__.py
│   │   ├── orqa_ops.cc
│   │   └── orqa_ops_test.py
│   ├── predict
│   │   ├── __init__.py
│   │   ├── encode_blocks.py
│   │   ├── orqa_demo.py
│   │   ├── orqa_eval.py
│   │   ├── orqa_predict.py
│   │   ├── text_classifier_predict.py
│   │   └── web
│   │   │   └── orqa.html
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── convert_to_nq_open.py
│   │   ├── create_data_splits.py
│   │   ├── preprocess_wiki_extractor.py
│   │   └── wiki_preprocessor.py
│   ├── requirements.txt
│   └── utils
│   │   ├── __init__.py
│   │   ├── bert_utils.py
│   │   ├── eval_utils.py
│   │   ├── scann_utils.py
│   │   └── scann_utils_test.py
├── qa_counterfactuals
│   ├── README.md
│   └── figure1.jpeg
├── qresp
│   └── README.md
├── quest
│   ├── README.md
│   ├── bm25
│   │   ├── bm25_retriever.py
│   │   └── run_bm25_retriever.py
│   ├── common
│   │   ├── document_utils.py
│   │   ├── example_utils.py
│   │   ├── jsonl_utils.py
│   │   ├── tsv_utils.py
│   │   └── vocab_utils.py
│   ├── eval
│   │   ├── README.md
│   │   ├── analyze_retriever.py
│   │   ├── eval_utils.py
│   │   └── run_eval.py
│   ├── t5xr
│   │   ├── README.md
│   │   ├── convert_examples.py
│   │   └── write_doc_idx_maps.py
│   └── xattn
│   │   ├── README.md
│   │   ├── determine_threshold.py
│   │   ├── filter_predictions.py
│   │   ├── gen_inference_inputs.py
│   │   ├── gen_training_examples.py
│   │   └── xattn_utils.py
├── question_answering
│   ├── __init__.py
│   ├── b2t2
│   │   ├── README.md
│   │   ├── compute_vcr_features.py
│   │   ├── requirements.txt
│   │   ├── run_b2t2.py
│   │   └── run_dual_encoder.py
│   ├── bert_joint
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── prepare_nq_data.py
│   │   ├── run_nq.py
│   │   └── run_nq_test.py
│   └── decatt_docreader
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── datasets
│   │   ├── __init__.py
│   │   ├── nq_long_dataset.py
│   │   └── nq_short_pipeline_dataset.py
│   │   ├── experiments
│   │   ├── __init__.py
│   │   ├── nq_export_scorer.py
│   │   ├── nq_long_experiment.py
│   │   └── nq_short_pipeline_experiment.py
│   │   ├── layers
│   │   ├── __init__.py
│   │   ├── decomposable_attention.py
│   │   ├── decomposable_attention_test.py
│   │   ├── document_reader.py
│   │   └── document_reader_test.py
│   │   ├── models
│   │   ├── __init__.py
│   │   ├── nq_long_decatt_model.py
│   │   ├── nq_long_model.py
│   │   └── nq_short_pipeline_model.py
│   │   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── create_nq_long_examples.py
│   │   └── create_nq_short_pipeline_examples.py
│   │   └── utils
│   │   ├── __init__.py
│   │   ├── nq_long_utils.py
│   │   ├── nq_long_utils_test.py
│   │   ├── span_utils.py
│   │   └── span_utils_test.py
├── realm
│   ├── README.md
│   ├── example_generator.py
│   ├── featurization.py
│   ├── generate_retrieval_corpus.py
│   ├── local_launcher.sh
│   ├── model.py
│   ├── parallel.py
│   ├── preprocessing.proto
│   ├── preprocessing.py
│   ├── preprocessing_pb2.py
│   ├── preprocessing_pb2_grpc.py
│   ├── profile.py
│   ├── refresh_doc_embeds.py
│   ├── retrieval.py
│   ├── retrieval_test.py
│   └── train_realm.py
├── relation_learning
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   └── fewrel.py
│   └── models
│   │   ├── __init__.py
│   │   └── bert_fewshot_classifier.py
├── search_agents
│   ├── README.md
│   ├── demo.py
│   ├── environment.proto
│   ├── environment_server.py
│   ├── muzero
│   │   ├── agent_lib.py
│   │   ├── bert_state_lib.py
│   │   ├── bert_state_lib_test.py
│   │   ├── common_flags.py
│   │   ├── env.py
│   │   ├── env_test.py
│   │   ├── grammar_lib.py
│   │   ├── grammar_lib_test.py
│   │   ├── muzero_main.py
│   │   ├── network.py
│   │   ├── run_inference_beam.py
│   │   ├── server.py
│   │   ├── state_tree.py
│   │   ├── state_tree_test.py
│   │   ├── transformer_encoder.py
│   │   ├── types.py
│   │   ├── types_test.py
│   │   ├── utils.py
│   │   └── utils_test.py
│   ├── requirements.txt
│   └── t5
│   │   ├── run_inference_beam.py
│   │   ├── t5_agent_lib.py
│   │   └── t5_agent_lib_test.py
├── serene
│   ├── analysis.py
│   ├── boolq_tfds.py
│   ├── callbacks.py
│   ├── claim_tfds.py
│   ├── config.py
│   ├── constants.py
│   ├── export_embeddings.py
│   ├── fever.proto
│   ├── fever_cli.py
│   ├── fever_tfds.py
│   ├── layers.py
│   ├── losses.py
│   ├── model.py
│   ├── preprocessing.py
│   ├── retrieval.proto
│   ├── scrape_db.py
│   ├── serene.py
│   ├── text_matcher.py
│   ├── tokenizers.py
│   ├── training.py
│   ├── types.py
│   ├── util.py
│   ├── web_api.py
│   ├── wiki_db.py
│   ├── wiki_index.py
│   ├── wiki_tfds.py
│   └── wikipedia_processing.py
├── spatial_prep
│   └── README.md
├── table_text_eval
│   ├── README.md
│   ├── __init__.py
│   ├── preprocess_webnlg.py
│   ├── table_text_eval.py
│   ├── table_text_eval_test.py
│   └── webnlg_correlations.py
├── templama
│   ├── README.md
│   ├── install.sh
│   ├── prepare_data.sh
│   ├── sling2facts.py
│   ├── templama.py
│   └── templates.csv
├── totto
│   ├── README.md
│   ├── __init__.py
│   ├── baseline_preprocessing
│   │   ├── __init__.py
│   │   ├── preprocess_data_main.py
│   │   └── preprocess_utils.py
│   ├── create_table_to_text_html.py
│   ├── eval_pipeline_test.py
│   ├── eval_requirements.txt
│   ├── prepare_predictions_for_eval.py
│   ├── prepare_references_for_eval.py
│   ├── sample
│   │   ├── dev_sample.jsonl
│   │   ├── example-0.html
│   │   ├── example-1.html
│   │   ├── example-2.html
│   │   ├── example-3.html
│   │   ├── example-4.html
│   │   ├── output_sample.txt
│   │   └── train_sample.jsonl
│   ├── table_to_text_html_utils.py
│   ├── table_to_text_utils.py
│   ├── table_to_text_utils_test.py
│   ├── totto_bleurt_eval.py
│   ├── totto_eval.sh
│   └── totto_parent_eval.py
├── wikipedia_anchors
│   └── README.md
├── wino_dict
│   ├── README.md
│   ├── __init__.py
│   ├── create_new_words.py
│   ├── generate.py
│   ├── morph_rules.txt
│   ├── original_words.tsv
│   ├── requirements.txt
│   ├── utils.py
│   └── utils_test.py
└── xsp
│   ├── README.md
│   ├── data_download.sh
│   ├── data_preprocessing
│   ├── abstract_sql.py
│   ├── abstract_sql_converters.py
│   ├── abstract_sql_main.py
│   ├── abstract_sql_test.py
│   ├── compute_asql_coverage_spider.py
│   ├── convert_to_examples.py
│   ├── convert_to_tfrecords.py
│   ├── create_vocabularies.py
│   ├── estimate_asql_coverage_michigan.py
│   ├── language_utils.py
│   ├── michigan_preprocessing.py
│   ├── nl_to_sql_example.py
│   ├── schema_utils.py
│   ├── spider_preprocessing.py
│   ├── sql_parsing.py
│   ├── sql_utils.py
│   ├── sqlparse_keyword_utils.py
│   └── wikisql_preprocessing.py
│   ├── data_utils
│   ├── academic-prefix.txt
│   ├── add_indices.py
│   ├── advising-prefix.txt
│   ├── atis-prefix.txt
│   ├── create_cache.py
│   ├── empty_database.py
│   ├── extra_academic_indices.txt
│   ├── extra_imdb_indices.txt
│   ├── extra_scholar_indices.txt
│   ├── geoquery-prefix.txt
│   ├── imdb-prefix.txt
│   ├── scholar-prefix.txt
│   └── yelp-prefix.txt
│   ├── evaluation
│   ├── convert_preds_for_spider_eval.py
│   ├── filter_results.py
│   ├── official_evaluation.py
│   ├── official_evaluation_test.py
│   ├── restore_from_asql.py
│   └── restore_from_asql_main.py
│   ├── model
│   ├── adam_weight_decay.py
│   ├── beam_search.py
│   ├── bert_utils.py
│   ├── common_layers.py
│   ├── constants.py
│   ├── decode_utils.py
│   ├── embeddings.py
│   ├── input_pipeline.py
│   ├── input_utils.py
│   ├── load_from_checkpoint.py
│   ├── local_model_config.json
│   ├── loss.py
│   ├── metrics.py
│   ├── model_builder.py
│   ├── model_config.json
│   ├── model_config.py
│   ├── run_inference.py
│   ├── sequence_example_decoder.py
│   ├── tpu_utils.py
│   └── transformer.py
│   ├── requirements.txt
│   └── training
│   └── train_model.py
└── setup.py

/AUTHORS:
--------------------------------------------------------------------------------
# This is the list of the Google AI Language Team authors for copyright purposes.
#
# This does not necessarily list everyone who has contributed code, since in
# some cases, their employer may be the copyright holder. To see the full list
# of contributors, see the revision history in source control.

Google Inc.
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to Contribute

We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows [Google's Open Source Community
Guidelines](https://opensource.google.com/conduct/).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Google Research: Language

Shared repository for open-sourced projects from the
[Google Research Language](https://research.google/teams/language/) team.

This is not an official Google product.
--------------------------------------------------------------------------------
/language/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/asqa/eval.sh:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash

RESULTS_PATH=$1 # path to result json file
EXP_NAME=$2 # name of your experiment
OUTPUT_DIR=./results/${EXP_NAME}

mkdir -p ${OUTPUT_DIR}
python convert_to_roberta_format.py \
  --asqa ./dataset/ASQA.json \
  --predictions $RESULTS_PATH \
  --split dev \
  --output_path ${OUTPUT_DIR}

python transformers/examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path ./roberta/roberta-squad \
  --validation_file ${OUTPUT_DIR}/qa.json \
  --do_eval \
  --version_2_with_negative \
  --max_seq_length 384 \
  --output_dir ${OUTPUT_DIR} \
  --null_score_diff_threshold 0

python scoring.py \
  --asqa ./dataset/ASQA.json \
  --predictions $RESULTS_PATH \
  --roberta_output ${OUTPUT_DIR}/eval_predictions.json \
  --split dev \
  --out_dir $OUTPUT_DIR
--------------------------------------------------------------------------------
/language/asqa/human_annotation/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/asqa/human_annotation/screenshot.png
--------------------------------------------------------------------------------
/language/asqa/install.sh:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash

# Install requirements.txt
pip install -r requirements.txt

# Install huggingface transformers from github so that we have access to example
# scripts.
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .

# Download Roberta checkpoint.
cd ../
mkdir roberta
gsutil cp -R gs://gresearch/ASQA/ckpts/roberta-squad roberta/
--------------------------------------------------------------------------------
/language/asqa/requirements.txt:
--------------------------------------------------------------------------------
rouge-score
nltk
datasets
--------------------------------------------------------------------------------
/language/bert_extraction/README.md:
--------------------------------------------------------------------------------
# Model Extraction of BERT-based APIs

This folder contains the original codebase used to conduct the experiments in the ICLR 2020 paper *[Thieves on Sesame Street! Model Extraction of BERT-based APIs](https://arxiv.org/abs/1910.12366)*. The OpenReview discussion for this paper can be found [here](https://openreview.net/forum?id=Byl5NREFDr).

## Setup

Please follow the setup in [google-research/language](https://github.com/google-research/language).
This codebase requires [google-research/bert](https://github.com/google-research/bert) for all its experiments.

## Experiments on SST2, MNLI

Please find more details in [`steal_bert_classifier/README.md`](steal_bert_classifier/README.md). The codebase can be trivially modified for any classification task using BERT expecting a single sentence input or a pair of sentences as input.

## Experiments on SQuAD 1.1, SQuAD 2.0, BoolQ

Please find more details in [`steal_bert_qa/README.md`](steal_bert_qa/README.md).

## Citation

If you find this paper or codebase useful, please cite us.

```
@inproceedings{krishna2020thieves,
  title={Thieves on Sesame Street! Model Extraction of BERT-based APIs},
  author={Krishna, Kalpesh and Tomar, Gaurav Singh and Parikh, Ankur P and Papernot, Nicolas and Iyyer, Mohit},
  booktitle={International Conference on Learning Representations},
  year={2020}
}
```
--------------------------------------------------------------------------------
/language/bert_extraction/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/data_generation/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/embedding_perturbations/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/models/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/data_generation/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/models/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bertology/frequency_effects/README.md:
--------------------------------------------------------------------------------
This repository contains relevant files for [Frequency Effects on Syntactic Rule-Learning in Transformers (EMNLP '21)](https://arxiv.org/abs/2109.07020) by Jason Wei, Dan Garrette, Tal Linzen, and Ellie Pavlick.

The data can be found in the [`data`](https://github.com/google-research/language/tree/master/language/bertology/frequency_effects/data) subdirectory.
--------------------------------------------------------------------------------
/language/boolq/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/boolq/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/capwap/img/capwap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/capwap/img/capwap.png
--------------------------------------------------------------------------------
/language/capwap/utils/nltk_utils.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLTK utils for cluster-friendly usage."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle
import nltk

word_tokenize = nltk.word_tokenize
pos_tag = nltk.pos_tag


def get_stopwords():
  # pylint: disable=g-import-not-at-top
  from nltk.corpus import stopwords
  # pylint: enable=g-import-not-at-top
  return set(stopwords.words("english"))
--------------------------------------------------------------------------------
/language/casper/utils/data_types.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Common data types."""
import collections
from typing import Any, Dict

# Deserialized JSON
RawExample = Dict[str, Any]

AugmentedExample = collections.namedtuple("AugmentedExample",
                                          ["inputs", "targets"])
--------------------------------------------------------------------------------
/language/casper/utils/sample_utils_test.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for sample_utils."""

from absl.testing import absltest
from language.casper.utils import sample_utils


class SampleUtilsTest(absltest.TestCase):

  def test_geometric_sample(self):
    pool = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    sampled = sample_utils.geometric_sample(pool, 5, 0.5)
    self.assertLen(sampled, 5)
    sampled = sample_utils.geometric_sample(pool, 99, 0.5)
    self.assertLen(sampled, 10)
    # Test extreme values.
30 | sampled = sample_utils.geometric_sample(pool, 7, 1.0) 31 | self.assertEqual(sampled, [0, 1, 2, 3, 4, 5, 6]) 32 | sampled = sample_utils.geometric_sample(pool, 7, 0.0) 33 | self.assertEqual(sampled, [9, 8, 7, 6, 5, 4, 3]) 34 | 35 | 36 | if __name__ == '__main__': 37 | absltest.main() 38 | -------------------------------------------------------------------------------- /language/common/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/inputs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/layers/affine_transform_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for language.common.layers.affine_transform.""" 16 | 17 | 18 | from language.common.layers import affine_transform 19 | import tensorflow as tf 20 | 21 | 22 | class AffineTransformTest(tf.test.TestCase): 23 | 24 | def test_layer_api_compatibility(self): 25 | input_array = tf.constant( 26 | [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [2.0, 3.0, 5.0]] 27 | ) 28 | 29 | cls = affine_transform.AffineTransform 30 | with tf.keras.utils.CustomObjectScope( 31 | {cls.__name__: cls} 32 | ): 33 | _ = tf._keras_internal.testing_infra.test_utils.layer_test( 34 | cls, 35 | kwargs={ 36 | 'output_size': 1, 37 | 'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02), 38 | }, 39 | input_shape=(None), 40 | input_data=input_array, 41 | ) 42 | 43 | 44 | if __name__ == '__main__': 45 | tf.test.main() 46 | -------------------------------------------------------------------------------- /language/common/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Utilities for file I/O.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os.path 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | 26 | def make_empty_dir(path): 27 | """Makes an empty directory at `path`, deleting `path` first if needed.""" 28 | if tf.gfile.Exists(path): 29 | tf.gfile.DeleteRecursively(path) 30 | tf.gfile.MakeDirs(path) 31 | 32 | 33 | def copy_files_to_dir(source_filepattern, dest_dir): 34 | """Copies files matching `source_filepattern` into `dest_dir`.""" 35 | for source_path in tf.gfile.Glob(source_filepattern): 36 | dest_path = os.path.join(dest_dir, os.path.basename(source_path)) 37 | tf.gfile.Copy(source_path, dest_path) 38 | 39 | 40 | def set_file_contents(data, path): 41 | """Overwrites `path` with `data.""" 42 | with tf.gfile.Open(path, "w") as output_file: 43 | output_file.write(data) 44 | -------------------------------------------------------------------------------- /language/compgen/csl/augment/merge_tsvs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utility to merge tsv files.""" 16 | 17 | import random 18 | 19 | from absl import app 20 | from absl import flags 21 | 22 | from language.compgen.nqg.tasks import tsv_utils 23 | 24 | 25 | FLAGS = flags.FLAGS 26 | 27 | flags.DEFINE_string("input_1", "", "Input tsv file.") 28 | 29 | flags.DEFINE_string("input_2", "", "Input tsv file.") 30 | 31 | flags.DEFINE_string("output", "", "Output tsv file.") 32 | 33 | flags.DEFINE_integer("duplicate_input_1", 1, 34 | "Number of times to duplicate inputs in input_1.") 35 | 36 | 37 | def main(unused_argv): 38 | input_1 = tsv_utils.read_tsv(FLAGS.input_1) 39 | input_2 = tsv_utils.read_tsv(FLAGS.input_2) 40 | outputs = input_1 * FLAGS.duplicate_input_1 + input_2 41 | random.shuffle(outputs) 42 | tsv_utils.write_tsv(outputs, FLAGS.output) 43 | 44 | 45 | if __name__ == "__main__": 46 | app.run(main) 47 | -------------------------------------------------------------------------------- /language/compgen/csl/augment/test_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Utilities for testing data augmentation.""" 16 | 17 | 18 | def get_test_config(): 19 | return { 20 | "min_recursions": 1, 21 | "max_recursions": 5, 22 | "temperature": 1, 23 | "nonterminal_bias": 0, 24 | "max_single_nt_applications": 1 25 | } 26 | -------------------------------------------------------------------------------- /language/compgen/csl/common/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for Beam pipeline.""" 16 | 17 | import apache_beam as beam 18 | 19 | 20 | def dict_to_beam_counts(metrics_dict, namespace): 21 | for metric_name, metric_value in metrics_dict.items(): 22 | beam.metrics.Metrics.counter(namespace, metric_name).inc(metric_value) 23 | -------------------------------------------------------------------------------- /language/compgen/csl/common/json_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Function for loading config json file.""" 16 | 17 | import json 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def json_file_to_dict(json_file): 23 | """Constructs a dictionary from a json file.""" 24 | with gfile.GFile(json_file, "r") as reader: 25 | text = reader.read() 26 | return json.loads(text) 27 | 28 | 29 | def dict_to_json_file(json_dict, json_file): 30 | """Saves a dictionary to a json file.""" 31 | with gfile.GFile(json_file, "w") as writer: 32 | json.dump(json_dict, writer, indent=2) 33 | print("Saved dict to %s." % json_file) 34 | -------------------------------------------------------------------------------- /language/compgen/csl/common/txt_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilties for reading and writing TXT dataset files.""" 16 | 17 | from tensorflow.io import gfile 18 | 19 | 20 | def read_txt(filename): 21 | """Read file to list of lines.""" 22 | examples = [] 23 | with gfile.GFile(filename, "r") as tsv_file: 24 | for line in tsv_file: 25 | line = line.rstrip() 26 | examples.append(line) 27 | print("Loaded %s lines from %s." % (len(examples), filename)) 28 | return examples 29 | 30 | 31 | def write_txt(examples, filename): 32 | """Write examples to tsv file.""" 33 | with gfile.GFile(filename, "w") as tsv_file: 34 | for example in examples: 35 | line = "%s\n" % example 36 | tsv_file.write(line) 37 | print("Wrote %s lines to %s." % (len(examples), filename)) 38 | -------------------------------------------------------------------------------- /language/compgen/csl/common/writer_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for TF writer.""" 16 | 17 | import tensorflow as tf 18 | 19 | 20 | def get_summary_writer(write_dir): 21 | return tf.summary.create_file_writer(write_dir) 22 | 23 | 24 | def write_metrics(writer, metrics_dict, step): 25 | for metric_name, metric_value in metrics_dict.items(): 26 | with writer.as_default(): 27 | tf.summary.scalar(metric_name, metric_value, step=step) 28 | -------------------------------------------------------------------------------- /language/compgen/csl/csl_flowchart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/compgen/csl/csl_flowchart.jpg -------------------------------------------------------------------------------- /language/compgen/csl/induction/action_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Defines actions that can mutate the set of rules.""" 16 | 17 | import collections 18 | 19 | Action = collections.namedtuple( 20 | "Action", 21 | [ 22 | "rules_to_add", # Set of QCFGRule. 23 | "rules_to_remove", # Set of QCFGRule. 24 | ]) 25 | -------------------------------------------------------------------------------- /language/compgen/csl/model/data_constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants used in generating tf.Examples that are used across modules.""" 16 | 17 | # Forest node types. 18 | RULE_APPLICATION = 1 19 | AGGREGATION = 2 20 | -------------------------------------------------------------------------------- /language/compgen/csl/model/inference/inference_parser_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for inference_parser.""" 16 | 17 | from language.compgen.csl.model import test_utils 18 | from language.compgen.csl.model.inference import inference_parser 19 | from language.compgen.csl.model.inference import inference_wrapper 20 | from language.compgen.csl.qcfg import qcfg_rule 21 | import tensorflow as tf 22 | 23 | 24 | class InferenceParserTest(tf.test.TestCase): 25 | 26 | def test_get_outputs(self): 27 | rules = [ 28 | qcfg_rule.rule_from_string("foo NT_1 ### foo NT_1"), 29 | qcfg_rule.rule_from_string("bar ### bar"), 30 | qcfg_rule.rule_from_string("foo bar ### foo bar"), 31 | ] 32 | config = test_utils.get_test_config() 33 | 34 | wrapper = inference_wrapper.InferenceWrapper(rules, config) 35 | wrapper.compute_application_scores() 36 | 37 | source = "foo bar" 38 | outputs = inference_parser.run_inference(source, wrapper) 39 | print("outputs: %s" % outputs) 40 | 41 | self.assertIsNotNone(outputs) 42 | 43 | if __name__ == "__main__": 44 | tf.test.main() 45 | -------------------------------------------------------------------------------- /language/compgen/csl/model/test_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilties for testing.""" 16 | 17 | 18 | def get_test_config(): 19 | return { 20 | "batch_size": 4, 21 | "learning_rate": 0.001, 22 | "training_steps": 10000, 23 | "steps_per_iteration": 8, 24 | "num_types": 2, 25 | "num_lhs_emb": 128, 26 | "num_rhs_emb": 128, 27 | "max_num_numerator_nodes": 8, 28 | "max_num_nts": 2, 29 | "max_single_nt_applications": 1 30 | } 31 | -------------------------------------------------------------------------------- /language/compgen/csl/qcfg/qcfg_file.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Read and write QCFG grammars to/from human readable txt files.""" 16 | 17 | from language.compgen.csl.qcfg import qcfg_rule 18 | from tensorflow.io import gfile 19 | 20 | 21 | def read_rules(filename): 22 | """Read rule txt file to list of rules.""" 23 | rules = [] 24 | with gfile.GFile(filename, "r") as txt_file: 25 | for line in txt_file: 26 | line = line.rstrip() 27 | rule = qcfg_rule.rule_from_string(line) 28 | rules.append(rule) 29 | print("Loaded %s rules from %s." % (len(rules), filename)) 30 | return rules 31 | 32 | 33 | def write_rules(rules, filename): 34 | """Write rules to txt file.""" 35 | with gfile.GFile(filename, "w") as txt_file: 36 | for rule in rules: 37 | line = "%s\n" % str(rule) 38 | txt_file.write(line) 39 | print("Wrote %s rules to %s." % (len(rules), filename)) 40 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 20, 4 | "temperature": 1, 5 | "nonterminal_bias": 6, 6 | "max_single_nt_applications": 2, 7 | "min_nonterminal_rule_arity": 2 8 | } 9 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": true, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "source_given_target_coef": 1, 8 | "target_given_source_coef": 5, 9 | "max_num_steps": 8, 10 | "save_every_step": 0 11 | } 12 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 3000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 800, 8 | "num_rhs_emb": 800, 9 | "max_num_numerator_nodes": 500, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/tools/preprocess_cogs_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Preprocess the COGS dataset.""" 16 | from absl import app 17 | from absl import flags 18 | 19 | from language.compgen.csl.tasks.cogs.tools import cogs_converter 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | FLAGS = flags.FLAGS 23 | 24 | flags.DEFINE_string("input", "", "TSV file.") 25 | 26 | flags.DEFINE_string("output", "", "TSV file.") 27 | 28 | 29 | def main(_): 30 | examples = tsv_utils.read_tsv(FLAGS.input, expected_num_columns=3) 31 | new_examples = [] 32 | for source, target, category in examples: 33 | if category == "primitive": 34 | if len(source.split()) != 1: 35 | raise ValueError(f"Invalid primitive: {source}") 36 | new_target = source 37 | else: 38 | new_target = cogs_converter.cogs_lf_to_funcall(target) 39 | new_examples.append((source, new_target)) 40 | tsv_utils.write_tsv(new_examples, FLAGS.output) 41 | 42 | 43 | if __name__ == "__main__": 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/exact_match_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for exact_match_utils.""" 16 | 17 | from language.compgen.csl.tasks import exact_match_utils 18 | import tensorflow as tf 19 | 20 | 21 | class InitializationUtilsTest(tf.test.TestCase): 22 | 23 | def test_exact_match_1(self): 24 | dataset = [("salary between 8000 and 12000", 25 | "salaries between 8000 and 12000 .")] 26 | 27 | exact_match_rules = exact_match_utils.get_exact_match_rules(dataset) 28 | exact_match_rule_strings = {str(rule) for rule in exact_match_rules} 29 | self.assertEqual(exact_match_rule_strings, 30 | {"between 8000 and 12000 ### between 8000 and 12000"}) 31 | 32 | 33 | if __name__ == "__main__": 34 | tf.test.main() 35 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/generate_exact_match_rules.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Generates exact match seed rules.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | from language.compgen.csl.qcfg import qcfg_file 20 | from language.compgen.csl.tasks import exact_match_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | 23 | FLAGS = flags.FLAGS 24 | 25 | flags.DEFINE_string("input", "", "Input tsv file.") 26 | 27 | flags.DEFINE_string("output", "", "Output txt file.") 28 | 29 | 30 | def main(unused_argv): 31 | examples = tsv_utils.read_tsv(FLAGS.input) 32 | rules = exact_match_utils.get_exact_match_rules(examples) 33 | # Sort by target. 34 | rules = list(rules) 35 | rules.sort(key=lambda x: x.target) 36 | qcfg_file.write_rules(rules, FLAGS.output) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 20, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | 9 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": false, 3 | "allow_single_nt_target": false, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "sample_size": 0, 8 | "source_given_target_coef": 4, 9 | "target_given_source_coef": 16, 10 | "max_num_steps": 20, 11 | "save_every_step": 0 12 | } 13 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.05, 4 | "training_steps": 2000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 300, 8 | "num_rhs_emb": 500, 9 | "max_num_numerator_nodes": 500, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 5, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": true, 4 | "max_num_nts": 2, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 4, 7 | "source_given_target_coef": 0.0, 8 | "target_given_source_coef": 100.0, 9 | "max_num_steps": 5, 10 | "num_partitions": 16, 11 | "allow_duplicate_examples": false, 12 | "save_every_step": 0 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/model_config_t2.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 1000, 5 | "steps_per_iteration": 64, 6 | "save_checkpoint_every": 64, 7 | 
"num_lhs_emb": 25, 8 | "num_rhs_emb": 15, 9 | "max_num_numerator_nodes": 20, 10 | "max_num_nts": 2, 11 | "max_single_nt_applications": 0, 12 | "num_types": 2 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/model_config_t4.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 1000, 5 | "steps_per_iteration": 64, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 25, 8 | "num_rhs_emb": 15, 9 | "max_num_numerator_nodes": 20, 10 | "max_num_nts": 2, 11 | "max_single_nt_applications": 0, 12 | "num_types": 4 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 10, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": false, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "sample_size": 0, 8 | "source_given_target_coef": 4, 9 | "target_given_source_coef": 16, 10 | "max_num_steps": 20, 11 | "save_every_step": 0 12 | } 13 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/manual_seed_rules.txt: -------------------------------------------------------------------------------- 1 | NT_1 ### AttendeeListHasPeople :people ( NT_1 ) 2 | NT_1 ### AttendeeListHasRecipient :recipient ( NT_1 ) 3 | NT_1 ### Execute :intension ( refer ( extensionConstraint ( RecipientWithNameLike :constraint ( Constraint[Recipient] ) :name # ( NT_1 ) ) ) ) 4 | NT_1 ### AttendeeListHasRecipient :recipient ( Execute :intension ( refer ( extensionConstraint ( RecipientWithNameLike :constraint ( Constraint[Recipient] ) :name # ( NT_1 ) ) ) ) ) 5 | NT_1 ### Constraint[DateTime] :date ( ?= ( NextDOW :dow # ( NT_1 ) ) ) 6 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 6000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 32000, 8 | "num_rhs_emb": 42000, 9 | "max_num_numerator_nodes": 400, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64, 13 | "max_num_batch_embs": 500, 14 | "approximate_denominator": true 15 | } 16 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/format_for_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Format targets so they are encoded better by T5's SPM.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | FLAGS = flags.FLAGS 23 | 24 | flags.DEFINE_string("input", "", "TSV file.") 25 | 26 | flags.DEFINE_string("output", "", "TSV file.") 27 | 28 | 29 | def format_target(target): 30 | """Reformat targets.""" 31 | # """Switches OOV T5 tokens to in-vocabulary tokens.""" 32 | target = target.replace("<", "lb") 33 | target = target.replace(">", "rb") 34 | target = target.replace("~", "sim") 35 | 36 | return target 37 | 38 | 39 | def main(unused_argv): 40 | examples = tsv_utils.read_tsv(FLAGS.input) 41 | new_examples = [] 42 | for source, target in examples: 43 | new_target = format_target(target) 44 | new_examples.append((source, new_target)) 45 | tsv_utils.write_tsv(new_examples, FLAGS.output) 46 | 47 | 48 | if __name__ == "__main__": 49 | app.run(main) 50 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/merge_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Merge source and target txt files to tsv.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | from tensorflow.io import gfile 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("source", "", "Input txt file.") 27 | 28 | flags.DEFINE_string("target", "", "Input txt file.") 29 | 30 | flags.DEFINE_string("output", "", "Output tsv file.") 31 | 32 | 33 | def read_txt(filename): 34 | """Read file to list of lines.""" 35 | lines = [] 36 | with gfile.GFile(filename, "r") as txt_file: 37 | for line in txt_file: 38 | line = line.decode().rstrip() 39 | lines.append(line) 40 | print("Loaded %s lines from %s." 
% (len(lines), filename)) 41 | return lines 42 | 43 | 44 | def main(unused_argv): 45 | source = read_txt(FLAGS.source) 46 | target = read_txt(FLAGS.target) 47 | examples = list(zip(source, target)) 48 | tsv_utils.write_tsv(examples, FLAGS.output) 49 | 50 | 51 | if __name__ == "__main__": 52 | app.run(main) 53 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/retokenize_inputs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Retokenize inputs by separating on punctuation.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.csl.tasks.smcalflow.tools import string_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | 23 | FLAGS = flags.FLAGS 24 | 25 | flags.DEFINE_string("input", "", "TSV file.") 26 | 27 | flags.DEFINE_string("output", "", "TSV file.") 28 | 29 | 30 | def main(unused_argv): 31 | examples = tsv_utils.read_tsv(FLAGS.input) 32 | new_examples = [] 33 | for source, target in examples: 34 | new_source = string_utils.format_source(source) 35 | new_examples.append((new_source, target)) 36 | tsv_utils.write_tsv(new_examples, FLAGS.output) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/string_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for processing SMCalFlow strings.""" 16 | 17 | import string 18 | 19 | 20 | def format_source(source): 21 | for char in string.punctuation: 22 | source = source.replace(char, " %s " % char) 23 | source = " ".join(source.split()) 24 | return source 25 | -------------------------------------------------------------------------------- /language/compgen/nqg/common/cky/cfg_rule.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Define structures to represent CFG symbols and rules. 16 | 17 | For efficiency, all symbols are referenced by integers rather than strings. 18 | This typically requires some pre-processing to define terminal 19 | and non-terminal vocabularies and map symbols to corresponding integers. 20 | """ 21 | 22 | import collections 23 | 24 | # CFGSymbol type constants. 25 | TERMINAL = 0 26 | NON_TERMINAL = 1 27 | 28 | # Represents a TERMINAL or NON_TERMINAL symbol. 29 | CFGSymbol = collections.namedtuple( 30 | "CFGSymbol", 31 | [ 32 | "idx", # Integer (separate id spaces are used for each symbol type). 33 | "type", # Integer (TERMINAL or NON_TERMINAL). 34 | ]) 35 | 36 | # Represents a CFG rule. 37 | CFGRule = collections.namedtuple( 38 | "CFGRule", 39 | [ 40 | "idx", # Integer to optionally reference additional rule information. 41 | "lhs", # Integer non-terminal index. 42 | "rhs", # Tuple of >= 1 CFGSymbols. 43 | ]) 44 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/induction/exact_match_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for exact_match_utils.""" 16 | 17 | from language.compgen.nqg.model.induction import exact_match_utils 18 | 19 | import tensorflow as tf 20 | 21 | 22 | class ExactMatchTest(tf.test.TestCase): 23 | 24 | def test_exact_match_1(self): 25 | dataset = [("salary between 8000 and 12000", 26 | "salaries between 8000 and 12000 .")] 27 | 28 | exact_match_rules = exact_match_utils.get_exact_match_rules(dataset) 29 | exact_match_rule_strings = {str(rule) for rule in exact_match_rules} 30 | self.assertEqual(exact_match_rule_strings, 31 | {"between 8000 and 12000 ### between 8000 and 12000"}) 32 | 33 | 34 | if __name__ == "__main__": 35 | tf.test.main() 36 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/config_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Function for loading config json file.""" 16 | 17 | import json 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def json_file_to_dict(json_file): 23 | """Constructs a dictionary from a json file.""" 24 | with gfile.GFile(json_file, "r") as reader: 25 | text = reader.read() 26 | return json.loads(text) 27 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/geoquery_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 25, 10 | "max_num_applications": 400, 11 | "max_num_numerator_nodes": 150, 12 | "max_num_denominator_nodes": 2000, 13 | "max_num_rules": 300 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/geoquery_xl_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 25, 10 | "max_num_applications": 2000, 11 | "max_num_numerator_nodes": 500, 12 | "max_num_denominator_nodes": 15000, 13 | "max_num_rules": 300 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/scan_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 24, 10 | "max_num_applications": 50, 11 | "max_num_numerator_nodes": 32, 12 | "max_num_denominator_nodes": 64, 13 | "max_num_rules": 30 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/spider_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 80, 10 | "max_num_applications": 200, 11 | "max_num_numerator_nodes": 100, 12 | "max_num_denominator_nodes": 500, 13 | "max_num_rules": 6000 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/data/data_constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants used in generating tf.Examples that are used across modules.""" 16 | 17 | # Forest node types. 18 | RULE_APPLICATION = 1 19 | AGGREGATION = 2 20 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/qcfg/qcfg_file.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Read and write QCFG grammars to/from human readable txt files.""" 16 | 17 | from language.compgen.nqg.model.qcfg import qcfg_rule 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def read_rules(filename): 23 | """Read rule txt file to list of rules.""" 24 | rules = [] 25 | with gfile.GFile(filename, "r") as txt_file: 26 | for line in txt_file: 27 | line = line.rstrip() 28 | rule = qcfg_rule.rule_from_string(line) 29 | rules.append(rule) 30 | print("Loaded %s rules from %s." % (len(rules), filename)) 31 | return rules 32 | 33 | 34 | def write_rules(rules, filename): 35 | """Write rules to txt file.""" 36 | with gfile.GFile(filename, "w") as txt_file: 37 | for rule in rules: 38 | line = "%s\n" % str(rule) 39 | txt_file.write(line) 40 | print("Wrote %s rules to %s." % (len(rules), filename)) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/geoquery/measure_compound_divergence.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Measures and prints compound divergence between two sets of examples.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import mcd_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | from language.compgen.nqg.tasks.geoquery import tmcd_utils 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input_1", "", "Input tsv file.") 27 | 28 | flags.DEFINE_string("input_2", "", "Input tsv file.") 29 | 30 | 31 | def main(unused_argv): 32 | examples_1 = tsv_utils.read_tsv(FLAGS.input_1) 33 | examples_2 = tsv_utils.read_tsv(FLAGS.input_2) 34 | divergence = mcd_utils.measure_example_divergence( 35 | examples_1, examples_2, get_compounds_fn=tmcd_utils.get_example_compounds) 36 | print("Compound divergence: %s" % divergence) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/scan/convert_to_tsv.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert SCAN txt format to standard TSV format.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | from tensorflow.io import gfile 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input", "", "Input txt file.") 27 | 28 | flags.DEFINE_string("output", "", "Output tsv file.") 29 | 30 | 31 | def load_examples(filename): 32 | """Load SCAN examples from original data file.""" 33 | examples = [] 34 | 35 | with gfile.GFile(filename, "r") as input_file: 36 | for line in input_file: 37 | splits = line.split("OUT:") 38 | # Trim "IN:" prefix. 39 | input_string = splits[0][3:].strip() 40 | output_string = splits[1].strip() 41 | examples.append((input_string, output_string)) 42 | 43 | return examples 44 | 45 | 46 | def main(unused_argv): 47 | examples = load_examples(FLAGS.input) 48 | tsv_utils.write_tsv(examples, FLAGS.output) 49 | 50 | 51 | if __name__ == "__main__": 52 | app.run(main) 53 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/measure_compound_divergence.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Measures and prints compound divergence between two sets of examples.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import mcd_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | from language.compgen.nqg.tasks.spider import tmcd_utils 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input_1", "", "Input tsv file.") 27 | 28 | flags.DEFINE_string("input_2", "", "Input tsv file.") 29 | 30 | 31 | def main(unused_argv): 32 | examples_1 = tsv_utils.read_tsv(FLAGS.input_1) 33 | examples_2 = tsv_utils.read_tsv(FLAGS.input_2) 34 | divergence = mcd_utils.measure_example_divergence( 35 | examples_1, examples_2, get_compounds_fn=tmcd_utils.get_example_compounds) 36 | print("Compound divergence: %s" % divergence) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/nqg_preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Pre-tokenize dataset for NQG, which uses space-separated tokenization. 16 | 17 | Input should be a TSV file, e.g. generated by applying `split_dataset.py` to 18 | the output of `spider/write_dataset.py`. 19 | """ 20 | 21 | from absl import app 22 | from absl import flags 23 | 24 | from language.compgen.nqg.tasks import tsv_utils 25 | 26 | from language.compgen.nqg.tasks.spider import nqg_tokenization 27 | 28 | FLAGS = flags.FLAGS 29 | 30 | flags.DEFINE_string("input", "", "Input tsv file.") 31 | 32 | flags.DEFINE_string("output", "", "Output tsv file.") 33 | 34 | 35 | def main(unused_argv): 36 | examples = tsv_utils.read_tsv(FLAGS.input) 37 | new_examples = [] 38 | for source, target in examples: 39 | new_examples.append((nqg_tokenization.process_source(source), 40 | nqg_tokenization.process_target(target))) 41 | tsv_utils.write_tsv(new_examples, FLAGS.output) 42 | 43 | 44 | if __name__ == "__main__": 45 | app.run(main) 46 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/sql_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for tokenizing SQL.""" 16 | 17 | import sqlparse 18 | 19 | 20 | def _is_whitespace(sqlparse_token): 21 | return sqlparse_token.ttype == sqlparse.tokens.Whitespace 22 | 23 | 24 | def tokenize_sql(sql_exp): 25 | """Lowercases SQL and returns non-whitespace tokens, e.g. "SELECT x;" -> ["select", "x"].""" 26 | sql_exp = sql_exp.lower() 27 | sql_exp = sql_exp.rstrip(";") 28 | parse = sqlparse.parse(sql_exp) 29 | sql = parse[0] 30 | flat_tokens = sql.flatten() 31 | sql_tokens = [ 32 | token.value for token in flat_tokens if not _is_whitespace(token) 33 | ] 34 | return sql_tokens 35 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/strip_targets.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Strip targets from a tsv file and write as newline-separated txt. 16 | 17 | The output file can be useful as input for generating predictions (e.g. for evaluation). 18 | """ 19 | 20 | from absl import app 21 | from absl import flags 22 | 23 | from language.compgen.nqg.tasks import tsv_utils 24 | 25 | from tensorflow.io import gfile 26 | 27 | FLAGS = flags.FLAGS 28 | 29 | flags.DEFINE_string("input", "", "Input tsv file.") 30 | 31 | flags.DEFINE_string("output", "", "Output txt file.") 32 | 33 | flags.DEFINE_string("prefix", "", "Optional prefix to prepend to source.") 34 | 35 | 36 | def main(unused_argv): 37 | examples = tsv_utils.read_tsv(FLAGS.input) 38 | with gfile.GFile(FLAGS.output, "w") as txt_file: 39 | for example in examples: 40 | txt_file.write("%s%s\n" % (FLAGS.prefix, example[0])) 41 | 42 | 43 | if __name__ == "__main__": 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/compir/utils/dataset_parser_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Gets the parser that corresponds to a specific dataset.""" 16 | 17 | 18 | from language.compir.dataset_parsers.cfq_parser import CfqParser 19 | from language.compir.dataset_parsers.dataset_parser import DatasetParserInterface 20 | from language.compir.dataset_parsers.scan_parser import ScanParser 21 | from language.compir.dataset_parsers.sql_parser import SqlParser 22 | 23 | dataset_parsers = { 24 | "scan": ScanParser, 25 | "cfq": CfqParser, 26 | "atis": SqlParser, 27 | "geo": SqlParser, 28 | "scholar": SqlParser 29 | } 30 | 31 | 32 | def get_parser(dataset): 33 | """Gets the parser that corresponds to a specific dataset.""" 34 | return dataset_parsers[dataset] 35 | -------------------------------------------------------------------------------- /language/decontext/eval_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | nltk 3 | numpy 4 | sentencepiece 5 | -------------------------------------------------------------------------------- /language/diffqg/.gitignore: -------------------------------------------------------------------------------- 1 | bleurt/ 2 | data/ 3 | -------------------------------------------------------------------------------- /language/diffqg/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/diffqg/requirements.txt: -------------------------------------------------------------------------------- 1 | bleurt @ git+https://github.com/google-research/bleurt 2 | absl-py==1.3.0 3 | huggingface-hub==0.10.1 4 | sentence-transformers==2.2.2 5 | rouge-score==0.1.2 6 | -------------------------------------------------------------------------------- /language/emql/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """EmQL library.""" 16 | -------------------------------------------------------------------------------- /language/frost/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/postprocessors.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """SeqIO postprocessors for wikidiff tasks.""" 16 | 17 | import json 18 | 19 | from language.fruit import tf_utils 20 | import tensorflow as tf 21 | 22 | 23 | @tf.autograph.experimental.do_not_convert 24 | def postprocess_wikidiff( 25 | output, 26 | vocabulary, 27 | normalize_fn, 28 | is_target=False, 29 | example=None, 30 | ): 31 | """Applies normalization to outputs.""" 32 | del is_target 33 | inputs = tf_utils.maybe_decode( 34 | vocabulary.decode_tf(example["inputs"]).numpy()) 35 | targets = tf_utils.maybe_decode(output) 36 | normalized_inputs, normalized_targets = normalize_fn(inputs, targets) 37 | results = { 38 | "inputs": 39 | inputs, 40 | "targets": 41 | targets, 42 | "normalized_inputs": 43 | normalized_inputs, 44 | "normalized_targets": 45 | normalized_targets, 46 | "generatable_surfaces": 47 | json.loads(tf_utils.maybe_decode(example["generatable_surfaces"])), 48 | } 49 | return results 50 | -------------------------------------------------------------------------------- /language/fruit/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/scripts/sample_data/pred.jsonl: -------------------------------------------------------------------------------- 1 | {"normalized_targets": "''The Lone Rider in Frontier Fury is a 1941 American Western film directed by Sam Newfield. The film stars George Houston as the \"Lone Rider\" and Al St. John as his sidekick \"Fuzzy\" Jones, with Hillary Brooke, Karl Hackett, Ted Adams and Arch Hall Sr. The film was released on August 8, 1941, by Producers Releasing Corporation. The film is also known as Frontier Fury in the United Kingdom and Rangeland Racket (American reissue title).''"} 2 | {"normalized_targets": "''William Emmanuel Bevan, known by his recording alias Burial, is a British electronic musician from South London. Initially remaining anonymous, Burial became the first artist signed to Kode9's electronic label Hyperdub in 2005. He won acclaim the following year for his self-titled debut album, which showcased a dark, emotive take on UK rave music styles such as UK garage and 2-step; it was named the album of the year by The Wire. Burial's second album, Untrue, was released to further critical acclaim in 2007.''"} 3 | {"normalized_targets": "''Primearth EV Energy Co., Ltd. (abbreviated as PEVE) is a Japanese manufacturer of prismatic nickel–metal hydride (NiMH) and lithium-ion battery packs for hybrid electric vehicles, located in Shizuoka Prefecture, Japan. 
PEVE's products had been solely based on NiMH until early 2011 when the company has started mass production of Li-ion battery.''"} 4 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_3b_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval finetuned T5 3B on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/3B.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 128 15 | # partitioning.PjitPartitioner: 16 | #   model_parallel_submesh=(1,1,1,1) 17 | 18 | # Ensure truncation 19 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 20 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 21 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_3b_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 3B on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | 6 | include 't5x/configs/runs/finetune.gin' 7 | include 't5x/examples/t5/t5_1_0/3B.gin' 8 | 9 | 10 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 512} 11 | TRAIN_STEPS = 1_030_000 12 | BATCH_SIZE = 128 13 | INITIAL_CHECKPOINT_PATH = '' 14 | USE_CACHED_TASKS = False 15 | partitioning.PjitPartitioner: 16 |   model_parallel_submesh=(4,8,1,2) 17 | 18 | # Ensure truncation during inference 19 | infer_eval/utils.DatasetConfig: 20 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 21 | 22 | trainer.Trainer: 23 |   num_microbatches = 4 24 | 25 | utils.SaveCheckpointConfig: 26 |   keep = 1 27 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_base_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Base on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/base.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_base_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Base on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/base.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = "" 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 
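17 | # That product is 1024 * 228 = 233472, which matches the value set below.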
18 | LOSS_NORMALIZING_FACTOR = 233472 19 | 20 | # Ensure truncation during inference 21 | infer_eval/utils.DatasetConfig: 22 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 23 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_large_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Large on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/large.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_large_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Large on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/large.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = %gin.REQUIRED 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 17 | LOSS_NORMALIZING_FACTOR = 233472 18 | 19 | # Ensure truncation during inference 20 | infer_eval/utils.DatasetConfig: 21 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 22 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_small_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Small on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/small.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_small_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Small on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/small.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = %gin.REQUIRED 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 
17 | LOSS_NORMALIZING_FACTOR = 233472 18 | 19 | # Ensure truncation during inference 20 | infer_eval/utils.DatasetConfig: 21 | task_feature_lengths = %TASK_FEATURE_LENGTHS 22 | -------------------------------------------------------------------------------- /language/fruit/testdata/test.diff.tfrecords: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/fruit/testdata/test.diff.tfrecords -------------------------------------------------------------------------------- /language/fruit/testdata/test_annotations.jsonl: -------------------------------------------------------------------------------- 1 | {"target":{"normalized_inputs": "foo", "inputs": "bar [CONTEXT] bizz", "normalized_targets": "baz"}, "prediction": {"normalized_targets": "buzz"}} 2 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_article_pairs.jsonl: -------------------------------------------------------------------------------- 1 | {"source_article": {"title": "foo", "ns": "0", "id": 0, "text": "is foo", "entities": [], "added_entities": []}, "target_article": {"title": "foo", "ns": "0", "id": 0, "text": "you is foo", "entities": [{"id": "you", "start": 0, "end": 3}], "added_entities": [{"id": "you", "start": 0, "end": 3}]}, "updated": true, "annotated_mentions": [{"mention": {"title": "you", "section": "INTRODUCTION", "text": "foo is you", "entities": [{"id": "foo", "start": 0, "end": 3}], "added_entities": [{"id": "foo", "start": 0, "end": 3}]}, "label": 1}]} 2 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_redirects.tsv: -------------------------------------------------------------------------------- 1 | foo bar 2 | bar bar 3 | baz baz 4 | qux quux 5 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_source_articles.jsonl: -------------------------------------------------------------------------------- 1 | {"title": "foo", "ns": "0", "id": 0, "redirect": "bar", "text": "not important."} 2 | {"title": "bar", "ns": "0", "id": 1, "text": "[[baz]] with an r."} 3 | {"title": "baz", "ns": "0", "id": 2, "text": "[[bar|Bar]] with a z."} 4 | {"title": "qux", "ns": "0", "id": 3, "redirect": "quux", "text": "not important"} 5 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_target_articles.jsonl: -------------------------------------------------------------------------------- 1 | {"title": "foo", "ns": "0", "id": 0, "redirect": "bar", "text": "not important."} 2 | {"title": "bar", "ns": "0", "id": 1, "text": "[[baz]] with an r. [[new ent]]."} 3 | {"title": "baz", "ns": "0", "id": 2, "text": "[[bar|Bar]] with a z. [[new ent]]."} 4 | {"title": "qux", "ns": "0", "id": 3, "redirect": "quux", "text": "not important"} 5 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/model/model_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Util functions for modeling.""" 16 | 17 | import jax.numpy as jnp 18 | 19 | 20 | def shift_right(x, axis=1): 21 |   """Drop the leading element on axis 1 and zero-pad at the end (values move one step earlier).""" 22 |   pad_widths = [(0, 0)] * len(x.shape) 23 |   pad_widths[axis] = (0, 1) 24 |   padded = jnp.pad( 25 |       x[:, 1:], pad_widths, mode='constant', constant_values=x.dtype.type(0)) 26 |   return padded 27 | 28 | 29 | def shift_left(x, axis=1): 30 |   """Drop the trailing element on axis 1 and zero-pad at the end (other values keep their positions).""" 31 |   pad_widths = [(0, 0)] * len(x.shape) 32 |   pad_widths[axis] = (0, 1) 33 |   padded = jnp.pad( 34 |       x[:, :-1], pad_widths, mode='constant', constant_values=x.dtype.type(0)) 35 |   return padded 36 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/predict_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for predict.""" 16 | 17 | import tempfile 18 | 19 | from language.gscan.xattn_model import predict 20 | from language.gscan.xattn_model import test_utils 21 | import tensorflow as tf 22 | 23 | 24 | class PredictTest(tf.test.TestCase): 25 | 26 |   def setUp(self): 27 |     super().setUp() 28 |     tf.config.experimental.set_visible_devices([], 'GPU') 29 | 30 |   def test_train_and_evaluate(self): 31 |     config = test_utils.get_test_config() 32 |     # Create a temporary directory where tensorboard metrics are written. 
33 | workdir = tempfile.mkdtemp() 34 | predict.predict_and_evaluate(workdir=workdir, config=config) 35 | 36 | 37 | if __name__ == '__main__': 38 | tf.test.main() 39 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/train.tfrecord: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/gscan/xattn_model/testdata/train.tfrecord -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/training_input_vocab.txt: -------------------------------------------------------------------------------- 1 | { 2 | "sos_token": "", 3 | "eos_token": "", 4 | "pad_token": "", 5 | "idx_to_word": [ 6 | "", 7 | "", 8 | "", 9 | "walk", 10 | "to", 11 | "a", 12 | "red", 13 | "circle", 14 | "green", 15 | "square", 16 | "yellow", 17 | "blue", 18 | "big", 19 | "small" 20 | ], 21 | "word_to_idx": { 22 | "": 0, 23 | "": 1, 24 | "": 2, 25 | "walk": 3, 26 | "to": 4, 27 | "a": 5, 28 | "red": 6, 29 | "circle": 7, 30 | "green": 8, 31 | "square": 9, 32 | "yellow": 10, 33 | "blue": 11, 34 | "big": 12, 35 | "small": 13 36 | }, 37 | "word_frequencies": { 38 | "walk": 1531, 39 | "to": 1531, 40 | "a": 1531, 41 | "red": 65, 42 | "circle": 869, 43 | "green": 146, 44 | "square": 662, 45 | "yellow": 78, 46 | "blue": 140, 47 | "big": 336, 48 | "small": 289 49 | } 50 | } -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/training_target_vocab.txt: -------------------------------------------------------------------------------- 1 | { 2 | "sos_token": "", 3 | "eos_token": "", 4 | "pad_token": "", 5 | "idx_to_word": [ 6 | "", 7 | "", 8 | "", 9 | "turn right", 10 | "walk", 11 | "turn left" 12 | ], 13 | "word_to_idx": { 14 | "": 0, 15 | "": 1, 16 | "": 2, 17 | "turn right": 3, 18 | "walk": 4, 19 | "turn left": 5 20 | }, 21 | "word_frequencies": { 22 | "turn right": 755, 23 | "walk": 4794, 24 | "turn left": 1361 25 | } 26 | } -------------------------------------------------------------------------------- /language/gscan/xattn_model/train_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for train.""" 16 | 17 | import tempfile 18 | 19 | from language.gscan.xattn_model import test_utils 20 | from language.gscan.xattn_model import train 21 | 22 | import tensorflow as tf 23 | 24 | 25 | class TrainTest(tf.test.TestCase): 26 | 27 | def setUp(self): 28 | super().setUp() 29 | tf.config.experimental.set_visible_devices([], 'GPU') 30 | 31 | def test_train_and_evaluate(self): 32 | config = test_utils.get_test_config() 33 | # Create a temporary directory where tensorboard metrics are written. 
34 |     workdir = tempfile.mkdtemp() 35 |     train.train_and_evaluate(workdir=workdir, config=config) 36 | 37 | 38 | if __name__ == '__main__': 39 |   tf.test.main() 40 | -------------------------------------------------------------------------------- /language/labs/README: -------------------------------------------------------------------------------- 1 | The "labs" folder contains projects that are works-in-progress but 2 | may still be useful to a broader community, for example, partial code that 3 | results from internships with the Google AI Language team. 4 | 5 | If you use any of this code in your work, please contact the contributors 6 | in the corresponding README to find the appropriate citation. 7 | -------------------------------------------------------------------------------- /language/labs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/bin/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/data_generators/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/models/losses_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for losses.py.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | from language.labs.consistent_zero_shot_nmt.models import losses 22 | import tensorflow.compat.v1 as tf 23 | 24 | 25 | class LossesTest(tf.test.TestCase): 26 | """Tests for losses.""" 27 | 28 | def test_l2_distance(self): 29 | """Tests l2 distance.""" 30 | with tf.Graph().as_default(): 31 | x = [1.0, 2.0] 32 | y = [3.0, 4.0] 33 | dist = losses.l2_distance(x=x, y=y) 34 | normalize_dist = losses.l2_distance(x=x, y=y, normalize=True) 35 | with tf.Session("") as sess: 36 | tf_dist, tf_normalize_dist = sess.run([dist, normalize_dist]) 37 | self.assertAllClose([tf_dist, tf_normalize_dist], [8.0, 0.0322602]) 38 | 39 | 40 | if __name__ == "__main__": 41 | tf.test.main() 42 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/modules/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Base functionality ofr modules.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import abc 22 | 23 | import six 24 | import tensorflow.compat.v1 as tf 25 | 26 | 27 | __all__ = ["AbstractNMTModule"] 28 | 29 | 30 | @six.add_metaclass(abc.ABCMeta) 31 | class AbstractNMTModule(object): 32 | """Abstract base class for neural machine translation modules.""" 33 | 34 | def __init__(self, name): 35 | """Creates a new NMT module. 36 | 37 | Args: 38 | name: String used as the scope name of the module's subgraph. 
39 | """ 40 | self.name = name 41 | 42 | def __call__(self, reuse=None, **kwargs): 43 | with tf.variable_scope(self.name, reuse=reuse): 44 | outputs = self._build(**kwargs) 45 | return outputs 46 | 47 | @abc.abstractmethod 48 | def _build(self, **kwargs): 49 | """Must be implemented by a subclass.""" 50 | raise NotImplementedError("Abstract Method") 51 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_europarl.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | ORIG_DATA_PATH="${EXP_DATASET_DIR}/original" 24 | OVERLAP_DATA_PATH="${EXP_DATASET_DIR}/overlap" 25 | TFRECORD_DATA_PATH="${EXP_DATASET_DIR}/tfrecords" 26 | TMP_DIR="${EXP_DATASET_DIR}/tmp" 27 | 28 | mkdir -p $TFRECORD_DATA_PATH $TMP_DIR 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 31 | --data_dir=${TFRECORD_DATA_PATH} \ 32 | --europarl_orig_data_path=${ORIG_DATA_PATH} \ 33 | --europarl_overlap_data_path=${OVERLAP_DATA_PATH} \ 34 | --problem=${EXP_PROBLEM_NAME} \ 35 | --tmp_dir=${TMP_DIR} \ 36 | --alsologtostderr 37 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_iwslt17.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | ORIG_DATA_PATH="${EXP_DATASET_DIR}/original" 24 | OVERLAP_DATA_PATH="${EXP_DATASET_DIR}/overlap" 25 | TFRECORD_DATA_PATH="${EXP_DATASET_DIR}/tfrecords" 26 | TMP_DIR="${EXP_DATASET_DIR}/tmp" 27 | 28 | mkdir -p $TFRECORD_DATA_PATH $TMP_DIR 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 31 | --data_dir=${TFRECORD_DATA_PATH} \ 32 | --iwslt17_orig_data_path=${ORIG_DATA_PATH} \ 33 | --iwslt17_overlap_data_path=${OVERLAP_DATA_PATH} \ 34 | --problem=${EXP_PROBLEM_NAME} \ 35 | --tmp_dir=${TMP_DIR} \ 36 | --alsologtostderr 37 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_uncorpus.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | PROBLEM=translate_uncorpus_exp1_lm 18 | DATA_DIR=$1 19 | TMP_DIR=$2 20 | UNCORPUS_ORIG_DATA_EXP1=$3 21 | UNCORPUS_ORIG_DATA_EXP1_LM=$4 22 | UNCORPUS_ORIG_DATA_EXP2=$5 23 | UNCORPUS_ORIG_DATA_EXP2_LM=$6 24 | 25 | mkdir -p $DATA_DIR $TMP_DIR 26 | 27 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 28 | --data_dir=$DATA_DIR \ 29 | --uncorpus_orig_data_exp1=$UNCORPUS_ORIG_DATA_EXP1 \ 30 | --uncorpus_orig_data_exp1_lm=$UNCORPUS_ORIG_DATA_EXP1_LM \ 31 | --uncorpus_orig_data_exp2=$UNCORPUS_ORIG_DATA_EXP2 \ 32 | --uncorpus_orig_data_exp2_lm=$UNCORPUS_ORIG_DATA_EXP2_LM \ 33 | --problem=$PROBLEM \ 34 | --tmp_dir=$TMP_DIR \ 35 | --alsologtostderr 36 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/run_nmt_experiment.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | rm -rf ${EXP_OUTPUT_DIR} 24 | 25 | # Additional parameters. 26 | EXP_HPARAMS="" 27 | EXP_TRAIN_STEPS=1000000 28 | EXP_LOCAL_EVAL_FREQ=500 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_trainer \ 31 | --problem=${EXP_PROBLEM_NAME} \ 32 | --model=${EXP_MODEL_NAME} \ 33 | --hparams=${EXP_HPARAMS} \ 34 | --hparams_set=${EXP_CONF_NAME} \ 35 | --data_dir=${EXP_DATASET_DIR}/tfrecords \ 36 | --train_steps=${EXP_TRAIN_STEPS} \ 37 | --output_dir=${EXP_OUTPUT_DIR} \ 38 | --local_eval_frequency=${EXP_LOCAL_EVAL_FREQ} \ 39 | --schedule=train_and_evaluate \ 40 | --alsologtostderr 41 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Common utilities.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | # Attention types. 23 | ATT_LUONG = "luong" 24 | ATT_LUONG_SCALED = "luong_scaled" 25 | ATT_BAHDANAU = "bahdanau" 26 | ATT_BAHDANAU_NORM = "bahdanau_norm" 27 | ATT_TYPES = (ATT_LUONG, ATT_LUONG_SCALED, ATT_BAHDANAU, ATT_BAHDANAU_NORM) 28 | 29 | # Encoder types. 
30 | ENC_UNI = "uni" 31 | ENC_BI = "bi" 32 | ENC_GNMT = "gnmt" 33 | ENC_TYPES = (ENC_UNI, ENC_BI, ENC_GNMT) 34 | 35 | # Decoder types. 36 | DEC_BASIC = "basic" 37 | DEC_ATTENTIVE = "attentive" 38 | DEC_TYPES = (DEC_BASIC, DEC_ATTENTIVE) 39 | 40 | 41 | # Language model types. 42 | LM_L2R = "left2right" 43 | LM_TYPES = (LM_L2R,) 44 | -------------------------------------------------------------------------------- /language/labs/drkit/README.md: -------------------------------------------------------------------------------- 1 | ## Multi-Hop Reasoning over a Virtual KB 2 | 3 | This repository contains the code for running multi-hop reasoning templates 4 | against a Virtual Knowledge Base (KB). A virtual KB is an index of contextual 5 | representations of entity mentions in text (here Wikipedia). Multi-hop 6 | reasoning is done using a combination of maximum inner product search (MIPS) 7 | over the index, followed by sparse matrix operations. The templates are 8 | used to answer natural language questions from three benchmarks. 9 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/scripts/run_demo.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | TEST_DIR="data/tiny-preprocessed-corpus" 19 | DRKIT_DIR="models/multihop" 20 | BERT_CKPT="models/answer" 21 | PASSAGES="data/tiny-wiki.json" 22 | OUTPUT="/tmp/demo" 23 | WEB="language/labs/drkit/hotpotqa/web" 24 | 25 | python language.labs.drkit.hotpotqa.demo \ 26 | --vocab_file $BERT_DIR/vocab.txt \ 27 | --bert_config_file $BERT_DIR/bert_config.json \ 28 | --output_dir $OUTPUT \ 29 | --init_checkpoint $DRKIT_DIR \ 30 | --hotpot_init_checkpoint $BERT_CKPT \ 31 | --raw_passages $PASSAGES \ 32 | --train_data_dir $TEST_DIR \ 33 | --model_type "hotpotqa" \ 34 | --sparse_strategy "sparse_first" \ 35 | --num_hops 2 \ 36 | --port 8888 \ 37 | --web_path $WEB \ 38 | --logtostderr 39 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/scripts/run_hotpotqa_answer.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | HOTPOT_DIR="hotpot" 19 | OUTPUT="models/answer" 20 | 21 | # Train answer extraction model. 22 | python -m language.labs.drkit.hotpotqa.answer_extractor \ 23 | --vocab_file $BERT_DIR/vocab.txt \ 24 | --bert_config_file $BERT_DIR/bert_config.json \ 25 | --init_checkpoint $BERT_DIR/bert_model.ckpt \ 26 | --output_dir $OUTPUT \ 27 | --train_file $HOTPOT_DIR/hotpot_train_v1.1.json \ 28 | --predict_file $HOTPOT_DIR/hotpot_dev_distractor_v1.json \ 29 | --do_train=True \ 30 | --do_predict=True \ 31 | --train_batch_size 32 \ 32 | --num_train_epochs 5.0 \ 33 | --use_tpu=False \ 34 | --logtostderr 35 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/web/static/drkit.css: -------------------------------------------------------------------------------- 1 | body { 2 | color: #5f6368; 3 | font-family: 'Google Sans', Arial, Helvetica, sans-serif; 4 | font-size: 16px; 5 | } 6 | 7 | div.page-background-image { 8 | padding: 20px; 9 | background-image: url(https://ai.google/static/images/about/about_hero.jpg); 10 | } 11 | 12 | /* Styling for the sub-card that holds the options controls. */ 13 | div.results { 14 | width: 800px; 15 | padding: 20px; 16 | } 17 | div.results table tr td.results-field-label { 18 | width: 220px; 19 | } 20 | div.results table tr td.results-field-textbox div { 21 | margin-right: 80px; 22 | width: 100px; 23 | } 24 | 25 | .answer table { 26 | border-spacing: 50px; 27 | border-collapse: separate; 28 | border: 1px solid grey; 29 | } 30 | 31 | -------------------------------------------------------------------------------- /language/labs/drkit/metaqa/scripts/index_metaqa_corpus.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_BASE_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | DATA_DIR="data/preprocessed" 19 | PRETRAIN_DIR="models/pretraining" 20 | 21 | for HOP in "1" "2" "3"; do 22 | 23 | # Index training corpus. 
24 | python -m language.labs.drkit.wikidata.index \ 25 | --data_dir $DATA_DIR \ 26 | --qry_dir "$DATA_DIR/$HOP-hop/" \ 27 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 28 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 29 | --multihop_output_dir="$DATA_DIR/$HOP-hop/indexed" \ 30 | --predict_batch_size=32 \ 31 | --output_dir="$PRETRAIN_DIR/$HOP-hop" \ 32 | --projection_dim=200 \ 33 | --pretrain_dir="$PRETRAIN_DIR/$HOP-hop" \ 34 | --max_seq_length 256 \ 35 | --logtostderr 36 | 37 | done 38 | -------------------------------------------------------------------------------- /language/labs/drkit/metaqa/scripts/run_metaqa_pretraining.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | set -e 18 | 19 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 20 | INIT_CKPT="../wikidata/models/pretraining/model.ckpt-?????" 21 | DATA_DIR="data/pretraining/2-hop" 22 | OUTPUT_DIR="models/pretraining/2-hop" 23 | 24 | python -m language.labs.drkit.run_dualencoder_lsf \ 25 | --vocab_file=$BERT_DIR/vocab.txt \ 26 | --bert_config_file=$BERT_DIR/bert_config.json \ 27 | --init_checkpoint=$INIT_CKPT \ 28 | --do_train=True \ 29 | --train_file=$DATA_DIR/train.json \ 30 | --do_predict=False \ 31 | --do_test=True \ 32 | --test_file=$DATA_DIR/dev.json \ 33 | --output_dir=$OUTPUT_DIR \ 34 | --projection_dim=200 \ 35 | --train_batch_size 48 \ 36 | --num_train_epochs 12.0 \ 37 | --max_seq_length 256 \ 38 | --logtostderr 39 | -------------------------------------------------------------------------------- /language/labs/drkit/wikidata/scripts/run_wikidata_pretraining.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #!/bin/bash 16 | 17 | set -e 18 | 19 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 20 | DATA_DIR="data/pretraining" 21 | OUTPUT_DIR="models/pretraining" 22 | 23 | python -m language.labs.drkit.run_dualencoder_lsf \ 24 | --vocab_file=$BERT_DIR/vocab.txt \ 25 | --bert_config_file=$BERT_DIR/bert_config.json \ 26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt \ 27 | --do_train=True \ 28 | --train_file=$DATA_DIR/train.json \ 29 | --do_predict=False \ 30 | --do_test=True \ 31 | --test_file=$DATA_DIR/dev.json \ 32 | --output_dir=$OUTPUT_DIR \ 33 | --projection_dim=200 \ 34 | --logtostderr 35 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/docs/giga_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dataset = "giga" 2 | use_bpe = true 3 | vocab_size = 26000 4 | use_copy = false 5 | reuse_attention = false 6 | random_neighbor = false 7 | use_cluster = false 8 | encode_neighbor = true 9 | sum_neighbor = false 10 | att_neighbor = false 11 | binary_neighbor = false 12 | binary_dim = 0 13 | neighbor_dim = 32 14 | num_neighbors = 10 15 | max_enc_steps = 1000 16 | max_dec_steps = 50 17 | max_grad_norm = 1.0 18 | num_eval_steps = 10000 19 | save_checkpoints_steps = 5000 20 | lr_schedule = 240000 21 | total_steps = 1000000 22 | beam_width = 10 23 | length_norm = 1.0 24 | coverage_penalty = 0. 
25 | batch_size = 64 26 | rnn_cell = "hyper_lstm" 27 | att_type = "luong" 28 | use_bridge = true 29 | use_residual = true 30 | trainer = "adam" 31 | num_mlp_layers = 1 32 | sampling_probability = 0.0 33 | sample_neighbor = false 34 | weight_decay = 1e-2 35 | tie_embedding = true 36 | decoder_drop = 0.0 37 | num_decoder_layers = 1 38 | sigma_norm = 16.0 39 | learning_rate = 1e-3 40 | emb_dim = 256 41 | num_encoder_layers = 3 42 | encoder_dim = 256 43 | drop = 0.15 44 | emb_drop = [0.15, 0.25, 0.35] // (one of these is optimal) 45 | out_drop = [0.15, 0.25, 0.35] // (one of these is optimal) 46 | encoder_drop = [0.0, 0.1] // (one of these is optimal) 47 | decoder_dim = 256 48 | rank = 256 49 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/docs/nyt_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dataset = "nyt" 2 | sigma_norm = 1.0 3 | weight_decay = 1e-2 4 | tie_embedding = true 5 | learning_rate = 1e-3 6 | emb_dim = 300 7 | num_encoder_layers = [1, 2] // (one of these is optimal) 8 | encoder_dim = 300 9 | drop = 0.2 10 | emb_drop = [0.15, 0.25] // (one of these is optimal) 11 | out_drop = [0.15, 0.25] // (one of these is optimal) 12 | encoder_drop = 0.0 13 | decoder_drop = 0.0 14 | num_decoder_layers = 1 15 | decoder_dim = 300 16 | rank = 300 17 | use_bpe = true 18 | vocab_size = 11000 // 11000 if using bpe, else 124500 19 | use_copy = true 20 | reuse_attention = false 21 | random_neighbor = false 22 | use_cluster = false 23 | encode_neighbor = true 24 | sum_neighbor = false 25 | att_neighbor = true 26 | binary_neighbor = false 27 | neighbor_dim = 150 28 | num_neighbors = 10 29 | max_enc_steps = 750 30 | max_dec_steps = 400 31 | beam_width = 10 32 | max_grad_norm = 0.1 33 | num_eval_steps = 10000 34 | save_checkpoints_steps = 500 35 | lr_schedule = 10000 36 | total_steps = 150000 37 | cp = 0.0 38 | length_norm = 1.0 39 | batch_size = 64 40 | rnn_cell = "hyper_lstm" 41 | att_type = "my" 42 | use_bridge = true 43 | use_residual = true 44 | trainer = "adam" 45 | sampling_probability = 0.25 46 | num_mlp_layers = 1 47 | sample_neighbor = false 48 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/utils/tensor_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for tensor_utils.py.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | from language.labs.exemplar_decoding.utils import tensor_utils 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | 26 | class TensorUtilsTest(tf.test.TestCase): 27 | 28 | def test_linear_interpolation(self): 29 | with tf.Graph().as_default(): 30 | result = tensor_utils.linear_interpolation([1, 2, 3, 4, 5], 2, 10) 31 | with tf.Session("") as sess: 32 | tf_result = sess.run(result) 33 | self.assertAllEqual(tf_result, [2, 4, 6, 8, 10]) 34 | 35 | 36 | if __name__ == "__main__": 37 | tf.test.main() 38 | -------------------------------------------------------------------------------- /language/labs/memory/README: -------------------------------------------------------------------------------- 1 | This project consists of experimental code to investigate a few different 2 | memory mechanisms on synthetic baselines. 
3 | 4 | Contributors: 5 | 6 | * Jessy Lin (jessylin@) 7 | * David Weiss (djweiss@) 8 | * Eugene Ie (eugeneie@) 9 | * Zora Tung (gatoatigrado@) 10 | -------------------------------------------------------------------------------- /language/labs/memory/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/README.md: -------------------------------------------------------------------------------- 1 | # MentionMemory model 2 | 3 | This repository contains the code for the MentionMemory project. 4 | 5 | ## Requirements 6 | 7 | ``` 8 | git clone https://github.com/google-research/language 9 | pip install -r language/mentionmemory/requirements.txt 10 | ``` 11 | 12 | Unit tests can be run via: 13 | 14 | ```bash 15 | python -m language.mentionmemory.run_tests 16 | ``` 17 | 18 | Note that these tests might need to be run independently: 19 | 20 | ```bash 21 | python -m language.mentionmemory.encoders.mention_memory_encoder_test 22 | python -m language.mentionmemory.encoders.readtwice_encoder_test 23 | python -m language.mentionmemory.modules.kmeans_test 24 | python -m language.mentionmemory.modules.memory_attention_layer_test 25 | python -m language.mentionmemory.modules.memory_extraction_layer_test 26 | python -m language.mentionmemory.modules.mention_losses_test 27 | python -m language.mentionmemory.tasks.mention_memory_task_test 28 | python -m language.mentionmemory.tasks.readtwice_task_test 29 | python -m language.mentionmemory.training.trainer_test 30 | python -m language.mentionmemory.utils.data_utils_test 31 | ``` 32 | 33 | When running the unit tests and all Python commands mentioned later, the current working directory must be the root of the git project. 34 | 35 | ## Pre-trained models. 36 | -------------------------------------------------------------------------------- /language/mentionmemory/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Find and register unittests.
16 | 17 | See https://docs.python.org/3/library/unittest.html#load-tests-protocol 18 | for details or 19 | https://github.com/python/cpython/blob/main/Lib/unittest/test/__main__.py 20 | for a sample implementation. 21 | """ 22 | 23 | import os 24 | 25 | 26 | def load_tests(loader, standard_tests, unused_pattern): 27 | """Our tests end in `_test.py`, so we need to override the test discovery.""" 28 | this_dir = os.path.dirname(__file__) 29 | package_tests = loader.discover(start_dir=this_dir, pattern="*_test.py") 30 | standard_tests.addTests(package_tests) 31 | return standard_tests 32 | -------------------------------------------------------------------------------- /language/mentionmemory/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/encoders/import_encoders.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Import encoders so that decorated encoders are added to registry.""" 16 | 17 | # pylint: disable=unused-import 18 | from language.mentionmemory.encoders import bert_encoder 19 | from language.mentionmemory.encoders import eae_encoder 20 | from language.mentionmemory.encoders import mauto_encoder 21 | from language.mentionmemory.encoders import mention_memory_encoder 22 | from language.mentionmemory.encoders import readtwice_encoder 23 | 24 | # pylint: enable=unused-import 25 | -------------------------------------------------------------------------------- /language/mentionmemory/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py>=0.10.0 2 | clu>=0.0.3 3 | flax>=0.3.4 4 | jax>=0.2.14 5 | ml_collections>=0.1 6 | numpy>=1.16 7 | spacy>=3.1.2 8 | scikit-learn>=0.24.2 9 | scipy>=1.2.1 10 | tensorflow>=1.15.0 11 | -------------------------------------------------------------------------------- /language/mentionmemory/run.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | # Install dependencies and run tests. 20 | 21 | virtualenv -p python3 . 22 | source ./bin/activate 23 | 24 | pip install tensorflow 25 | pip install -r language/mentionmemory/requirements.txt 26 | python -m language.mentionmemory.run_tests 27 | -------------------------------------------------------------------------------- /language/mentionmemory/run_tests.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Find and run the tests. 16 | 17 | Run as: python -m language.mentionmemory.run_tests 18 | """ 19 | from absl.testing import absltest 20 | import language.mentionmemory 21 | 22 | absltest.main(module=language.mentionmemory) 23 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/import_tasks.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Import tasks so that decorated tasks are added to registry.""" 16 | 17 | # pylint: disable=unused-import 18 | # Block of imports needed to allow different tasks to get registered 19 | # with task registry. 
20 | from language.mentionmemory.tasks import eae_task 21 | from language.mentionmemory.tasks import embedding_based_entity_qa_task 22 | from language.mentionmemory.tasks import example_task 23 | from language.mentionmemory.tasks import mauto_task 24 | from language.mentionmemory.tasks import mention_based_entity_qa_task 25 | from language.mentionmemory.tasks import mention_memory_task 26 | from language.mentionmemory.tasks import readtwice_task 27 | from language.mentionmemory.tasks import relation_classifier_task 28 | from language.mentionmemory.tasks import text_classifier 29 | from language.mentionmemory.tasks import ultra_fine_entity_typing_task 30 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/testdata/tacred/README.md: -------------------------------------------------------------------------------- 1 | Sample predictions (test set) for SpanBERT were downloaded from 2 | https://github.com/DFKI-NLP/tacrev/blob/master/results/test_results/spanbert_tacred_test.txt 3 | -------------------------------------------------------------------------------- /language/mentionmemory/training/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/custom_types.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Contains custom type definitions.""" 16 | from typing import Any, Callable, Dict, Sequence 17 | 18 | import jax.numpy as jnp 19 | 20 | Array = jnp.ndarray 21 | PRNGKey = jnp.ndarray 22 | Dtype = Any 23 | Shape = Sequence[int] 24 | InitType = Callable[[PRNGKey, Shape, Dtype], Array] 25 | MetricGroups = Dict[str, Dict[str, Array]] 26 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/default_values.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Gather default values in central location.""" 16 | 17 | import flax.linen as nn 18 | 19 | from language.mentionmemory.utils import initializers 20 | 21 | kernel_init = initializers.truncated_normal(stddev=0.02) 22 | bias_init = nn.initializers.zeros 23 | layer_norm_epsilon = 1e-12 24 | 25 | CLS_TOKEN = 101 26 | SEP_TOKEN = 102 27 | MASK_TOKEN = 103 28 | ENTITY_START_TOKEN = 1 29 | ENTITY_END_TOKEN = 2 30 | 31 | # Value typically used to prevent division by zero. 32 | SMALL_NUMBER = 1e-8 33 | # Large value used as an effective infinity (e.g., for masking). 34 | LARGE_NUMBER = 1e10 35 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/initializers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Contains custom parameter initializers.""" 16 | 17 | import jax 18 | import jax.numpy as jnp 19 | 20 | from language.mentionmemory.utils.custom_types import Array, Dtype, InitType, PRNGKey, Shape # pylint: disable=g-multiple-import 21 | 22 | 23 | def truncated_normal(stddev: float) -> InitType: 24 | """Truncated normal initializer.""" 25 | 26 | def init(key: PRNGKey, shape: Shape, dtype: Dtype = jnp.float32) -> Array: 27 | return jax.random.truncated_normal( 28 | key=key, lower=-2., upper=2., shape=shape, dtype=dtype) * stddev 29 | 30 | return init 31 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/testdata/eae_paper-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/mentionmemory/utils/testdata/eae_paper-00000-of-00001 -------------------------------------------------------------------------------- /language/mentionmemory/utils/testdata/mtb.v5-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/mentionmemory/utils/testdata/mtb.v5-00000-of-00001 -------------------------------------------------------------------------------- /language/multivec/requirements.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | tf-hub-nightly 3 | scann 4 | -------------------------------------------------------------------------------- /language/multivec/utils/download.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | mkdir $DATA_DIR 17 | cd $DATA_DIR 18 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz 19 | wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz 20 | tar -xf collectionandqueries.tar.gz 21 | tar -xf top1000.dev.tar.gz 22 | 23 | -------------------------------------------------------------------------------- /language/nqg/README.md: -------------------------------------------------------------------------------- 1 | This directory has moved to `../compgen/nqg/`. 2 | 3 | https://github.com/google-research/language/tree/master/language/compgen/nqg 4 | -------------------------------------------------------------------------------- /language/nql/demos/data/royal92/README.md: -------------------------------------------------------------------------------- 1 | # royal92 data 2 | 3 | This file contains instances of 12 familial relations which were extracted from royal92.ged, a widely-distributed public domain GEDCOM file containing 4 | information on 3010 individuals and 1422 families of European royalty. 
The 5 | 12 relations are those used originally in (Hinton, 1986). The parser used to convert the data was modified from code distributed with (Yang et al., 2017). 6 | 7 | Comment from original data source on http://www.daml.org/2001/01/gedcom/: _royal92.ged is a public domain GEDCOM file containing information on 3010 8 | individuals and 1422 families of European royalty. royal92.daml was produced 9 | using ged2daml._ 10 | 11 | ## Bibliography 12 | 13 | * Hinton, G.E. (1986). _Learning distributed representations of concepts._ Proceedings of the Eighth Annual Conference of the Cognitive Science Society. 14 | * Paper URL: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.408.7684&rep=rep1&type=pdf 15 | * Yang, Fan, Zhilin Yang, and William W. Cohen (2017). _Differentiable 16 | learning of logical rules for knowledge base reasoning_, in NeurIPS 2017. 17 | * Paper URL: http://papers.nips.cc/paper/6826-differentiable-learning-of-logical-rules-for-knowledge-base-reasoning 18 | * GitHub URL: https://github.com/fanyangxyz/Neural-LP 19 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/README.txt: -------------------------------------------------------------------------------- 1 | Scalability experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To create something very similar to the figure 1 graphic: 5 | 6 | % cd [this directory] 7 | % bash figure1.bash 8 | 9 | Output will be in $HOME/new-results/figure1.png. 10 | 11 | 12 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/figure1.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # where experimental data will be stored 4 | DATA_DIR=${HOME}/new-results 5 | DATA_STEM=${DATA_DIR}/nql 6 | SOURCE_DIR=`pwd` 7 | 8 | # generate data 9 | 10 | bash ${SOURCE_DIR}/gendata_figure1.bash ${SOURCE_DIR} ${DATA_STEM} ${DATA_DIR} 11 | 12 | # generate plots from data 13 | 14 | python ${SOURCE_DIR}/plot_figure1.py ${DATA_STEM} ${DATA_DIR} 15 | 16 | -------------------------------------------------------------------------------- /language/nql/demos/metaqa/README.txt: -------------------------------------------------------------------------------- 1 | MetaQA experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To reproduce experiment results in Table 3: 5 | 6 | 0. cd [this directory] 7 | 8 | 1.
Download MetaQA datasets from 9 | 10 | https://github.com/yuyuz/MetaQA 11 | 12 | 2. Preprocess data 13 | 14 | python preprocess_data.py 15 | 16 | 3. Run TensorFlow experiments 17 | 18 | MetaQA-2hop: 19 | python metaqa.py --rootdir /home/haitiansun/metaqa --num_hops=2 \ 20 | --train_file=qa_van2_train.exam --dev_file=qa_van2_dev.exam \ 21 | --test_file=qa_van2_test.exam --mask_seeds=False 22 | 23 | MetaQA-3hop: 24 | python metaqa.py --rootdir /home/haitiansun/metaqa --num_hops=3 \ 25 | --train_file=qa_van3_train.exam --dev_file=qa_van3_dev.exam \ 26 | --test_file=qa_van3_test.exam --mask_seeds=False 27 | 28 | Note: you may set --mask_seeds=True for results with "ReifKB + mask" 29 | -------------------------------------------------------------------------------- /language/nql/demos/nell995/README.txt: -------------------------------------------------------------------------------- 1 | Nell995 experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To reproduce experiment results in Table 4: 5 | 6 | 0. cd [this directory] 7 | 8 | 1. Download NELL995 datasets from 9 | 10 | git clone https://github.com/shehzaadzd/MINERVA.git 11 | 12 | 2. Preprocess data 13 | 14 | python preprocess_data.py 15 | 16 | 3. Run TensorFlow experiments 17 | 18 | python nell995.py --rootdir=nell995/ --task=concept_athletehomestadium \ 19 | --num_hops=5 --epochs=50 20 | 21 | Note: you may change --task to run experiments on other queries. 22 | 23 | Available tasks: 24 | concept_agentbelongstoorganization 25 | concept_athletehomestadium 26 | concept_athleteplaysforteam 27 | concept_athleteplaysinleague 28 | concept_athleteplayssport 29 | concept_organizationheadquarteredincity 30 | concept_organizationhiredperson 31 | concept_personborninlocation 32 | concept_personleadsorganization 33 | concept_teamplaysinleague 34 | concept_teamplayssport 35 | concept_worksfor 36 | -------------------------------------------------------------------------------- /language/nql/setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Install Neural Query Language.""" 16 | 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | setup( 21 | name="nql", 22 | version="0.0.1.dev", 23 | packages=find_packages(), 24 | description="Neural Query Language", 25 | author="Google Inc.", 26 | author_email="no-reply@google.com", 27 | url="https://github.com/google-research/language/tree/master/language/nql", 28 | license="Apache 2.0", 29 | install_requires=[ 30 | "tensorflow-gpu", 31 | "scipy", 32 | "mock", 33 | "numpy", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /language/orqa/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ORQA ops.""" 16 | import os 17 | import tensorflow.compat.v1 as tf 18 | 19 | try: 20 | orqa_ops 21 | except NameError: 22 | orqa_ops = tf.load_op_library( 23 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "orqa_ops.so")) 24 | 25 | has_answer = orqa_ops.has_answer 26 | reader_inputs = orqa_ops.reader_inputs 27 | -------------------------------------------------------------------------------- /language/orqa/predict/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/orqa/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow~=2.1.0 2 | tensorflow-text~=2.1.0 3 | tf-models-official==2.1.0.dev2 4 | bert-tensorflow==1.0.4 5 | tf-hub-nightly 6 | Jinja2~=2.11.2 7 | tornado~=4.5.1 8 | wikiextractor==0.1 9 | sentencepiece==0.1.91 10 | beautifulsoup4==4.9.3 11 | lxml==4.6.3 12 | https://storage.googleapis.com/scann/releases/1.0.0/scann-1.0.0-cp37-cp37m-linux_x86_64.whl 13 | -------------------------------------------------------------------------------- /language/orqa/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/utils/scann_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for scann_utils.py.""" 16 | import os 17 | 18 | from language.orqa.utils import scann_utils 19 | import numpy as np 20 | import tensorflow.compat.v1 as tf 21 | 22 | 23 | class ScannUtilsTest(tf.test.TestCase): 24 | 25 | def test_scann_searcher(self): 26 | temp_dir = self.create_tempdir().full_path 27 | checkpoint_path = os.path.join(temp_dir, "dummy_db.ckpt") 28 | 29 | dummy_db = np.random.uniform(size=[1024, 32]).astype(np.float32) 30 | scann_utils.write_array_to_checkpoint("dummy_db", dummy_db, checkpoint_path) 31 | 32 | dummy_queries = np.random.uniform(size=[4, 32]).astype(np.float32) 33 | _, searcher = scann_utils.load_scann_searcher( 34 | var_name="dummy_db", checkpoint_path=checkpoint_path, num_neighbors=10) 35 | distance, index = searcher.search_batched(dummy_queries) 36 | self.assertAllEqual(distance.numpy().shape, [4, 10]) 37 | self.assertAllEqual(index.numpy().shape, [4, 10]) 38 | 39 | 40 | if __name__ == "__main__": 41 | tf.test.main() 42 | -------------------------------------------------------------------------------- /language/qa_counterfactuals/figure1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/qa_counterfactuals/figure1.jpeg -------------------------------------------------------------------------------- /language/qresp/README.md: -------------------------------------------------------------------------------- 1 | Code for [Entity-Centric Query Refinement] (https://arxiv.org/abs/2204.00743) will be released here. 2 | -------------------------------------------------------------------------------- /language/quest/common/document_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for reading and writing documents files.""" 16 | 17 | import dataclasses 18 | 19 | from language.quest.common import jsonl_utils 20 | 21 | 22 | @dataclasses.dataclass(frozen=True) 23 | class Document: 24 | """Represents a document with its title and text.""" 25 | # Document title (should be unique in corpus). 26 | title: str 27 | # Document text. 28 | text: str 29 | 30 | 31 | def read_documents(filepath, limit=None): 32 | documents_json = jsonl_utils.read(filepath, limit=limit, verbose=True) 33 | return [Document(**document) for document in documents_json] 34 | 35 | 36 | def write_documents(filepath, documents): 37 | documents_json = [dataclasses.asdict(document) for document in documents] 38 | jsonl_utils.write(filepath, documents_json) 39 | -------------------------------------------------------------------------------- /language/quest/common/vocab_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for dealing with T5 sentence piece model.""" 16 | 17 | from sentencepiece import SentencePieceProcessor 18 | 19 | 20 | class T5SpmWrapper(object): 21 | """Wrapper for T5 sentence piece model.""" 22 | 23 | def __init__(self, sp_model): 24 | self.sp = SentencePieceProcessor() 25 | self.sp.Load(sp_model) 26 | 27 | def tokenize(self, input_string): 28 | """Return list of tokens for input.""" 29 | return self.sp.EncodeAsPieces(input_string) 30 | 31 | def truncate(self, input_string, num_tokens): 32 | """Truncate input to be `num_tokens`.""" 33 | tokens = self.sp.EncodeAsPieces(input_string) 34 | truncated_tokens = tokens[:num_tokens] 35 | return self.sp.DecodePieces(truncated_tokens) 36 | -------------------------------------------------------------------------------- /language/quest/eval/README.md: -------------------------------------------------------------------------------- 1 | These scripts expect that systems have produced predictions 2 | following the same jsonl format as the original examples files. Only the `query` and `docs` fields need to be populated 3 | for predictions. 4 | 5 | Use `run_eval.py` to compute average precision, recall, and F1. 6 | 7 | To analyze the average recall and MRecall of a candidate set produced by a retriever prior to thresholding or classifying candidates to produce a final set, use `analyze_retriever.py`. 8 | -------------------------------------------------------------------------------- /language/quest/t5xr/README.md: -------------------------------------------------------------------------------- 1 | We provide data preprocessing scripts to help set up dual 2 | encoder experiments. To run fine-tuning and inference, follow 3 | the instructions in the `t5x_retrieval` library: 4 | 5 | https://github.com/google-research/t5x_retrieval 6 | 7 | You can use `write_doc_idx_maps.py` and `convert_examples.py` to 8 | convert examples and documents jsonl files to the indexed format used by the `t5x_retrieval` library. 9 | -------------------------------------------------------------------------------- /language/quest/xattn/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scripts to train a T5-based cross-attention classifier. 2 | The codebase relies upon the [t5x repository](https://github.com/google-research/t5x). 3 | Follow instructions from that library to define a task, run fine-tuning, and 4 | generate scores at inference time. 5 | 6 | To generate training examples, you can run `gen_training_examples.py`. 7 | This script can also be run on the validation set to generate an evaluation set to efficiently 8 | evaluate model performance during fine-tuning. 9 | 10 | To generate predictions, you should first run `gen_inference_inputs.py`. Then, generate scores following the inference instructions from the `t5x` library with `--gin.infer.mode="'score'"`.
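For reference, the text format of each example follows `INPUT_FORMAT`, `POS_LABEL`, and `NEG_LABEL` from `xattn_utils.py` (shown below). Here is a minimal sketch of how a single example is rendered; the query and document are hypothetical and used only to illustrate the format:

```python
from language.quest.xattn import xattn_utils

# Hypothetical query/document pair, shown only to illustrate the format.
input_string = xattn_utils.INPUT_FORMAT.format(
    query="novels about whaling",
    doc="Moby-Dick Moby-Dick is an 1851 novel by Herman Melville.")
# input_string == "query: novels about whaling, doc: Moby-Dick Moby-Dick is an 1851 novel by Herman Melville."
target_string = xattn_utils.POS_LABEL  # "relevant"; irrelevant pairs use NEG_LABEL.
```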
11 | You can determine a threshold by running `determine_threshold.py` on the validation 12 | set. 13 | Then, you can run `filter_predictions.py` to filter a set of retrieved documents based on the cross-attention classifier. 14 | -------------------------------------------------------------------------------- /language/quest/xattn/xattn_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Common constants and functions for xattn model.""" 16 | 17 | # Labels used for T5 output. 18 | POS_LABEL = "relevant" 19 | NEG_LABEL = "not relevant" 20 | 21 | # Input format for T5. 22 | INPUT_FORMAT = "query: {query}, doc: {doc}" 23 | 24 | 25 | def get_example( 26 | query, 27 | doc_title, 28 | doc_title_to_text, 29 | spm_wrapper, 30 | is_relevant, 31 | context_size 32 | ): 33 | """Returns an (input, output) example tuple for a query/document pair.""" 34 | if doc_title not in doc_title_to_text: 35 | raise Exception("Missing document title: %s" % doc_title) 36 | 37 | doc_text = doc_title + " " + doc_title_to_text[doc_title] 38 | truncated_text = spm_wrapper.truncate(doc_text, context_size) 39 | input_string = INPUT_FORMAT.format( 40 | query=query, doc=truncated_text) 41 | output_string = ( 42 | POS_LABEL if is_relevant else NEG_LABEL) 43 | return (input_string, output_string) 44 | -------------------------------------------------------------------------------- /language/question_answering/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/b2t2/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tensorflow 3 | bert-tensorflow 4 | -------------------------------------------------------------------------------- /language/question_answering/bert_joint/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/realm/preprocessing.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "tensorflow/core/example/example.proto"; 4 | 5 | package language.realm; 6 | 7 | service Preprocessing { 8 | // Return a tf.Example given an unused (usually empty) input tf.Example. 9 | rpc PopExample(tensorflow.Example) returns (tensorflow.Example) { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /language/relation_learning/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/relation_learning/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/relation_learning/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/search_agents/demo.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for testing the environment server.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | from absl import logging 20 | import grpc 21 | 22 | from language.search_agents import environment_pb2 23 | from language.search_agents import environment_pb2_grpc 24 | 25 | flags.DEFINE_string('server_address', 'localhost:50055', 26 | 'Address of the Environment Server.') 27 | FLAGS = flags.FLAGS 28 | 29 | 30 | def main(_): 31 | channel_creds = grpc.local_channel_credentials() 32 | channel = grpc.secure_channel(FLAGS.server_address, channel_creds) 33 | grpc.channel_ready_future(channel).result(timeout=10) 34 | stub = environment_pb2_grpc.EnvironmentServiceStub(channel) 35 | 36 | request = environment_pb2.GetQueryRequest() 37 | response = stub.GetQuery(request, timeout=10) 38 | logging.info('\n\nReceived GetQueryResponse:\n%s\n', response) 39 | 40 | 41 | if __name__ == '__main__': 42 | app.run(main) 43 | -------------------------------------------------------------------------------- /language/search_agents/muzero/utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for language.search_agents.muzero.utils.py.""" 16 | 17 | from language.search_agents.muzero import utils 18 | 19 | import tensorflow as tf 20 | 21 | 22 | class UtilsTest(tf.test.TestCase): 23 | 24 | def test_escape_for_lucene(self): 25 | self.assertEqual(utils.escape_for_lucene("foo:bar-baz"), "foo\\:bar\\-baz") 26 | 27 | 28 | if __name__ == "__main__": 29 | tf.test.main() 30 | -------------------------------------------------------------------------------- /language/search_agents/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | apache_beam 3 | attrs 4 | cloudpickle==1.3.0 5 | grpcio>=1.32.0 6 | grpcio-tools 7 | gym 8 | keras 9 | nltk 10 | numpy 11 | pygtrie 12 | tensorflow==2.4.1 13 | tensorflow-addons 14 | tensorflow-probability==0.11.0 15 | tensorflow-serving-api 16 | tensorflow_text 17 | transformers 18 | -------------------------------------------------------------------------------- /language/serene/constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants for fever data.""" 16 | 17 | 18 | VERIFIABLE = 'VERIFIABLE' 19 | NOT_VERIFIABLE = 'NOT VERIFIABLE' 20 | 21 | # Classes used for claim classification and labeling which evidence 22 | # support/refute the claim 23 | NOT_ENOUGH_INFO = 'NOT ENOUGH INFO' 24 | REFUTES = 'REFUTES' 25 | SUPPORTS = 'SUPPORTS' 26 | FEVER_CLASSES = [REFUTES, SUPPORTS, NOT_ENOUGH_INFO] 27 | 28 | # Classes used for scoring candidate evidence relevance 29 | MATCHING = 'MATCHING' 30 | NOT_MATCHING = 'NOT_MATCHING' 31 | EVIDENCE_MATCHING_CLASSES = [NOT_MATCHING, MATCHING] 32 | 33 | UKP_WIKI = 'ukp_wiki' 34 | UKP_PRED = 'ukp_pred' 35 | UKP_TYPES = [UKP_PRED, UKP_WIKI] 36 | DRQA = 'drqa' 37 | LUCENE = 'lucene' 38 | DOC_TYPES = [UKP_WIKI, UKP_PRED, DRQA, LUCENE] 39 | -------------------------------------------------------------------------------- /language/serene/fever.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package language.google.fact_check; 4 | 5 | message WikipediaDump { 6 | message Entity { 7 | optional string mention = 1; 8 | optional string entity = 2; 9 | } 10 | 11 | message Sentence { 12 | optional string text = 1; 13 | repeated Entity entities = 2; 14 | } 15 | 16 | optional string id = 1; 17 | optional string text = 2; 18 | optional string title = 3; 19 | map sentences = 4; 20 | } 21 | 22 | message FeverExample { 23 | // Format is 24 | // [Annotation ID, Evidence ID, Wikipedia URL, sentence ID] 25 | // see http://fever.ai/2018/task.html#TrainingDevelopment_Data_format_30 26 | // We do not care about the unused Annotation and Evidence IDs, and optionally 27 | // add `text' which represents the actual sentence contents. 
28 | message Evidence { 29 | optional string wikipedia_url = 1; 30 | optional string sentence_id = 2; 31 | 32 | // Not populated in gold data. 33 | optional string sentence = 3; 34 | 35 | optional string page_title = 4; 36 | } 37 | 38 | message EvidenceSet { 39 | repeated Evidence evidence = 1; 40 | } 41 | 42 | enum Label { 43 | UNKNOWN_LABEL = 0; 44 | SUPPORTS = 1; 45 | REFUTES = 2; 46 | NOT_ENOUGH_INFO = 3; 47 | } 48 | 49 | optional string id = 1; 50 | optional Label label = 2; 51 | 52 | optional string claim = 3; 53 | 54 | repeated EvidenceSet evidences = 4; 55 | } 56 | -------------------------------------------------------------------------------- /language/serene/retrieval.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package language.google.fact_check; 4 | 5 | message Document { 6 | optional string doc_id = 1; 7 | optional string content = 2; 8 | optional double ir_score = 3; 9 | } 10 | 11 | message GetDocumentsResponse { 12 | repeated Document documents = 1; 13 | } 14 | 15 | message GetDocumentsRequest { 16 | optional string query = 1; 17 | optional int32 max_num_results = 2; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /language/serene/serene.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Minimal build target for initial checkin.""" 16 | -------------------------------------------------------------------------------- /language/serene/types.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Types for fever data.""" 16 | 17 | import dataclasses 18 | from typing import Any, Dict, List, Optional, Text, Tuple 19 | 20 | Json = Dict[Text, Any] 21 | 22 | 23 | # An evidence set contains a list of tuples, each representing one line 24 | # of evidence. 25 | # First two ints are IDs competition runners use, then the wiki page, then the 26 | # sentence number.
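27 | # For example, a raw json entry like [12345, 67890, "Some_Page", 3] (these 28 | # values are illustrative, not taken from the dataset) corresponds to 29 | # Evidence(annotation_id=12345, evidence_id=67890, wikipedia_url="Some_Page", 30 | # sentence_id=3).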
31 | @dataclasses.dataclass 32 | class Evidence: 33 | annotation_id: Optional[int] 34 | evidence_id: int 35 | # fever_identifier: not actually a url, but page title 36 | wikipedia_url: Optional[Text] 37 | sentence_id: Optional[int] 38 | 39 | 40 | # This must go after Evidence, otherwise python cannot parse it 41 | EvidenceSet = List[Evidence] 42 | EvidenceFromJson = Tuple[Optional[int], int, Optional[Text], Optional[int]] 43 | 44 | 45 | @dataclasses.dataclass 46 | class FeverMetrics: 47 | strict_score: float 48 | accuracy_score: float 49 | precision: float 50 | recall: float 51 | f1: float 52 | n_examples: int 53 | -------------------------------------------------------------------------------- /language/serene/web_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """A simple web api wrapper around the wikipedia sql db. 16 | 17 | This is helpful when trying to query the fever wikipedia dump without needing 18 | to directly access the protobufs. 19 | """ 20 | from absl import app 21 | from absl import flags 22 | import flask 23 | from language.serene import wiki_db 24 | 25 | FLAGS = flags.FLAGS 26 | flags.DEFINE_string('wiki_db_path', None, 'Path to the wikipedia database.') 27 | 28 | 29 | def main(_): 30 | db = wiki_db.WikiDatabase.from_local(FLAGS.wiki_db_path) 31 | flask_app = flask.Flask(__name__) 32 | 33 | @flask_app.route('/wiki_page_sentence', methods=['POST']) 34 | def get_page_sentence(): # pylint: disable=unused-variable 35 | request = flask.request.json 36 | maybe_sentence = db.get_page_sentence(request['wikipedia_url'], 37 | int(request['sentence_id'])) 38 | return flask.jsonify({'text': maybe_sentence}) 39 | 40 | flask_app.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/table_text_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | -------------------------------------------------------------------------------- /language/templama/install.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | SLING_BASE="$1" 18 | 19 | cd "$SLING_BASE" 20 | 21 | # Install SLING via pip. 22 | sudo -H pip3 install https://ringgaard.com/data/dist/sling-3.0.0-py3-none-linux_x86_64.whl 23 | 24 | # Download SLING KB and en wikipedia mapping. 25 | sling fetch --dataset kb,mapping 26 | -------------------------------------------------------------------------------- /language/templama/templates.csv: -------------------------------------------------------------------------------- 1 | Wikidata ID,Relation,Template 2 | P54,member of sports team,<subject> plays for <object>. 3 | P39,position held,<subject> holds the position of <object>. 4 | P108,employer,<subject> works for <object>. 5 | P102,political party,<subject> is a member of the <object>. 6 | P286,head coach,<object> is the head coach of <subject>. 7 | P69,educated at,<subject> attended <object>. 8 | P488,chairperson,<object> is the chair of <subject>. 9 | P6,head of government,<object> is the head of the government of <subject>. 10 | P127,owned by,<subject> is owned by <object>. 11 | -------------------------------------------------------------------------------- /language/totto/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | -------------------------------------------------------------------------------- /language/totto/eval_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | pytest 3 | sacrebleu 4 | six 5 | wheel 6 | -------------------------------------------------------------------------------- /language/totto/sample/example-3.html: -------------------------------------------------------------------------------- 1 |

[Rendered ToTTo sample page; HTML markup omitted. Recoverable content:]
Demetrius the Fair
Section Title: Sources
Table Section Text: "smith-bio/0198". ancientlibrary.com.
Table:
Demetrius the Fair | Died: 249 BC
Regnal titles
Preceded by Magas | King of Cyrene, 250 BC – 249 BC | Vacant (Republic, under Ptolemaic rule from 246 BC); title next held by Ptolemy VIII Physcon
Sentence(s):
Demetrius was a King of Cyrene.
Demetrius the Fair was a King of Cyrene.
Demetrius the Fair (250 BC) was the king of Cyrene. 25 | -------------------------------------------------------------------------------- /language/totto/sample/output_sample.txt: -------------------------------------------------------------------------------- 1 | Colin Hanlon starred as Pete in The 12 in 2015. 2 | École Polytechnique has 4 Fields Medal winners. 3 | The New Hampshire census of 2010 reported that there were 7,230 people living in Swanzey. 4 | King Demetrius reigned over Cyrene. 5 | On October 10, 2012, The Nashville series premiered to over 8.93 million viewers. 6 | -------------------------------------------------------------------------------- /language/wino_dict/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/wino_dict/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py~=1.1.0 2 | nltk~=3.7 3 | spacy~=3.3.1 4 | tensorflow~=2.8.2 5 | tfds_nightly~=4.6.0 6 | -------------------------------------------------------------------------------- /language/xsp/data_utils/academic-prefix.txt: -------------------------------------------------------------------------------- 1 | create database academic; 2 | use academic; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/add_indices.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Adds indices to databases which require them.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sqlite3 22 | 23 | 24 | def main(db_name): 25 | with open('data_utils/extra_' + db_name + '_indices.txt') as infile: 26 | indices = infile.read().split('\n') 27 | 28 | db = sqlite3.connect('databases/' + db_name + '.db') 29 | c = db.cursor() 30 | 31 | for index in indices: 32 | print('Adding index:') 33 | print(index) 34 | q = index 35 | c.execute(q) 36 | db.commit() 37 | 38 | db.close() 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--database_name', type=str, help='The database to add indices to.') 45 | args = parser.parse_args() 46 | main(args.database_name) 47 | -------------------------------------------------------------------------------- /language/xsp/data_utils/advising-prefix.txt: -------------------------------------------------------------------------------- 1 | create database advising; 2 | use advising; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/atis-prefix.txt: -------------------------------------------------------------------------------- 1 | create database atis; 2 | use atis; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_academic_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "author_oid" ON "author" ("oid"); 2 | CREATE INDEX IF NOT EXISTS "cite_cited" ON "cite" ("cited"); 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_imdb_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "cast_msid" ON "cast" ("msid"); 2 | CREATE INDEX IF NOT EXISTS "directed_by_did" ON "directed_by" ("did"); 3 | CREATE INDEX IF NOT EXISTS "directed_by_msid" ON "directed_by" ("msid"); 4 | CREATE INDEX IF NOT EXISTS "made_by_msid" ON "made_by" ("msid"); 5 | CREATE INDEX IF NOT EXISTS "made_by_pid" ON "made_by" ("pid"); 6 | CREATE INDEX IF NOT EXISTS "cast_aid" ON "cast" ("aid"); 7 | CREATE INDEX IF NOT EXISTS "actor_aid" ON "actor" ("aid"); 8 | CREATE INDEX IF NOT EXISTS "actor_gender" ON "actor" ("gender"); 9 | CREATE INDEX IF NOT EXISTS "movie_mid" ON "movie" ("mid"); 10 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_scholar_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "writes_authorId" ON "writes" ("authorId"); 2 | CREATE INDEX IF NOT EXISTS "writes_paperId" ON "writes" ("paperId"); 3 | DROP INDEX IF EXISTS "author_authorName"; 4 | CREATE INDEX IF NOT EXISTS "author_authorName" ON "author" ("authorName" collate nocase); 5 | DROP INDEX IF EXISTS "dataset_datasetName"; 6 | CREATE INDEX IF NOT EXISTS "dataset_datasetName" ON "dataset" ("datasetName" collate nocase); 7 | DROP INDEX IF EXISTS "journal_journalName"; 8 | CREATE INDEX IF NOT EXISTS "journal_journalName" ON "journal" ("journalName" collate nocase); 9 | DROP INDEX IF EXISTS "keyphrase_keyphraseName"; 10 | CREATE INDEX IF NOT EXISTS "keyphrase_keyphraseName" ON "keyphrase" ("keyphraseName" collate nocase); 11 | CREATE INDEX IF NOT EXISTS "paper_title" ON "paper" 
("title" collate nocase); 12 | CREATE INDEX IF NOT EXISTS "venue_venueName" ON "venue" ("venueName" collate nocase); 13 | -------------------------------------------------------------------------------- /language/xsp/data_utils/geoquery-prefix.txt: -------------------------------------------------------------------------------- 1 | create database geoquery; 2 | use geoquery; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/imdb-prefix.txt: -------------------------------------------------------------------------------- 1 | create database imdb; 2 | use imdb; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/scholar-prefix.txt: -------------------------------------------------------------------------------- 1 | create database scholar; 2 | use scholar; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/yelp-prefix.txt: -------------------------------------------------------------------------------- 1 | create database yelp; 2 | use yelp; 3 | -------------------------------------------------------------------------------- /language/xsp/model/local_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_options": { 3 | "bert_vocab_path": "", 4 | "max_num_tokens": 512, 5 | "max_decode_length": 100 6 | }, 7 | "model_parameters": { 8 | "use_segment_ids": false, 9 | "use_foreign_key_features": false, 10 | "use_alignment_features": false, 11 | "pretrained_bert_dir": "", 12 | "source_embedding_dims": 128, 13 | "target_embedding_dims": 128, 14 | "encoder_dims": 128, 15 | "decoder_dims": 128, 16 | "max_decoder_relative_distance": 8, 17 | "num_decoder_layers": 2, 18 | "num_heads": 8, 19 | "decoder_ff_layer_hidden_size": 512 20 | }, 21 | "training_options": { 22 | "tpu_iterations_per_loop": 1000, 23 | "batch_size": 2, 24 | "training_steps": 1000, 25 | "layer_dropout_rate": 0.3, 26 | "optimizer_learning_rate": 0.00008, 27 | "optimizer_warmup_steps": 200, 28 | "freeze_pretrained_steps": 0, 29 | "after_restart_learning_rate": 0.00008 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /language/xsp/model/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_options": { 3 | "bert_vocab_path": "", 4 | "max_num_tokens": 512, 5 | "max_decode_length": 100 6 | }, 7 | "model_parameters": { 8 | "use_segment_ids": false, 9 | "use_foreign_key_features": false, 10 | "use_alignment_features": false, 11 | "pretrained_bert_dir": "", 12 | "source_embedding_dims": 128, 13 | "target_embedding_dims": 128, 14 | "encoder_dims": 128, 15 | "decoder_dims": 128, 16 | "max_decoder_relative_distance": 8, 17 | "num_decoder_layers": 2, 18 | "num_heads": 8, 19 | "decoder_ff_layer_hidden_size": 512 20 | }, 21 | "training_options": { 22 | "tpu_iterations_per_loop": 1000, 23 | "batch_size": 32, 24 | "training_steps": 30000, 25 | "layer_dropout_rate": 0.3, 26 | "optimizer_learning_rate": 0.00008, 27 | "optimizer_warmup_steps": 5625, 28 | "freeze_pretrained_steps": 2100, 29 | "after_restart_learning_rate": 0.00008 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /language/xsp/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.22.0 2 | sqlparse==0.3.1 3 | tf-slim==1.1.0 4 | 
timeout-decorator==0.4.1 5 | tqdm==4.49.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Install projects from the Language Team.""" 16 | import os 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | 21 | def read(fname): 22 | with open(os.path.join(os.path.dirname(__file__), fname)) as f: 23 | return f.read() 24 | 25 | 26 | setup( 27 | name="language", 28 | version="0.0.1.dev", 29 | packages=find_packages(), 30 | description="Google AI Language.", 31 | long_description=read("README.md"), 32 | author="Google Inc.", 33 | url="https://github.com/google-research/language", 34 | license="Apache 2.0", 35 | install_requires=[ 36 | "tensorflow-gpu~=1.15.0", 37 | ], 38 | extras_require={ 39 | "consistent-zero-shot-nmt": [ 40 | "tensorflow-probability==0.6.0", 41 | "tensor2tensor==1.11.0", 42 | ], 43 | }) 44 | --------------------------------------------------------------------------------