├── AUTHORS
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── language
├── __init__.py
├── asqa
│   ├── README.md
│   ├── convert_to_roberta_format.py
│   ├── eval.sh
│   ├── human_annotation
│   │   ├── analysis.py
│   │   ├── instructions.txt
│   │   ├── preparation.py
│   │   ├── prepare_interface.gs
│   │   ├── ready_for_drive.tsv
│   │   ├── screenshot.png
│   │   └── setup.tsv
│   ├── install.sh
│   ├── requirements.txt
│   └── scoring.py
├── bert_extraction
│   ├── README.md
│   ├── __init__.py
│   ├── steal_bert_classifier
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── data_generation
│   │   │   ├── __init__.py
│   │   │   ├── build_aux_membership.py
│   │   │   ├── build_membership_dataset.py
│   │   │   ├── merge_dataset_pool_active_learning.py
│   │   │   ├── preprocess_edit_distance_one.py
│   │   │   ├── preprocess_random.py
│   │   │   ├── preprocess_thief_dataset.py
│   │   │   └── preprocess_util.py
│   │   ├── embedding_perturbations
│   │   │   ├── __init__.py
│   │   │   ├── discrete_invert_embeddings.py
│   │   │   ├── embedding_util.py
│   │   │   ├── invert_embeddings.py
│   │   │   ├── merge_shards.py
│   │   │   └── mixup_bert_embeddings.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── run_classifier.py
│   │   │   ├── run_classifier_distillation.py
│   │   │   └── run_classifier_membership.py
│   │   ├── scripts
│   │   │   ├── evaluate_agreement.sh
│   │   │   ├── run_extraction_random.sh
│   │   │   ├── run_extraction_watermark_random.sh
│   │   │   ├── run_extraction_watermark_wiki.sh
│   │   │   ├── run_extraction_wiki.sh
│   │   │   ├── run_membership_classification.sh
│   │   │   ├── run_pool_filter.sh
│   │   │   ├── run_query_synthesis.sh
│   │   │   └── train_victim.sh
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── dataset_analysis.py
│   │   │   ├── merge_datasets_simple.py
│   │   │   ├── model_diff.py
│   │   │   ├── model_diff_dataset.py
│   │   │   ├── pairwise_dataset_analysis.py
│   │   │   ├── preprocess_distill_input.py
│   │   │   ├── preprocess_distill_input_watermark.py
│   │   │   ├── verify_watermark.py
│   │   │   └── wiki103_sentencize.py
│   └── steal_bert_qa
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── data_generation
│   │   ├── __init__.py
│   │   ├── build_aux_membership.py
│   │   ├── build_membership_dataset.py
│   │   ├── preprocess_fraction_squad.py
│   │   ├── preprocess_thief_dataset_boolq.py
│   │   ├── preprocess_thief_dataset_squad.py
│   │   ├── preprocess_thief_dataset_squad_custom.py
│   │   ├── preprocess_thief_dev_squad.py
│   │   └── preprocess_util.py
│   │   ├── models
│   │   ├── __init__.py
│   │   ├── run_bert_boolq.py
│   │   ├── run_bert_boolq_distill.py
│   │   ├── run_squad.py
│   │   └── run_squad_membership.py
│   │   ├── scripts
│   │   ├── run_extraction_boolq.sh
│   │   ├── run_extraction_squad.sh
│   │   ├── run_extraction_watermark_squad.sh
│   │   ├── run_filter_victim_squad.sh
│   │   ├── run_membership_squad.sh
│   │   ├── train_victim_boolq.sh
│   │   └── train_victim_squad.sh
│   │   └── utils
│   │   ├── __init__.py
│   │   ├── combine_qa.py
│   │   ├── combine_qa_watermark.py
│   │   ├── evaluate_squad.py
│   │   ├── evaluate_squad_2.py
│   │   ├── evaluate_squad_watermark.py
│   │   ├── filter_queries_victim_agreement.py
│   │   ├── run_bert_boolq_diff.py
│   │   └── wiki103_para_split.py
├── bertology
│   └── frequency_effects
│   │   ├── README.md
│   │   └── data
│   │   ├── README.md
│   │   ├── nouns.tsv
│   │   ├── sentential_contexts.tsv
│   │   └── verbs.tsv
├── boolq
│   ├── README.md
│   ├── __init__.py
│   ├── run_bert_boolq.py
│   ├── run_recurrent_model_boolq.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── best_checkpoint_exporter.py
│   │   ├── ops.py
│   │   ├── ops_test.py
│   │   ├── py_utils.py
│   │   └── tokenization.py
├── canine
│   ├── README.md
│   ├── bert_modeling.py
│   ├── bert_optimization.py
│   ├── config_utils.py
│   ├── config_utils_test.py
│   ├── local_attention.py
│   ├── modeling.py
│   ├── modeling_test.py
│   ├── special_codepoints.py
│   ├── tensor_contracts.py
│   ├── tensor_contracts_test.py
│   └── tydiqa
│   │   ├── README.md
│   │   ├── char_splitter.py
│   │   ├── data.py
│   │   ├── debug.py
│   │   ├── postproc.py
│   │   ├── prepare_tydi_data.py
│   │   ├── preproc.py
│   │   ├── preproc_test.py
│   │   ├── run_tydi.py
│   │   ├── run_tydi_lib.py
│   │   ├── tf_io.py
│   │   ├── tf_io_test.py
│   │   ├── tydi_modeling.py
│   │   └── tydi_tokenization_interface.py
├── capwap
│   ├── README.md
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── captions_dataset.py
│   │   ├── rc_dataset.py
│   │   ├── text_dataset.py
│   │   ├── vqa_dataset.py
│   │   └── wsp_dataset.py
│   ├── download.sh
│   ├── evaluation
│   │   ├── infer_captions.py
│   │   ├── infer_wsp_captions.py
│   │   └── score_captions.py
│   ├── img
│   │   └── capwap.png
│   ├── models
│   │   ├── reinforce_model.py
│   │   └── supervised_model.py
│   ├── preprocessing
│   │   ├── coco_ood_captions.py
│   │   ├── coco_synthetic_qa.py
│   │   ├── coco_text_planner.py
│   │   ├── gqa_qa.py
│   │   ├── text_synthetic_qa.py
│   │   ├── v7w_qa.py
│   │   ├── vizwiz_qa.py
│   │   ├── vqa_qa.py
│   │   └── weakly_supervised.py
│   ├── synthetic
│   │   ├── filter_round_trip.py
│   │   ├── generate_answers.py
│   │   └── generate_questions.py
│   ├── training
│   │   ├── train_reinforce.py
│   │   └── train_supervised.py
│   └── utils
│   │   ├── checkpoint_utils.py
│   │   ├── experiment_utils.py
│   │   ├── image_utils.py
│   │   ├── io_utils.py
│   │   ├── metric_utils.py
│   │   ├── nltk_utils.py
│   │   ├── reward_utils.py
│   │   ├── tensor_utils.py
│   │   ├── text_utils.py
│   │   ├── transformer_utils.py
│   │   └── tsv_to_hdf5.py
├── casper
│   ├── EXPERIMENTS.md
│   ├── README.md
│   ├── augment
│   │   ├── cached_retrieval_to_dataset.py
│   │   ├── cached_retrieval_to_dataset_lib.py
│   │   ├── casper_converters.py
│   │   ├── casper_converters_test.py
│   │   ├── casper_formatters.py
│   │   ├── casper_formatters_test.py
│   │   └── patch_guiding_tag.py
│   ├── evaluate
│   │   ├── evaluate_mtop_predictions.py
│   │   ├── evaluate_retrieval.py
│   │   └── top_metrics.py
│   ├── retrieve
│   │   ├── cache_query_retrievals.py
│   │   ├── query_retrievers.py
│   │   └── query_retrievers_test.py
│   ├── scripts
│   │   ├── domain_bootstrap_cache_retrievals.sh
│   │   ├── domain_bootstrap_gen_datasets.sh
│   │   ├── parse_guiding_gen_datasets.sh
│   │   ├── schema_refactor_gen_dataset.sh
│   │   ├── standard_cache_retrievals.sh
│   │   └── standard_gen_datasets.sh
│   └── utils
│   │   ├── data_types.py
│   │   ├── mtop_tsv_to_jsonl.py
│   │   ├── sample_utils.py
│   │   ├── sample_utils_test.py
│   │   ├── top_constants.py
│   │   ├── top_utils.py
│   │   └── top_utils_test.py
├── common
│   ├── __init__.py
│   ├── inputs
│   │   ├── __init__.py
│   │   ├── char_utils.py
│   │   ├── char_utils_test.py
│   │   ├── dataset_utils.py
│   │   ├── dataset_utils_test.py
│   │   ├── embedding_utils.py
│   │   └── embedding_utils_test.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── affine_transform.py
│   │   ├── affine_transform_test.py
│   │   ├── common_layers.py
│   │   ├── common_layers_test.py
│   │   ├── cudnn_layers.py
│   │   └── cudnn_layers_test.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── experiment_utils.py
│   │   ├── experiment_utils_test.py
│   │   ├── export_utils.py
│   │   ├── export_utils_test.py
│   │   ├── exporters.py
│   │   ├── file_utils.py
│   │   ├── model_utils.py
│   │   ├── nest_utils.py
│   │   ├── tensor_utils.py
│   │   ├── tensor_utils_test.py
│   │   └── tpu_utils.py
├── compgen
│   ├── csl
│   │   ├── README.md
│   │   ├── augment
│   │   │   ├── generate_synthetic_examples.py
│   │   │   ├── generate_synthetic_examples_beam.py
│   │   │   ├── joint_sampler.py
│   │   │   ├── joint_sampler_test.py
│   │   │   ├── merge_tsvs.py
│   │   │   ├── qcfg_sampler.py
│   │   │   ├── qcfg_sampler_test.py
│   │   │   ├── sampler_utils.py
│   │   │   ├── sampler_utils_test.py
│   │   │   └── test_utils.py
│   │   ├── cky
│   │   │   ├── cfg_converter.py
│   │   │   ├── cfg_parser.py
│   │   │   ├── cfg_parser_test.py
│   │   │   ├── cfg_rule.py
│   │   │   ├── cfg_sampler.py
│   │   │   └── cfg_sampler_test.py
│   │   ├── common
│   │   │   ├── beam_utils.py
│   │   │   ├── json_utils.py
│   │   │   ├── txt_utils.py
│   │   │   └── writer_utils.py
│   │   ├── csl_flowchart.jpg
│   │   ├── demo_geoquery.sh
│   │   ├── demo_smcalflow.sh
│   │   ├── induction
│   │   │   ├── action_utils.py
│   │   │   ├── derivation_utils.py
│   │   │   ├── derivation_utils_test.py
│   │   │   ├── greedy_policy.py
│   │   │   ├── induction_utils.py
│   │   │   ├── objective_utils.py
│   │   │   ├── objective_utils_test.py
│   │   │   ├── rule_utils.py
│   │   │   ├── rule_utils_test.py
│   │   │   ├── search_main.py
│   │   │   ├── search_main_beam.py
│   │   │   ├── unification_utils.py
│   │   │   └── unification_utils_test.py
│   │   ├── model
│   │   │   ├── data
│   │   │   │   ├── example_converter.py
│   │   │   │   ├── example_converter_test.py
│   │   │   │   ├── forest_serialization.py
│   │   │   │   ├── parsing_utils.py
│   │   │   │   ├── write_examples.py
│   │   │   │   └── write_examples_beam.py
│   │   │   ├── data_constants.py
│   │   │   ├── inference
│   │   │   │   ├── eval_model.py
│   │   │   │   ├── eval_model_beam.py
│   │   │   │   ├── eval_utils.py
│   │   │   │   ├── get_predictions.py
│   │   │   │   ├── inference_parser.py
│   │   │   │   ├── inference_parser_test.py
│   │   │   │   ├── inference_utils.py
│   │   │   │   └── inference_wrapper.py
│   │   │   ├── test_utils.py
│   │   │   ├── training
│   │   │   │   ├── forest_utils.py
│   │   │   │   ├── forest_utils_test.py
│   │   │   │   ├── input_utils.py
│   │   │   │   ├── train_model.py
│   │   │   │   ├── training_utils.py
│   │   │   │   └── training_utils_test.py
│   │   │   └── weighted_model.py
│   │   ├── qcfg
│   │   │   ├── compute_recall.py
│   │   │   ├── qcfg_file.py
│   │   │   ├── qcfg_parser.py
│   │   │   ├── qcfg_parser_test.py
│   │   │   ├── qcfg_rule.py
│   │   │   ├── qcfg_target_parser.py
│   │   │   └── qcfg_target_parser_test.py
│   │   ├── targets
│   │   │   ├── target_grammar.py
│   │   │   ├── target_grammar_test.py
│   │   │   └── verify_target_grammar.py
│   │   └── tasks
│   │   │   ├── cogs
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config.json
│   │   │   ├── seed_rules.txt
│   │   │   ├── target_cfg.txt
│   │   │   └── tools
│   │   │   │   ├── categorize_errors.py
│   │   │   │   ├── cogs_converter.py
│   │   │   │   ├── cogs_converter_test.py
│   │   │   │   └── preprocess_cogs_data.py
│   │   │   ├── exact_match_utils.py
│   │   │   ├── exact_match_utils_test.py
│   │   │   ├── generate_exact_match_rules.py
│   │   │   ├── geoquery
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config.json
│   │   │   ├── splits
│   │   │   │   ├── length.json
│   │   │   │   ├── template_1.json
│   │   │   │   ├── template_2.json
│   │   │   │   ├── template_3.json
│   │   │   │   ├── tmcd_1.json
│   │   │   │   ├── tmcd_2.json
│   │   │   │   └── tmcd_3.json
│   │   │   └── target_cfg.txt
│   │   │   ├── scan
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── model_config_t2.json
│   │   │   └── model_config_t4.json
│   │   │   └── smcalflow
│   │   │   ├── augment_config.json
│   │   │   ├── induction_config.json
│   │   │   ├── manual_seed_rules.txt
│   │   │   ├── model_config.json
│   │   │   └── tools
│   │   │   ├── filter_examples.py
│   │   │   ├── format_for_t5.py
│   │   │   ├── generate_identity_rules.py
│   │   │   ├── generate_target_cfg.py
│   │   │   ├── merge_dataset.py
│   │   │   ├── restore_oov.py
│   │   │   ├── retokenize_inputs.py
│   │   │   ├── split_examples.py
│   │   │   └── string_utils.py
│   └── nqg
│   │   ├── README.md
│   │   ├── common
│   │   └── cky
│   │   │   ├── cfg_parser.py
│   │   │   ├── cfg_parser_test.py
│   │   │   ├── cfg_rule.py
│   │   │   ├── cky_utils.py
│   │   │   └── trie_utils.py
│   │   ├── model
│   │   ├── induction
│   │   │   ├── codelength_utils.py
│   │   │   ├── codelength_utils_test.py
│   │   │   ├── derivation_utils.py
│   │   │   ├── derivation_utils_test.py
│   │   │   ├── exact_match_utils.py
│   │   │   ├── exact_match_utils_test.py
│   │   │   ├── induce_rules.py
│   │   │   ├── induction_utils.py
│   │   │   ├── rule_utils.py
│   │   │   ├── rule_utils_test.py
│   │   │   ├── split_utils.py
│   │   │   └── split_utils_test.py
│   │   ├── parser
│   │   │   ├── config_utils.py
│   │   │   ├── configs
│   │   │   │   ├── geoquery_config.json
│   │   │   │   ├── geoquery_xl_config.json
│   │   │   │   ├── scan_config.json
│   │   │   │   └── spider_config.json
│   │   │   ├── data
│   │   │   │   ├── data_constants.py
│   │   │   │   ├── example_converter.py
│   │   │   │   ├── example_converter_test.py
│   │   │   │   ├── forest_serialization.py
│   │   │   │   ├── forest_serialization_test.py
│   │   │   │   ├── parsing_utils.py
│   │   │   │   ├── tokenization_utils.py
│   │   │   │   ├── tokenization_utils_test.py
│   │   │   │   └── write_examples.py
│   │   │   ├── inference
│   │   │   │   ├── eval_model.py
│   │   │   │   ├── generate_predictions.py
│   │   │   │   ├── inference_parser.py
│   │   │   │   ├── inference_wrapper.py
│   │   │   │   ├── inference_wrapper_test.py
│   │   │   │   └── targets
│   │   │   │   │   ├── funql.txt
│   │   │   │   │   ├── generate_spider_grammars.py
│   │   │   │   │   ├── target_grammar.py
│   │   │   │   │   └── target_grammar_test.py
│   │   │   ├── nqg_model.py
│   │   │   ├── nqg_model_test.py
│   │   │   ├── test_utils.py
│   │   │   └── training
│   │   │   │   ├── forest_utils.py
│   │   │   │   ├── forest_utils_test.py
│   │   │   │   ├── input_utils.py
│   │   │   │   ├── train_model.py
│   │   │   │   ├── training_utils.py
│   │   │   │   └── training_utils_test.py
│   │   └── qcfg
│   │   │   ├── compute_recall.py
│   │   │   ├── qcfg_file.py
│   │   │   ├── qcfg_parser.py
│   │   │   ├── qcfg_parser_test.py
│   │   │   └── qcfg_rule.py
│   │   └── tasks
│   │   ├── compare_predictions.py
│   │   ├── compare_splits.py
│   │   ├── gen_length_split.py
│   │   ├── gen_random_split.py
│   │   ├── geoquery
│   │   ├── entity_utils.py
│   │   ├── funql_normalization.py
│   │   ├── funql_normalization_test.py
│   │   ├── gen_template_split.py
│   │   ├── gen_tmcd_split.py
│   │   ├── geobase_utils.py
│   │   ├── measure_compound_divergence.py
│   │   ├── measure_unseen_atoms.py
│   │   ├── splits
│   │   │   ├── length_1.json
│   │   │   ├── standard.json
│   │   │   ├── template_1.json
│   │   │   └── tmcd_1.json
│   │   ├── tmcd_utils.py
│   │   ├── tmcd_utils_test.py
│   │   ├── write_dataset.py
│   │   └── xml_file_utils.py
│   │   ├── mcd_utils.py
│   │   ├── mcd_utils_test.py
│   │   ├── scan
│   │   ├── convert_to_tsv.py
│   │   └── join_txt_to_tsv.py
│   │   ├── spider
│   │   ├── append_schema.py
│   │   ├── database_constants.py
│   │   ├── gen_template_split.py
│   │   ├── gen_tmcd_split.py
│   │   ├── generate_gold.py
│   │   ├── measure_compound_divergence.py
│   │   ├── measure_unseen_atoms.py
│   │   ├── nqg_preprocess.py
│   │   ├── nqg_tokenization.py
│   │   ├── print_database_counts.py
│   │   ├── restore_oov.py
│   │   ├── splits
│   │   │   ├── length_1.json
│   │   │   ├── random_1.json
│   │   │   ├── template_1.json
│   │   │   └── tmcd_1.json
│   │   ├── sql_parser.py
│   │   ├── sql_parser_main.py
│   │   ├── sql_parser_test.py
│   │   ├── sql_tokenizer.py
│   │   ├── tmcd_utils.py
│   │   ├── tmcd_utils_test.py
│   │   └── write_dataset.py
│   │   ├── split_dataset.py
│   │   ├── strip_targets.py
│   │   ├── template_utils.py
│   │   └── tsv_utils.py
├── compir
│   ├── README.md
│   ├── dataset_parsers
│   │   ├── cfq_parser.py
│   │   ├── dataset_parser.py
│   │   ├── scan_parser.py
│   │   └── sql_parser.py
│   ├── evaluate
│   │   ├── evaluate_predictions.py
│   │   └── evaluate_predictions_utils.py
│   ├── transform
│   │   ├── apply_transformation.py
│   │   └── apply_transformation_utils.py
│   └── utils
│   │   ├── dataset_parser_utils.py
│   │   └── io_utils.py
├── conpono
│   ├── README.md
│   ├── binary_order
│   │   └── run_binary_coherence.py
│   ├── cpc
│   │   ├── bilin_model_builder.py
│   │   ├── model_builder.py
│   │   ├── preproc
│   │   │   ├── books_preproc_pipeline.py
│   │   │   ├── ccnews_preproc_pipeline.py
│   │   │   ├── preprocessing_utils.py
│   │   │   ├── raw_books_preproc_pipeline.py
│   │   │   └── wiki_preproc_pipeline.py
│   │   ├── run_bilin_cpc.py
│   │   ├── run_cc_cpc.py
│   │   └── run_cpc.py
│   ├── create_pretrain_data
│   │   ├── books_preproc_pipeline.py
│   │   ├── preprocessing_utils.py
│   │   └── wiki_preproc_pipeline.py
│   ├── evals
│   │   ├── classifier_utils.py
│   │   ├── coherence_eval.py
│   │   ├── discriminative_eval.py
│   │   ├── model_builder.py
│   │   ├── race_utils.py
│   │   ├── run_classifier.py
│   │   ├── run_concat_classifier.py
│   │   ├── run_finetune_coherence.py
│   │   ├── run_hellaswag.py
│   │   ├── run_multichoice.py
│   │   ├── run_race_sp_eval_all.py
│   │   ├── run_record.py
│   │   └── run_squad.py
│   └── reconstruct
│   │   ├── model_builder.py
│   │   ├── preprocess.py
│   │   └── run_paragraph_reconstruct.py
├── decontext
│   ├── README.md
│   ├── decontext_util.py
│   ├── decontextualization_demo.ipynb
│   ├── eval.py
│   ├── eval_requirements.txt
│   └── eval_util.py
├── diffqg
│   ├── .gitignore
│   ├── README.md
│   ├── __init__.py
│   ├── annotation.py
│   ├── install_and_test.sh
│   ├── metrics.py
│   ├── requirements.txt
│   └── run_metrics.py
├── emql
│   ├── README.md
│   ├── __init__.py
│   ├── cm_sketch.py
│   ├── cm_sketch_test.py
│   ├── data_loader.py
│   ├── data_loader_test.py
│   ├── eval.py
│   ├── eval_test.py
│   ├── main.py
│   ├── model.py
│   ├── model_test.py
│   ├── module.py
│   ├── module_test.py
│   ├── preprocess
│   │   ├── metaqa_preprocess.py
│   │   └── query2box_preprocess.py
│   ├── util.py
│   └── util_test.py
├── frost
│   ├── LICENSE
│   ├── README.md
│   ├── __init__.py
│   ├── create_frost_finetuning_data.py
│   ├── spacy_frost_annotator_lib.py
│   └── spacy_frost_annotator_lib_test.py
├── fruit
│   ├── README.md
│   ├── README_PIPELINE.md
│   ├── __init__.py
│   ├── beam_pipelines.py
│   ├── beam_pipelines_test.py
│   ├── convert_task_to_jsonl.py
│   ├── metrics.py
│   ├── metrics_test.py
│   ├── postprocessors.py
│   ├── postprocessors_test.py
│   ├── rendering_utils.py
│   ├── rendering_utils_test.py
│   ├── requirements.txt
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── convert_to_jsonl.py
│   │   ├── evaluate_direct_jsonls.py
│   │   ├── evaluate_t5x_jsonl.py
│   │   ├── get_topics.py
│   │   ├── run_convert_to_jsonl.py
│   │   ├── run_filter_for_generation_pipeline.py
│   │   ├── run_process_snapshot_pipeline.py
│   │   ├── run_redirect_table_pipeline.py
│   │   ├── run_to_tfrecords_pipeline.py
│   │   └── sample_data
│   │   │   ├── inputlabels.jsonl
│   │   │   └── pred.jsonl
│   ├── t5x
│   │   └── configs
│   │   │   ├── t5_3b_eval.gin
│   │   │   ├── t5_3b_finetune.gin
│   │   │   ├── t5_base_eval.gin
│   │   │   ├── t5_base_finetune.gin
│   │   │   ├── t5_large_eval.gin
│   │   │   ├── t5_large_finetune.gin
│   │   │   ├── t5_small_eval.gin
│   │   │   └── t5_small_finetune.gin
│   ├── tasks.py
│   ├── testdata
│   │   ├── test.diff.tfrecords
│   │   ├── test_annotations.jsonl
│   │   ├── test_article_pairs.jsonl
│   │   ├── test_redirects.tsv
│   │   ├── test_source_articles.jsonl
│   │   └── test_target_articles.jsonl
│   ├── tf_utils.py
│   ├── tf_utils_test.py
│   ├── wiki_utils.py
│   └── wiki_utils_test.py
├── gscan
│   ├── data
│   │   ├── README.md
│   │   ├── dataset.py
│   │   ├── grammar.py
│   │   ├── main.py
│   │   ├── vocabulary.py
│   │   └── world.py
│   └── xattn_model
│   │   ├── README.md
│   │   ├── configs
│   │   ├── compositional.py
│   │   └── spatial_relation.py
│   │   ├── dataset
│   │   ├── gscan_dataset.py
│   │   ├── input_pipeline.py
│   │   ├── input_pipeline_test.py
│   │   └── preprocess.py
│   │   ├── evaluation.py
│   │   ├── main.py
│   │   ├── model
│   │   ├── decode.py
│   │   ├── layers.py
│   │   ├── layers_test.py
│   │   ├── model_utils.py
│   │   ├── models.py
│   │   └── models_test.py
│   │   ├── predict.py
│   │   ├── predict_test.py
│   │   ├── test_utils.py
│   │   ├── testdata
│   │   ├── dataset.txt
│   │   ├── train.tfrecord
│   │   ├── training_input_vocab.txt
│   │   └── training_target_vocab.txt
│   │   ├── train.py
│   │   ├── train_test.py
│   │   └── train_utils.py
├── labs
│   ├── README
│   ├── __init__.py
│   ├── consistent_zero_shot_nmt
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── bin
│   │   │   ├── __init__.py
│   │   │   ├── t2t_datagen.py
│   │   │   ├── t2t_decoder.py
│   │   │   └── t2t_trainer.py
│   │   ├── data_generators
│   │   │   ├── __init__.py
│   │   │   ├── translate_europarl.py
│   │   │   ├── translate_iwslt17.py
│   │   │   ├── translate_multilingual.py
│   │   │   └── translate_uncorpus.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── agreement.py
│   │   │   ├── basic.py
│   │   │   ├── losses.py
│   │   │   └── losses_test.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── attention_mechanisms.py
│   │   │   ├── attention_wrappers.py
│   │   │   ├── base.py
│   │   │   ├── decoders.py
│   │   │   ├── encoders.py
│   │   │   ├── helpers.py
│   │   │   └── language_models.py
│   │   ├── scripts
│   │   │   ├── __init__.py
│   │   │   ├── datagen_europarl.sh
│   │   │   ├── datagen_iwslt17.sh
│   │   │   ├── datagen_uncorpus.sh
│   │   │   ├── decode_europarl.sh
│   │   │   ├── decode_iwslt17.sh
│   │   │   ├── decode_uncorpus.sh
│   │   │   ├── download_and_preproc_europarl.sh
│   │   │   ├── download_and_preproc_iwslt2017.sh
│   │   │   ├── identify_overlap_europarl.py
│   │   │   ├── identify_overlap_iwslt17.py
│   │   │   ├── parse-args.sh
│   │   │   └── run_nmt_experiment.sh
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── common_utils.py
│   │   │   ├── model_utils.py
│   │   │   └── t2t_tweaks.py
│   ├── drkit
│   │   ├── README.md
│   │   ├── bert_utils.py
│   │   ├── bert_utils_v2.py
│   │   ├── evaluate.py
│   │   ├── hotpotqa
│   │   │   ├── answer_extractor.py
│   │   │   ├── demo.py
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   │   ├── convert_hotpot_to_mrqa.py
│   │   │   │   ├── convert_wikidata_to_mrqa.py
│   │   │   │   ├── create_tfrecords.py
│   │   │   │   ├── link_questions.py
│   │   │   │   └── parse_wiki.py
│   │   │   ├── scripts
│   │   │   │   ├── index_hotpot_corpus.sh
│   │   │   │   ├── run_demo.sh
│   │   │   │   ├── run_hotpotqa_answer.sh
│   │   │   │   ├── run_hotpotqa_finetuning.sh
│   │   │   │   └── run_hotpotqa_pretraining.sh
│   │   │   └── web
│   │   │   │   ├── static
│   │   │   │   └── drkit.css
│   │   │   │   └── templates
│   │   │   │   └── drkit.html
│   │   ├── input_fns.py
│   │   ├── metaqa
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   │   ├── distantly_supervise.py
│   │   │   │   ├── metaqa_preprocess.py
│   │   │   │   └── process_wiki.py
│   │   │   └── scripts
│   │   │   │   ├── index_metaqa_corpus.sh
│   │   │   │   ├── preprocess_data.sh
│   │   │   │   ├── run_metaqa_finetuning.sh
│   │   │   │   └── run_metaqa_pretraining.sh
│   │   ├── model_fns.py
│   │   ├── preprocessing
│   │   │   ├── preprocess_qa.py
│   │   │   └── preprocess_utils.py
│   │   ├── run_dualencoder_lsf.py
│   │   ├── run_dualencoder_qa.py
│   │   ├── run_multihop_follow.py
│   │   ├── search_utils.py
│   │   └── wikidata
│   │   │   ├── index.py
│   │   │   ├── preprocessing
│   │   │   ├── add_negatives.py
│   │   │   ├── create_3hop_queries.py
│   │   │   ├── create_follow_queries.py
│   │   │   └── distantly_supervise.py
│   │   │   └── scripts
│   │   │   ├── create_multihop_wikidata.sh
│   │   │   ├── index_wikidata_corpus.sh
│   │   │   ├── run_wikidata_finetuning.sh
│   │   │   └── run_wikidata_pretraining.sh
│   ├── exemplar_decoding
│   │   ├── __init__.py
│   │   ├── docs
│   │   │   ├── giga_hyperparameters.txt
│   │   │   └── nyt_hyperparameters.txt
│   │   ├── experiments
│   │   │   ├── __init__.py
│   │   │   ├── predict.py
│   │   │   └── train.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── adam.py
│   │   │   ├── attention.py
│   │   │   ├── baselines.py
│   │   │   ├── common.py
│   │   │   ├── hyperlstm.py
│   │   │   ├── hypernet.py
│   │   │   ├── linear.py
│   │   │   ├── model_function.py
│   │   │   └── output_wrapper.py
│   │   └── utils
│   │   │   ├── __init__.py
│   │   │   ├── data.py
│   │   │   ├── rouge_utils.py
│   │   │   ├── tensor_utils.py
│   │   │   └── tensor_utils_test.py
│   └── memory
│   │   ├── README
│   │   ├── __init__.py
│   │   ├── baseline_models.py
│   │   ├── differentiable_plasticity.py
│   │   ├── explicit_mem.py
│   │   ├── model_utils.py
│   │   ├── model_utils_test.py
│   │   ├── run_models.py
│   │   ├── synthetic_dataset.py
│   │   └── synthetic_dataset_test.py
├── massive_translations
│   └── README.md
├── mentionmemory
│   ├── README.md
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── prepare_complexwq.py
│   │   ├── prepare_fever.py
│   │   ├── prepare_hover.py
│   │   ├── prepare_tacred.py
│   │   ├── prepare_webred.py
│   │   └── testdata
│   │   │   └── tacred
│   │   │   └── test_sample.json
│   ├── encoders
│   │   ├── __init__.py
│   │   ├── base_encoder.py
│   │   ├── bert_encoder.py
│   │   ├── eae_encoder.py
│   │   ├── eae_encoder_test.py
│   │   ├── encoder_registry.py
│   │   ├── encoder_registry_test.py
│   │   ├── import_encoders.py
│   │   ├── mauto_encoder.py
│   │   ├── mauto_encoder_test.py
│   │   ├── mention_memory_encoder.py
│   │   ├── mention_memory_encoder_test.py
│   │   ├── readtwice_encoder.py
│   │   └── readtwice_encoder_test.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── jax_runner.py
│   │   └── memory_generation_main.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── attention_test.py
│   │   ├── batch_memory_attention_layer.py
│   │   ├── batch_memory_attention_layer_test.py
│   │   ├── embedding.py
│   │   ├── embedding_test.py
│   │   ├── entity_attention_layer.py
│   │   ├── entity_attention_layer_test.py
│   │   ├── kmeans.py
│   │   ├── kmeans_test.py
│   │   ├── memory_attention_layer.py
│   │   ├── memory_attention_layer_test.py
│   │   ├── memory_extraction_layer.py
│   │   ├── memory_extraction_layer_test.py
│   │   ├── memory_retrieval_layer.py
│   │   ├── mention_losses.py
│   │   ├── mention_losses_test.py
│   │   ├── mlm_layer.py
│   │   ├── mlm_layer_test.py
│   │   ├── mlp.py
│   │   ├── mlp_test.py
│   │   ├── retrieval_update_layers.py
│   │   ├── retrieval_update_layers_test.py
│   │   ├── sparse_topk_similarity_layer.py
│   │   ├── sparse_topk_similarity_layer_test.py
│   │   ├── topk_similarity_layer.py
│   │   ├── topk_similarity_layer_test.py
│   │   ├── transformer.py
│   │   └── transformer_test.py
│   ├── requirements.txt
│   ├── run.sh
│   ├── run_tests.py
│   ├── tasks
│   │   ├── __init__.py
│   │   ├── base_task.py
│   │   ├── downstream_encoder_task.py
│   │   ├── eae_task.py
│   │   ├── eae_task_test.py
│   │   ├── embedding_based_entity_qa_task.py
│   │   ├── entity_qa_task.py
│   │   ├── example_task.py
│   │   ├── example_task_test.py
│   │   ├── import_tasks.py
│   │   ├── mauto_task.py
│   │   ├── mauto_task_test.py
│   │   ├── memory_generation_task.py
│   │   ├── memory_generation_task_test.py
│   │   ├── mention_based_entity_qa_task.py
│   │   ├── mention_based_entity_qa_task_test.py
│   │   ├── mention_classifier_task.py
│   │   ├── mention_encoder_task.py
│   │   ├── mention_memory_task.py
│   │   ├── mention_memory_task_test.py
│   │   ├── readtwice_task.py
│   │   ├── readtwice_task_test.py
│   │   ├── relation_classifier_task.py
│   │   ├── relation_classifier_task_test.py
│   │   ├── task_registry.py
│   │   ├── task_registry_test.py
│   │   ├── testdata
│   │   │   ├── tacred
│   │   │   │   ├── README.md
│   │   │   │   ├── spanbert_tacred_test.txt
│   │   │   │   └── test.gold
│   │   │   └── ultra_fine_entity_typing
│   │   │   │   ├── dev.json
│   │   │   │   └── types.txt
│   │   ├── text_classifier.py
│   │   ├── text_classifier_test.py
│   │   ├── ultra_fine_entity_typing_task.py
│   │   └── ultra_fine_entity_typing_task_test.py
│   ├── training
│   │   ├── __init__.py
│   │   ├── trainer.py
│   │   └── trainer_test.py
│   └── utils
│   │   ├── __init__.py
│   │   ├── checkpoint_utils.py
│   │   ├── checkpoint_utils_test.py
│   │   ├── custom_types.py
│   │   ├── data_utils.py
│   │   ├── data_utils_test.py
│   │   ├── default_values.py
│   │   ├── initializers.py
│   │   ├── jax_utils.py
│   │   ├── jax_utils_test.py
│   │   ├── mention_preprocess_utils.py
│   │   ├── mention_preprocess_utils_test.py
│   │   ├── mention_utils.py
│   │   ├── mention_utils_test.py
│   │   ├── metric_utils.py
│   │   ├── metric_utils_test.py
│   │   ├── optim_utils.py
│   │   ├── optim_utils_test.py
│   │   ├── test_utils.py
│   │   ├── testdata
│   │   ├── eae_paper-00000-of-00001
│   │   └── mtb.v5-00000-of-00001
│   │   └── tokenization_utils.py
├── miqa
│   ├── README.md
│   └── data
│   │   └── metaphor_inference_qa.tsv
├── multiberts
│   ├── 2m_vs_1m.ipynb
│   ├── README.md
│   ├── coref.ipynb
│   ├── multi_vs_original.ipynb
│   └── multibootstrap.py
├── multivec
│   ├── README.md
│   ├── models
│   │   ├── checkpoint_utils.py
│   │   ├── export_to_tfhub.py
│   │   ├── metrics.py
│   │   └── ranking_model_experiment_inbatch.py
│   ├── predict
│   │   ├── encode_blocks.py
│   │   └── retrieval.py
│   ├── preprocessing
│   │   └── create_training_data.py
│   ├── requirements.txt
│   └── utils
│   │   ├── convert_tsv_to_json.py
│   │   ├── data_processor.py
│   │   └── download.sh
├── nqg
│   └── README.md
├── nql
│   ├── demos
│   │   ├── Introduction_to_NQL.ipynb
│   │   ├── NQL_Gridworld_Pathfollowing.ipynb
│   │   ├── data
│   │   │   └── royal92
│   │   │   │   ├── README.md
│   │   │   │   ├── cfacts.kg.tsv
│   │   │   │   ├── fathers.tsv
│   │   │   │   └── royal_family.tsv
│   │   ├── gridworld_scaling
│   │   │   ├── README.txt
│   │   │   ├── __init__.py
│   │   │   ├── figure1.bash
│   │   │   ├── gendata_figure1.bash
│   │   │   ├── plot_figure1.py
│   │   │   └── scaling_eval.py
│   │   ├── metaqa
│   │   │   ├── README.txt
│   │   │   ├── metaqa.py
│   │   │   └── preprocess_data.py
│   │   └── nell995
│   │   │   ├── README.txt
│   │   │   ├── nell995.py
│   │   │   └── preprocess_data.py
│   ├── nql
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   ├── dataset_test.py
│   │   ├── dist.py
│   │   ├── io.py
│   │   ├── io_test.py
│   │   ├── nql_test.py
│   │   ├── nql_test_lib.py
│   │   ├── symbol.py
│   │   ├── symbol_test.py
│   │   ├── util.py
│   │   └── util_test.py
│   └── setup.py
├── orqa
│   ├── README.md
│   ├── __init__.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── ict_dataset.py
│   │   ├── orqa_dataset.py
│   │   └── text_classification_dataset.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── evaluate_predictions.py
│   ├── experiments
│   │   ├── __init__.py
│   │   ├── ict_experiment.py
│   │   ├── orqa_experiment.py
│   │   └── text_classifier_experiment.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── ict_model.py
│   │   ├── orqa_model.py
│   │   └── text_classifier_model.py
│   ├── ops
│   │   ├── __init__.py
│   │   ├── orqa_ops.cc
│   │   └── orqa_ops_test.py
│   ├── predict
│   │   ├── __init__.py
│   │   ├── encode_blocks.py
│   │   ├── orqa_demo.py
│   │   ├── orqa_eval.py
│   │   ├── orqa_predict.py
│   │   ├── text_classifier_predict.py
│   │   └── web
│   │   │   └── orqa.html
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── convert_to_nq_open.py
│   │   ├── create_data_splits.py
│   │   ├── preprocess_wiki_extractor.py
│   │   └── wiki_preprocessor.py
│   ├── requirements.txt
│   └── utils
│   │   ├── __init__.py
│   │   ├── bert_utils.py
│   │   ├── eval_utils.py
│   │   ├── scann_utils.py
│   │   └── scann_utils_test.py
├── qa_counterfactuals
│   ├── README.md
│   └── figure1.jpeg
├── qresp
│   └── README.md
├── quest
│   ├── README.md
│   ├── bm25
│   │   ├── bm25_retriever.py
│   │   └── run_bm25_retriever.py
│   ├── common
│   │   ├── document_utils.py
│   │   ├── example_utils.py
│   │   ├── jsonl_utils.py
│   │   ├── tsv_utils.py
│   │   └── vocab_utils.py
│   ├── eval
│   │   ├── README.md
│   │   ├── analyze_retriever.py
│   │   ├── eval_utils.py
│   │   └── run_eval.py
│   ├── t5xr
│   │   ├── README.md
│   │   ├── convert_examples.py
│   │   └── write_doc_idx_maps.py
│   └── xattn
│   │   ├── README.md
│   │   ├── determine_threshold.py
│   │   ├── filter_predictions.py
│   │   ├── gen_inference_inputs.py
│   │   ├── gen_training_examples.py
│   │   └── xattn_utils.py
├── question_answering
│   ├── __init__.py
│   ├── b2t2
│   │   ├── README.md
│   │   ├── compute_vcr_features.py
│   │   ├── requirements.txt
│   │   ├── run_b2t2.py
│   │   └── run_dual_encoder.py
│   ├── bert_joint
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── prepare_nq_data.py
│   │   ├── run_nq.py
│   │   └── run_nq_test.py
│   └── decatt_docreader
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── datasets
│   │   ├── __init__.py
│   │   ├── nq_long_dataset.py
│   │   └── nq_short_pipeline_dataset.py
│   │   ├── experiments
│   │   ├── __init__.py
│   │   ├── nq_export_scorer.py
│   │   ├── nq_long_experiment.py
│   │   └── nq_short_pipeline_experiment.py
│   │   ├── layers
│   │   ├── __init__.py
│   │   ├── decomposable_attention.py
│   │   ├── decomposable_attention_test.py
│   │   ├── document_reader.py
│   │   └── document_reader_test.py
│   │   ├── models
│   │   ├── __init__.py
│   │   ├── nq_long_decatt_model.py
│   │   ├── nq_long_model.py
│   │   └── nq_short_pipeline_model.py
│   │   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── create_nq_long_examples.py
│   │   └── create_nq_short_pipeline_examples.py
│   │   └── utils
│   │   ├── __init__.py
│   │   ├── nq_long_utils.py
│   │   ├── nq_long_utils_test.py
│   │   ├── span_utils.py
│   │   └── span_utils_test.py
├── realm
│   ├── README.md
│   ├── example_generator.py
│   ├── featurization.py
│   ├── generate_retrieval_corpus.py
│   ├── local_launcher.sh
│   ├── model.py
│   ├── parallel.py
│   ├── preprocessing.proto
│   ├── preprocessing.py
│   ├── preprocessing_pb2.py
│   ├── preprocessing_pb2_grpc.py
│   ├── profile.py
│   ├── refresh_doc_embeds.py
│   ├── retrieval.py
│   ├── retrieval_test.py
│   └── train_realm.py
├── relation_learning
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   └── fewrel.py
│   └── models
│   │   ├── __init__.py
│   │   └── bert_fewshot_classifier.py
├── search_agents
│   ├── README.md
│   ├── demo.py
│   ├── environment.proto
│   ├── environment_server.py
│   ├── muzero
│   │   ├── agent_lib.py
│   │   ├── bert_state_lib.py
│   │   ├── bert_state_lib_test.py
│   │   ├── common_flags.py
│   │   ├── env.py
│   │   ├── env_test.py
│   │   ├── grammar_lib.py
│   │   ├── grammar_lib_test.py
│   │   ├── muzero_main.py
│   │   ├── network.py
│   │   ├── run_inference_beam.py
│   │   ├── server.py
│   │   ├── state_tree.py
│   │   ├── state_tree_test.py
│   │   ├── transformer_encoder.py
│   │   ├── types.py
│   │   ├── types_test.py
│   │   ├── utils.py
│   │   └── utils_test.py
│   ├── requirements.txt
│   └── t5
│   │   ├── run_inference_beam.py
│   │   ├── t5_agent_lib.py
│   │   └── t5_agent_lib_test.py
├── serene
│   ├── analysis.py
│   ├── boolq_tfds.py
│   ├── callbacks.py
│   ├── claim_tfds.py
│   ├── config.py
│   ├── constants.py
│   ├── export_embeddings.py
│   ├── fever.proto
│   ├── fever_cli.py
│   ├── fever_tfds.py
│   ├── layers.py
│   ├── losses.py
│   ├── model.py
│   ├── preprocessing.py
│   ├── retrieval.proto
│   ├── scrape_db.py
│   ├── serene.py
│   ├── text_matcher.py
│   ├── tokenizers.py
│   ├── training.py
│   ├── types.py
│   ├── util.py
│   ├── web_api.py
│   ├── wiki_db.py
│   ├── wiki_index.py
│   ├── wiki_tfds.py
│   └── wikipedia_processing.py
├── spatial_prep
│   └── README.md
├── table_text_eval
│   ├── README.md
│   ├── __init__.py
│   ├── preprocess_webnlg.py
│   ├── table_text_eval.py
│   ├── table_text_eval_test.py
│   └── webnlg_correlations.py
├── templama
│   ├── README.md
│   ├── install.sh
│   ├── prepare_data.sh
│   ├── sling2facts.py
│   ├── templama.py
│   └── templates.csv
├── totto
│   ├── README.md
│   ├── __init__.py
│   ├── baseline_preprocessing
│   │   ├── __init__.py
│   │   ├── preprocess_data_main.py
│   │   └── preprocess_utils.py
│   ├── create_table_to_text_html.py
│   ├── eval_pipeline_test.py
│   ├── eval_requirements.txt
│   ├── prepare_predictions_for_eval.py
│   ├── prepare_references_for_eval.py
│   ├── sample
│   │   ├── dev_sample.jsonl
│   │   ├── example-0.html
│   │   ├── example-1.html
│   │   ├── example-2.html
│   │   ├── example-3.html
│   │   ├── example-4.html
│   │   ├── output_sample.txt
│   │   └── train_sample.jsonl
│   ├── table_to_text_html_utils.py
│   ├── table_to_text_utils.py
│   ├── table_to_text_utils_test.py
│   ├── totto_bleurt_eval.py
│   ├── totto_eval.sh
│   └── totto_parent_eval.py
├── wikipedia_anchors
│   └── README.md
├── wino_dict
│   ├── README.md
│   ├── __init__.py
│   ├── create_new_words.py
│   ├── generate.py
│   ├── morph_rules.txt
│   ├── original_words.tsv
│   ├── requirements.txt
│   ├── utils.py
│   └── utils_test.py
└── xsp
│   ├── README.md
│   ├── data_download.sh
│   ├── data_preprocessing
│   ├── abstract_sql.py
│   ├── abstract_sql_converters.py
│   ├── abstract_sql_main.py
│   ├── abstract_sql_test.py
│   ├── compute_asql_coverage_spider.py
│   ├── convert_to_examples.py
│   ├── convert_to_tfrecords.py
│   ├── create_vocabularies.py
│   ├── estimate_asql_coverage_michigan.py
│   ├── language_utils.py
│   ├── michigan_preprocessing.py
│   ├── nl_to_sql_example.py
│   ├── schema_utils.py
│   ├── spider_preprocessing.py
│   ├── sql_parsing.py
│   ├── sql_utils.py
│   ├── sqlparse_keyword_utils.py
│   └── wikisql_preprocessing.py
│   ├── data_utils
│   ├── academic-prefix.txt
│   ├── add_indices.py
│   ├── advising-prefix.txt
│   ├── atis-prefix.txt
│   ├── create_cache.py
│   ├── empty_database.py
│   ├── extra_academic_indices.txt
│   ├── extra_imdb_indices.txt
│   ├── extra_scholar_indices.txt
│   ├── geoquery-prefix.txt
│   ├── imdb-prefix.txt
│   ├── scholar-prefix.txt
│   └── yelp-prefix.txt
│   ├── evaluation
│   ├── convert_preds_for_spider_eval.py
│   ├── filter_results.py
│   ├── official_evaluation.py
│   ├── official_evaluation_test.py
│   ├── restore_from_asql.py
│   └── restore_from_asql_main.py
│   ├── model
│   ├── adam_weight_decay.py
│   ├── beam_search.py
│   ├── bert_utils.py
│   ├── common_layers.py
│   ├── constants.py
│   ├── decode_utils.py
│   ├── embeddings.py
│   ├── input_pipeline.py
│   ├── input_utils.py
│   ├── load_from_checkpoint.py
│   ├── local_model_config.json
│   ├── loss.py
│   ├── metrics.py
│   ├── model_builder.py
│   ├── model_config.json
│   ├── model_config.py
│   ├── run_inference.py
│   ├── sequence_example_decoder.py
│   ├── tpu_utils.py
│   └── transformer.py
│   ├── requirements.txt
│   └── training
│   └── train_model.py
└── setup.py

/AUTHORS:
--------------------------------------------------------------------------------
# This is the list of the Google AI Language Team authors for copyright purposes.
#
# This does not necessarily list everyone who has contributed code, since in
# some cases, their employer may be the copyright holder. To see the full list
# of contributors, see the revision history in source control.

Google Inc.
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to Contribute

We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows [Google's Open Source Community
Guidelines](https://opensource.google.com/conduct/).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Google Research: Language

Shared repository for open-sourced projects from the
[Google Research Language](https://research.google/teams/language/) team.

This is not an official Google product.
--------------------------------------------------------------------------------
/language/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/asqa/eval.sh:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash

RESULTS_PATH=$1 # path to result json file
EXP_NAME=$2 # name of your experiment
OUTPUT_DIR=./results/${EXP_NAME}

mkdir -p ${OUTPUT_DIR}
python convert_to_roberta_format.py \
  --asqa ./dataset/ASQA.json \
  --predictions $RESULTS_PATH \
  --split dev \
  --output_path ${OUTPUT_DIR}

python transformers/examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path ./roberta/roberta-squad \
  --validation_file ${OUTPUT_DIR}/qa.json \
  --do_eval \
  --version_2_with_negative \
  --max_seq_length 384 \
  --output_dir ${OUTPUT_DIR} \
  --null_score_diff_threshold 0

python scoring.py \
  --asqa ./dataset/ASQA.json \
  --predictions $RESULTS_PATH \
  --roberta_output ${OUTPUT_DIR}/eval_predictions.json \
  --split dev \
  --out_dir $OUTPUT_DIR
--------------------------------------------------------------------------------
/language/asqa/human_annotation/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/asqa/human_annotation/screenshot.png
--------------------------------------------------------------------------------
/language/asqa/install.sh:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash

# Install requirements.txt
pip install -r requirements.txt

# Install huggingface transformers from github so that we have access to example
# scripts.
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .

# Download Roberta checkpoint.
cd ../
mkdir roberta
gsutil cp -R gs://gresearch/ASQA/ckpts/roberta-squad roberta/
--------------------------------------------------------------------------------
/language/asqa/requirements.txt:
--------------------------------------------------------------------------------
rouge-score
nltk
datasets
--------------------------------------------------------------------------------
/language/bert_extraction/README.md:
--------------------------------------------------------------------------------
# Model Extraction of BERT-based APIs

This folder contains the original codebase used to conduct the experiments in the ICLR 2020 paper *[Thieves on Sesame Street! Model Extraction of BERT-based APIs](https://arxiv.org/abs/1910.12366)*. The OpenReview discussion for this paper can be found [here](https://openreview.net/forum?id=Byl5NREFDr).

## Setup

Please follow the setup in [google-research/language](https://github.com/google-research/language).
This codebase requires [google-research/bert](https://github.com/google-research/bert) for all its experiments.

## Experiments on SST2, MNLI

Please find more details in [`steal_bert_classifier/README.md`](steal_bert_classifier/README.md). The codebase can be trivially modified for any classification task using BERT expecting a single sentence input or a pair of sentences as input.

## Experiments on SQuAD 1.1, SQuAD 2.0, BoolQ

Please find more details in [`steal_bert_qa/README.md`](steal_bert_qa/README.md).

## Citation

If you find this paper or codebase useful, please cite us.

```
@inproceedings{krishna2020thieves,
  title={Thieves on Sesame Street! Model Extraction of BERT-based APIs},
  author={Krishna, Kalpesh and Tomar, Gaurav Singh and Parikh, Ankur P and Papernot, Nicolas and Iyyer, Mohit},
  booktitle={International Conference on Learning Representations},
  year={2020}
}
```
--------------------------------------------------------------------------------
/language/bert_extraction/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/data_generation/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/embedding_perturbations/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/models/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_classifier/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/data_generation/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/models/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bert_extraction/steal_bert_qa/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/bertology/frequency_effects/README.md:
--------------------------------------------------------------------------------
This repository contains relevant files for [Frequency Effects on Syntactic Rule-Learning in Transformers (EMNLP '21)](https://arxiv.org/abs/2109.07020) by Jason Wei, Dan Garrette, Tal Linzen, and Ellie Pavlick.

The data can be found in the [`data`](https://github.com/google-research/language/tree/master/language/bertology/frequency_effects/data) subdirectory.
--------------------------------------------------------------------------------
/language/boolq/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/boolq/utils/__init__.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--------------------------------------------------------------------------------
/language/capwap/img/capwap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/capwap/img/capwap.png
--------------------------------------------------------------------------------
/language/capwap/utils/nltk_utils.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLTK utils for cluster-friendly usage."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle
import nltk

word_tokenize = nltk.word_tokenize
pos_tag = nltk.pos_tag


def get_stopwords():
  # pylint: disable=g-import-not-at-top
  from nltk.corpus import stopwords
  # pylint: enable=g-import-not-at-top
  return set(stopwords.words("english"))
--------------------------------------------------------------------------------
/language/casper/utils/data_types.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Common data types."""
import collections
from typing import Any, Dict

# Deserialized JSON
RawExample = Dict[str, Any]

AugmentedExample = collections.namedtuple("AugmentedExample",
                                          ["inputs", "targets"])
--------------------------------------------------------------------------------
/language/casper/utils/sample_utils_test.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for sample_utils."""

from absl.testing import absltest
from language.casper.utils import sample_utils


class SampleUtilsTest(absltest.TestCase):

  def test_geometric_sample(self):
    pool = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    sampled = sample_utils.geometric_sample(pool, 5, 0.5)
    self.assertLen(sampled, 5)
    sampled = sample_utils.geometric_sample(pool, 99, 0.5)
    self.assertLen(sampled, 10)
    # Test extreme values.
30 | sampled = sample_utils.geometric_sample(pool, 7, 1.0) 31 | self.assertEqual(sampled, [0, 1, 2, 3, 4, 5, 6]) 32 | sampled = sample_utils.geometric_sample(pool, 7, 0.0) 33 | self.assertEqual(sampled, [9, 8, 7, 6, 5, 4, 3]) 34 | 35 | 36 | if __name__ == '__main__': 37 | absltest.main() 38 | -------------------------------------------------------------------------------- /language/common/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/inputs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/layers/affine_transform_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for language.common.layers.affine_transform.""" 16 | 17 | 18 | from language.common.layers import affine_transform 19 | import tensorflow as tf 20 | 21 | 22 | class AffineTransformTest(tf.test.TestCase): 23 | 24 | def test_layer_api_compatibility(self): 25 | input_array = tf.constant( 26 | [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [2.0, 3.0, 5.0]] 27 | ) 28 | 29 | cls = affine_transform.AffineTransform 30 | with tf.keras.utils.CustomObjectScope( 31 | {cls.__name__: cls} 32 | ): 33 | _ = tf._keras_internal.testing_infra.test_utils.layer_test( 34 | cls, 35 | kwargs={ 36 | 'output_size': 1, 37 | 'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02), 38 | }, 39 | input_shape=(None), 40 | input_data=input_array, 41 | ) 42 | 43 | 44 | if __name__ == '__main__': 45 | tf.test.main() 46 | -------------------------------------------------------------------------------- /language/common/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/common/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Utilities for file I/O.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os.path 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | 26 | def make_empty_dir(path): 27 | """Makes an empty directory at `path`, deleting `path` first if needed.""" 28 | if tf.gfile.Exists(path): 29 | tf.gfile.DeleteRecursively(path) 30 | tf.gfile.MakeDirs(path) 31 | 32 | 33 | def copy_files_to_dir(source_filepattern, dest_dir): 34 | """Copies files matching `source_filepattern` into `dest_dir`.""" 35 | for source_path in tf.gfile.Glob(source_filepattern): 36 | dest_path = os.path.join(dest_dir, os.path.basename(source_path)) 37 | tf.gfile.Copy(source_path, dest_path) 38 | 39 | 40 | def set_file_contents(data, path): 41 | """Overwrites `path` with `data.""" 42 | with tf.gfile.Open(path, "w") as output_file: 43 | output_file.write(data) 44 | -------------------------------------------------------------------------------- /language/compgen/csl/augment/merge_tsvs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utility to merge tsv files.""" 16 | 17 | import random 18 | 19 | from absl import app 20 | from absl import flags 21 | 22 | from language.compgen.nqg.tasks import tsv_utils 23 | 24 | 25 | FLAGS = flags.FLAGS 26 | 27 | flags.DEFINE_string("input_1", "", "Input tsv file.") 28 | 29 | flags.DEFINE_string("input_2", "", "Input tsv file.") 30 | 31 | flags.DEFINE_string("output", "", "Output tsv file.") 32 | 33 | flags.DEFINE_integer("duplicate_input_1", 1, 34 | "Number of times to duplicate inputs in input_1.") 35 | 36 | 37 | def main(unused_argv): 38 | input_1 = tsv_utils.read_tsv(FLAGS.input_1) 39 | input_2 = tsv_utils.read_tsv(FLAGS.input_2) 40 | outputs = input_1 * FLAGS.duplicate_input_1 + input_2 41 | random.shuffle(outputs) 42 | tsv_utils.write_tsv(outputs, FLAGS.output) 43 | 44 | 45 | if __name__ == "__main__": 46 | app.run(main) 47 | -------------------------------------------------------------------------------- /language/compgen/csl/augment/test_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Utilities for testing data augmentation.""" 16 | 17 | 18 | def get_test_config(): 19 | return { 20 | "min_recursions": 1, 21 | "max_recursions": 5, 22 | "temperature": 1, 23 | "nonterminal_bias": 0, 24 | "max_single_nt_applications": 1 25 | } 26 | -------------------------------------------------------------------------------- /language/compgen/csl/common/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for Beam pipeline.""" 16 | 17 | import apache_beam as beam 18 | 19 | 20 | def dict_to_beam_counts(metrics_dict, namespace): 21 | for metric_name, metric_value in metrics_dict.items(): 22 | beam.metrics.Metrics.counter(namespace, metric_name).inc(metric_value) 23 | -------------------------------------------------------------------------------- /language/compgen/csl/common/json_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Function for loading config json file.""" 16 | 17 | import json 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def json_file_to_dict(json_file): 23 | """Constructs a dictionary from a json file.""" 24 | with gfile.GFile(json_file, "r") as reader: 25 | text = reader.read() 26 | return json.loads(text) 27 | 28 | 29 | def dict_to_json_file(json_dict, json_file): 30 | """Saves a dictionary to a json file.""" 31 | with gfile.GFile(json_file, "w") as writer: 32 | json.dump(json_dict, writer, indent=2) 33 | print("Saved dict to %s." % json_file) 34 | -------------------------------------------------------------------------------- /language/compgen/csl/common/txt_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilties for reading and writing TXT dataset files.""" 16 | 17 | from tensorflow.io import gfile 18 | 19 | 20 | def read_txt(filename): 21 | """Read file to list of lines.""" 22 | examples = [] 23 | with gfile.GFile(filename, "r") as tsv_file: 24 | for line in tsv_file: 25 | line = line.rstrip() 26 | examples.append(line) 27 | print("Loaded %s lines from %s." % (len(examples), filename)) 28 | return examples 29 | 30 | 31 | def write_txt(examples, filename): 32 | """Write examples to tsv file.""" 33 | with gfile.GFile(filename, "w") as tsv_file: 34 | for example in examples: 35 | line = "%s\n" % example 36 | tsv_file.write(line) 37 | print("Wrote %s lines to %s." % (len(examples), filename)) 38 | -------------------------------------------------------------------------------- /language/compgen/csl/common/writer_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for TF writer.""" 16 | 17 | import tensorflow as tf 18 | 19 | 20 | def get_summary_writer(write_dir): 21 | return tf.summary.create_file_writer(write_dir) 22 | 23 | 24 | def write_metrics(writer, metrics_dict, step): 25 | for metric_name, metric_value in metrics_dict.items(): 26 | with writer.as_default(): 27 | tf.summary.scalar(metric_name, metric_value, step=step) 28 | -------------------------------------------------------------------------------- /language/compgen/csl/csl_flowchart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/compgen/csl/csl_flowchart.jpg -------------------------------------------------------------------------------- /language/compgen/csl/induction/action_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Defines actions that can mutate the set of rules.""" 16 | 17 | import collections 18 | 19 | Action = collections.namedtuple( 20 | "Action", 21 | [ 22 | "rules_to_add", # Set of QCFGRule. 23 | "rules_to_remove", # Set of QCFGRule. 24 | ]) 25 | -------------------------------------------------------------------------------- /language/compgen/csl/model/data_constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants used in generating tf.Examples that are used across modules.""" 16 | 17 | # Forest node types. 18 | RULE_APPLICATION = 1 19 | AGGREGATION = 2 20 | -------------------------------------------------------------------------------- /language/compgen/csl/model/inference/inference_parser_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for inference_parser.""" 16 | 17 | from language.compgen.csl.model import test_utils 18 | from language.compgen.csl.model.inference import inference_parser 19 | from language.compgen.csl.model.inference import inference_wrapper 20 | from language.compgen.csl.qcfg import qcfg_rule 21 | import tensorflow as tf 22 | 23 | 24 | class InferenceParserTest(tf.test.TestCase): 25 | 26 | def test_get_outputs(self): 27 | rules = [ 28 | qcfg_rule.rule_from_string("foo NT_1 ### foo NT_1"), 29 | qcfg_rule.rule_from_string("bar ### bar"), 30 | qcfg_rule.rule_from_string("foo bar ### foo bar"), 31 | ] 32 | config = test_utils.get_test_config() 33 | 34 | wrapper = inference_wrapper.InferenceWrapper(rules, config) 35 | wrapper.compute_application_scores() 36 | 37 | source = "foo bar" 38 | outputs = inference_parser.run_inference(source, wrapper) 39 | print("outputs: %s" % outputs) 40 | 41 | self.assertIsNotNone(outputs) 42 | 43 | if __name__ == "__main__": 44 | tf.test.main() 45 | -------------------------------------------------------------------------------- /language/compgen/csl/model/test_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilties for testing.""" 16 | 17 | 18 | def get_test_config(): 19 | return { 20 | "batch_size": 4, 21 | "learning_rate": 0.001, 22 | "training_steps": 10000, 23 | "steps_per_iteration": 8, 24 | "num_types": 2, 25 | "num_lhs_emb": 128, 26 | "num_rhs_emb": 128, 27 | "max_num_numerator_nodes": 8, 28 | "max_num_nts": 2, 29 | "max_single_nt_applications": 1 30 | } 31 | -------------------------------------------------------------------------------- /language/compgen/csl/qcfg/qcfg_file.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Read and write QCFG grammars to/from human readable txt files.""" 16 | 17 | from language.compgen.csl.qcfg import qcfg_rule 18 | from tensorflow.io import gfile 19 | 20 | 21 | def read_rules(filename): 22 | """Read rule txt file to list of rules.""" 23 | rules = [] 24 | with gfile.GFile(filename, "r") as txt_file: 25 | for line in txt_file: 26 | line = line.rstrip() 27 | rule = qcfg_rule.rule_from_string(line) 28 | rules.append(rule) 29 | print("Loaded %s rules from %s." % (len(rules), filename)) 30 | return rules 31 | 32 | 33 | def write_rules(rules, filename): 34 | """Write rules to txt file.""" 35 | with gfile.GFile(filename, "w") as txt_file: 36 | for rule in rules: 37 | line = "%s\n" % str(rule) 38 | txt_file.write(line) 39 | print("Wrote %s rules to %s." % (len(rules), filename)) 40 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 20, 4 | "temperature": 1, 5 | "nonterminal_bias": 6, 6 | "max_single_nt_applications": 2, 7 | "min_nonterminal_rule_arity": 2 8 | } 9 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": true, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "source_given_target_coef": 1, 8 | "target_given_source_coef": 5, 9 | "max_num_steps": 8, 10 | "save_every_step": 0 11 | } 12 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 3000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 800, 8 | "num_rhs_emb": 800, 9 | "max_num_numerator_nodes": 500, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/cogs/tools/preprocess_cogs_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Preprocess the COGS dataset.""" 16 | from absl import app 17 | from absl import flags 18 | 19 | from language.compgen.csl.tasks.cogs.tools import cogs_converter 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | FLAGS = flags.FLAGS 23 | 24 | flags.DEFINE_string("input", "", "TSV file.") 25 | 26 | flags.DEFINE_string("output", "", "TSV file.") 27 | 28 | 29 | def main(_): 30 | examples = tsv_utils.read_tsv(FLAGS.input, expected_num_columns=3) 31 | new_examples = [] 32 | for source, target, category in examples: 33 | if category == "primitive": 34 | if len(source.split()) != 1: 35 | raise ValueError(f"Invalid primitive: {source}") 36 | new_target = source 37 | else: 38 | new_target = cogs_converter.cogs_lf_to_funcall(target) 39 | new_examples.append((source, new_target)) 40 | tsv_utils.write_tsv(new_examples, FLAGS.output) 41 | 42 | 43 | if __name__ == "__main__": 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/exact_match_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for exact_match_utils.""" 16 | 17 | from language.compgen.csl.tasks import exact_match_utils 18 | import tensorflow as tf 19 | 20 | 21 | class InitializationUtilsTest(tf.test.TestCase): 22 | 23 | def test_exact_match_1(self): 24 | dataset = [("salary between 8000 and 12000", 25 | "salaries between 8000 and 12000 .")] 26 | 27 | exact_match_rules = exact_match_utils.get_exact_match_rules(dataset) 28 | exact_match_rule_strings = {str(rule) for rule in exact_match_rules} 29 | self.assertEqual(exact_match_rule_strings, 30 | {"between 8000 and 12000 ### between 8000 and 12000"}) 31 | 32 | 33 | if __name__ == "__main__": 34 | tf.test.main() 35 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/generate_exact_match_rules.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Generates exact match seed rules.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | from language.compgen.csl.qcfg import qcfg_file 20 | from language.compgen.csl.tasks import exact_match_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | 23 | FLAGS = flags.FLAGS 24 | 25 | flags.DEFINE_string("input", "", "Input tsv file.") 26 | 27 | flags.DEFINE_string("output", "", "Output txt file.") 28 | 29 | 30 | def main(unused_argv): 31 | examples = tsv_utils.read_tsv(FLAGS.input) 32 | rules = exact_match_utils.get_exact_match_rules(examples) 33 | # Sort by target. 34 | rules = list(rules) 35 | rules.sort(key=lambda x: x.target) 36 | qcfg_file.write_rules(rules, FLAGS.output) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 20, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | 9 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": false, 3 | "allow_single_nt_target": false, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "sample_size": 0, 8 | "source_given_target_coef": 4, 9 | "target_given_source_coef": 16, 10 | "max_num_steps": 20, 11 | "save_every_step": 0 12 | } 13 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/geoquery/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.05, 4 | "training_steps": 2000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 300, 8 | "num_rhs_emb": 500, 9 | "max_num_numerator_nodes": 500, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 5, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": true, 4 | "max_num_nts": 2, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 4, 7 | "source_given_target_coef": 0.0, 8 | "target_given_source_coef": 100.0, 9 | "max_num_steps": 5, 10 | "num_partitions": 16, 11 | "allow_duplicate_examples": false, 12 | "save_every_step": 0 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/model_config_t2.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 1000, 5 | "steps_per_iteration": 64, 6 | "save_checkpoint_every": 64, 7 | 
"num_lhs_emb": 25, 8 | "num_rhs_emb": 15, 9 | "max_num_numerator_nodes": 20, 10 | "max_num_nts": 2, 11 | "max_single_nt_applications": 0, 12 | "num_types": 2 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/scan/model_config_t4.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 1000, 5 | "steps_per_iteration": 64, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 25, 8 | "num_rhs_emb": 15, 9 | "max_num_numerator_nodes": 20, 10 | "max_num_nts": 2, 11 | "max_single_nt_applications": 0, 12 | "num_types": 4 13 | } 14 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/augment_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_recursions": 1, 3 | "max_recursions": 10, 4 | "temperature": 1, 5 | "nonterminal_bias": 0, 6 | "max_single_nt_applications": 1 7 | } 8 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/induction_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "allow_repeated_target_nts": true, 3 | "allow_single_nt_target": false, 4 | "max_num_nts": 4, 5 | "non_terminal_coef": 1, 6 | "terminal_coef": 8, 7 | "sample_size": 0, 8 | "source_given_target_coef": 4, 9 | "target_given_source_coef": 16, 10 | "max_num_steps": 20, 11 | "save_every_step": 0 12 | } 13 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/manual_seed_rules.txt: -------------------------------------------------------------------------------- 1 | NT_1 ### AttendeeListHasPeople :people ( NT_1 ) 2 | NT_1 ### AttendeeListHasRecipient :recipient ( NT_1 ) 3 | NT_1 ### Execute :intension ( refer ( extensionConstraint ( RecipientWithNameLike :constraint ( Constraint[Recipient] ) :name # ( NT_1 ) ) ) ) 4 | NT_1 ### AttendeeListHasRecipient :recipient ( Execute :intension ( refer ( extensionConstraint ( RecipientWithNameLike :constraint ( Constraint[Recipient] ) :name # ( NT_1 ) ) ) ) ) 5 | NT_1 ### Constraint[DateTime] :date ( ?= ( NextDOW :dow # ( NT_1 ) ) ) 6 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.01, 4 | "training_steps": 6000, 5 | "steps_per_iteration": 1, 6 | "save_checkpoint_every": 64, 7 | "num_lhs_emb": 32000, 8 | "num_rhs_emb": 42000, 9 | "max_num_numerator_nodes": 400, 10 | "max_num_nts": 4, 11 | "max_single_nt_applications": 1, 12 | "num_types": 64, 13 | "max_num_batch_embs": 500, 14 | "approximate_denominator": true 15 | } 16 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/format_for_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Format targets so they are encoded better by T5's SPM.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | FLAGS = flags.FLAGS 23 | 24 | flags.DEFINE_string("input", "", "TSV file.") 25 | 26 | flags.DEFINE_string("output", "", "TSV file.") 27 | 28 | 29 | def format_target(target): 30 | """Reformat targets.""" 31 | # """Switches OOV T5 tokens to in-vocabulary tokens.""" 32 | target = target.replace("<", "lb") 33 | target = target.replace(">", "rb") 34 | target = target.replace("~", "sim") 35 | 36 | return target 37 | 38 | 39 | def main(unused_argv): 40 | examples = tsv_utils.read_tsv(FLAGS.input) 41 | new_examples = [] 42 | for source, target in examples: 43 | new_target = format_target(target) 44 | new_examples.append((source, new_target)) 45 | tsv_utils.write_tsv(new_examples, FLAGS.output) 46 | 47 | 48 | if __name__ == "__main__": 49 | app.run(main) 50 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/merge_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Merge source and target txt files to tsv.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | from tensorflow.io import gfile 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("source", "", "Input txt file.") 27 | 28 | flags.DEFINE_string("target", "", "Input txt file.") 29 | 30 | flags.DEFINE_string("output", "", "Output tsv file.") 31 | 32 | 33 | def read_txt(filename): 34 | """Read file to list of lines.""" 35 | lines = [] 36 | with gfile.GFile(filename, "r") as txt_file: 37 | for line in txt_file: 38 | line = line.decode().rstrip() 39 | lines.append(line) 40 | print("Loaded %s lines from %s." 
% (len(lines), filename)) 41 | return lines 42 | 43 | 44 | def main(unused_argv): 45 | source = read_txt(FLAGS.source) 46 | target = read_txt(FLAGS.target) 47 | examples = list(zip(source, target)) 48 | tsv_utils.write_tsv(examples, FLAGS.output) 49 | 50 | 51 | if __name__ == "__main__": 52 | app.run(main) 53 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/retokenize_inputs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Retokenize inputs by separating on punctuation.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.csl.tasks.smcalflow.tools import string_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | 23 | FLAGS = flags.FLAGS 24 | 25 | flags.DEFINE_string("input", "", "TSV file.") 26 | 27 | flags.DEFINE_string("output", "", "TSV file.") 28 | 29 | 30 | def main(unused_argv): 31 | examples = tsv_utils.read_tsv(FLAGS.input) 32 | new_examples = [] 33 | for source, target in examples: 34 | new_source = string_utils.format_source(source) 35 | new_examples.append((new_source, target)) 36 | tsv_utils.write_tsv(new_examples, FLAGS.output) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/csl/tasks/smcalflow/tools/string_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for processing SMCalFlow strings.""" 16 | 17 | import string 18 | 19 | 20 | def format_source(source): 21 | for char in string.punctuation: 22 | source = source.replace(char, " %s " % char) 23 | source = " ".join(source.split()) 24 | return source 25 | -------------------------------------------------------------------------------- /language/compgen/nqg/common/cky/cfg_rule.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Define structures to represent CFG symbols and rules. 16 | 17 | For efficiency, all symbols are referenced by integers rather than strings. 18 | This typically requires some pre-processing to define terminal 19 | and non-terminal vocabularies and map symbols to corresponding integers. 20 | """ 21 | 22 | import collections 23 | 24 | # CFGSymbol type constants. 25 | TERMINAL = 0 26 | NON_TERMINAL = 1 27 | 28 | # Represents a TERMINAL or NON_TERMINAL symbol. 29 | CFGSymbol = collections.namedtuple( 30 | "CFGSymbol", 31 | [ 32 | "idx", # Integer (separate id spaces are used for each symbol type). 33 | "type", # Integer (TERMINAL or NON_TERMINAL). 34 | ]) 35 | 36 | # Represents a CFG rule. 37 | CFGRule = collections.namedtuple( 38 | "CFGRule", 39 | [ 40 | "idx", # Integer to optionally reference additional rule information. 41 | "lhs", # Integer non-terminal index. 42 | "rhs", # Tuple of >= 1 CFGSymbols. 43 | ]) 44 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/induction/exact_match_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for exact_match_utils.""" 16 | 17 | from language.compgen.nqg.model.induction import exact_match_utils 18 | 19 | import tensorflow as tf 20 | 21 | 22 | class ExactMatchTest(tf.test.TestCase): 23 | 24 | def test_exact_match_1(self): 25 | dataset = [("salary between 8000 and 12000", 26 | "salaries between 8000 and 12000 .")] 27 | 28 | exact_match_rules = exact_match_utils.get_exact_match_rules(dataset) 29 | exact_match_rule_strings = {str(rule) for rule in exact_match_rules} 30 | self.assertEqual(exact_match_rule_strings, 31 | {"between 8000 and 12000 ### between 8000 and 12000"}) 32 | 33 | 34 | if __name__ == "__main__": 35 | tf.test.main() 36 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/config_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Function for loading config json file.""" 16 | 17 | import json 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def json_file_to_dict(json_file): 23 | """Constructs a dictionary from a json file.""" 24 | with gfile.GFile(json_file, "r") as reader: 25 | text = reader.read() 26 | return json.loads(text) 27 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/geoquery_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 25, 10 | "max_num_applications": 400, 11 | "max_num_numerator_nodes": 150, 12 | "max_num_denominator_nodes": 2000, 13 | "max_num_rules": 300 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/geoquery_xl_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 25, 10 | "max_num_applications": 2000, 11 | "max_num_numerator_nodes": 500, 12 | "max_num_denominator_nodes": 15000, 13 | "max_num_rules": 300 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/scan_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 24, 10 | "max_num_applications": 50, 11 | "max_num_numerator_nodes": 32, 12 | "max_num_denominator_nodes": 64, 13 | "max_num_rules": 30 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/configs/spider_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 256, 3 | "learning_rate": 0.0001, 4 | "training_steps": 256, 5 | "warmup_steps": 100, 6 | "steps_per_iteration": 8, 7 | "save_checkpoint_every": 64, 8 | "model_dims": 256, 9 | "max_num_wordpieces": 80, 10 | "max_num_applications": 200, 11 | "max_num_numerator_nodes": 100, 12 | "max_num_denominator_nodes": 500, 13 | "max_num_rules": 6000 14 | } 15 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/parser/data/data_constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants used in generating tf.Examples that are used across modules.""" 16 | 17 | # Forest node types. 18 | RULE_APPLICATION = 1 19 | AGGREGATION = 2 20 | -------------------------------------------------------------------------------- /language/compgen/nqg/model/qcfg/qcfg_file.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Read and write QCFG grammars to/from human readable txt files.""" 16 | 17 | from language.compgen.nqg.model.qcfg import qcfg_rule 18 | 19 | from tensorflow.io import gfile 20 | 21 | 22 | def read_rules(filename): 23 | """Read rule txt file to list of rules.""" 24 | rules = [] 25 | with gfile.GFile(filename, "r") as txt_file: 26 | for line in txt_file: 27 | line = line.rstrip() 28 | rule = qcfg_rule.rule_from_string(line) 29 | rules.append(rule) 30 | print("Loaded %s rules from %s." % (len(rules), filename)) 31 | return rules 32 | 33 | 34 | def write_rules(rules, filename): 35 | """Write rules to txt file.""" 36 | with gfile.GFile(filename, "w") as txt_file: 37 | for rule in rules: 38 | line = "%s\n" % str(rule) 39 | txt_file.write(line) 40 | print("Wrote %s rules to %s." % (len(rules), filename)) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/geoquery/measure_compound_divergence.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Measures and prints compound divergence between two sets of examples.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import mcd_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | from language.compgen.nqg.tasks.geoquery import tmcd_utils 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input_1", "", "Input tsv file.") 27 | 28 | flags.DEFINE_string("input_2", "", "Input tsv file.") 29 | 30 | 31 | def main(unused_argv): 32 | examples_1 = tsv_utils.read_tsv(FLAGS.input_1) 33 | examples_2 = tsv_utils.read_tsv(FLAGS.input_2) 34 | divergence = mcd_utils.measure_example_divergence( 35 | examples_1, examples_2, get_compounds_fn=tmcd_utils.get_example_compounds) 36 | print("Compound divergence: %s" % divergence) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/scan/convert_to_tsv.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert SCAN txt format to standard TSV format.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import tsv_utils 21 | 22 | from tensorflow.io import gfile 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input", "", "Input txt file.") 27 | 28 | flags.DEFINE_string("output", "", "Output tsv file.") 29 | 30 | 31 | def load_examples(filename): 32 | """Load SCAN examples from original data file.""" 33 | examples = [] 34 | 35 | with gfile.GFile(filename, "r") as input_file: 36 | for line in input_file: 37 | splits = line.split("OUT:") 38 | # Trim "IN:" prefix. 39 | input_string = splits[0][3:].strip() 40 | output_string = splits[1].strip() 41 | examples.append((input_string, output_string)) 42 | 43 | return examples 44 | 45 | 46 | def main(unused_argv): 47 | examples = load_examples(FLAGS.input) 48 | tsv_utils.write_tsv(examples, FLAGS.output) 49 | 50 | 51 | if __name__ == "__main__": 52 | app.run(main) 53 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/measure_compound_divergence.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Measures and prints compound divergence between two sets of examples.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | 20 | from language.compgen.nqg.tasks import mcd_utils 21 | from language.compgen.nqg.tasks import tsv_utils 22 | from language.compgen.nqg.tasks.spider import tmcd_utils 23 | 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_string("input_1", "", "Input tsv file.") 27 | 28 | flags.DEFINE_string("input_2", "", "Input tsv file.") 29 | 30 | 31 | def main(unused_argv): 32 | examples_1 = tsv_utils.read_tsv(FLAGS.input_1) 33 | examples_2 = tsv_utils.read_tsv(FLAGS.input_2) 34 | divergence = mcd_utils.measure_example_divergence( 35 | examples_1, examples_2, get_compounds_fn=tmcd_utils.get_example_compounds) 36 | print("Compound divergence: %s" % divergence) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(main) 41 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/nqg_preprocess.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Pre-tokenize dataset for NQG, which uses space-separated tokenization. 16 | 17 | Input should be a TSV file, e.g. generated by applying `split_dataset.py` to 18 | the output of `spider/write_dataset.py`. 19 | """ 20 | 21 | from absl import app 22 | from absl import flags 23 | 24 | from language.compgen.nqg.tasks import tsv_utils 25 | 26 | from language.compgen.nqg.tasks.spider import nqg_tokenization 27 | 28 | FLAGS = flags.FLAGS 29 | 30 | flags.DEFINE_string("input", "", "Input tsv file.") 31 | 32 | flags.DEFINE_string("output", "", "Output tsv file.") 33 | 34 | 35 | def main(unused_argv): 36 | examples = tsv_utils.read_tsv(FLAGS.input) 37 | new_examples = [] 38 | for source, target in examples: 39 | new_examples.append((nqg_tokenization.process_source(source), 40 | nqg_tokenization.process_target(target))) 41 | tsv_utils.write_tsv(new_examples, FLAGS.output) 42 | 43 | 44 | if __name__ == "__main__": 45 | app.run(main) 46 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/spider/sql_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for tokenizing SQL.""" 16 | 17 | import sqlparse 18 | 19 | 20 | def _is_whitespace(sqlparse_token): 21 | return sqlparse_token.ttype == sqlparse.tokens.Whitespace 22 | 23 | 24 | def tokenize_sql(sql_exp): 25 | """Lowercases SQL and returns non-whitespace tokens, e.g. "SELECT x;" -> ["select", "x"].""" 26 | sql_exp = sql_exp.lower() 27 | sql_exp = sql_exp.rstrip(";") 28 | parse = sqlparse.parse(sql_exp) 29 | sql = parse[0] 30 | flat_tokens = sql.flatten() 31 | sql_tokens = [ 32 | token.value for token in flat_tokens if not _is_whitespace(token) 33 | ] 34 | return sql_tokens 35 | -------------------------------------------------------------------------------- /language/compgen/nqg/tasks/strip_targets.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Strip targets from a tsv file and write as newline-separated txt. 16 | 17 | The output file can be useful as input for generating predictions (e.g. for evaluation). 18 | """ 19 | 20 | from absl import app 21 | from absl import flags 22 | 23 | from language.compgen.nqg.tasks import tsv_utils 24 | 25 | from tensorflow.io import gfile 26 | 27 | FLAGS = flags.FLAGS 28 | 29 | flags.DEFINE_string("input", "", "Input tsv file.") 30 | 31 | flags.DEFINE_string("output", "", "Output txt file.") 32 | 33 | flags.DEFINE_string("prefix", "", "Optional prefix to prepend to source.") 34 | 35 | 36 | def main(unused_argv): 37 | examples = tsv_utils.read_tsv(FLAGS.input) 38 | with gfile.GFile(FLAGS.output, "w") as txt_file: 39 | for example in examples: 40 | txt_file.write("%s%s\n" % (FLAGS.prefix, example[0])) 41 | 42 | 43 | if __name__ == "__main__": 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/compir/utils/dataset_parser_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Gets the parser that corresponds to a specific dataset.""" 16 | 17 | 18 | from language.compir.dataset_parsers.cfq_parser import CfqParser 19 | from language.compir.dataset_parsers.dataset_parser import DatasetParserInterface 20 | from language.compir.dataset_parsers.scan_parser import ScanParser 21 | from language.compir.dataset_parsers.sql_parser import SqlParser 22 | 23 | dataset_parsers = { 24 | "scan": ScanParser, 25 | "cfq": CfqParser, 26 | "atis": SqlParser, 27 | "geo": SqlParser, 28 | "scholar": SqlParser 29 | } 30 | 31 | 32 | def get_parser(dataset): 33 | """Gets the parser that corresponds to a specific dataset.""" 34 | return dataset_parsers[dataset] 35 | -------------------------------------------------------------------------------- /language/decontext/eval_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | nltk 3 | numpy 4 | sentencepiece 5 | -------------------------------------------------------------------------------- /language/diffqg/.gitignore: -------------------------------------------------------------------------------- 1 | bleurt/ 2 | data/ 3 | -------------------------------------------------------------------------------- /language/diffqg/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/diffqg/requirements.txt: -------------------------------------------------------------------------------- 1 | bleurt @ git+https://github.com/google-research/bleurt 2 | absl-py==1.3.0 3 | huggingface-hub==0.10.1 4 | sentence-transformers==2.2.2 5 | rouge-score==0.1.2 6 | -------------------------------------------------------------------------------- /language/emql/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """EmQL library.""" 16 | -------------------------------------------------------------------------------- /language/frost/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/postprocessors.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """SeqIO postprocessors for wikidiff tasks.""" 16 | 17 | import json 18 | 19 | from language.fruit import tf_utils 20 | import tensorflow as tf 21 | 22 | 23 | @tf.autograph.experimental.do_not_convert 24 | def postprocess_wikidiff( 25 | output, 26 | vocabulary, 27 | normalize_fn, 28 | is_target=False, 29 | example=None, 30 | ): 31 | """Applies normalization to outputs.""" 32 | del is_target 33 | inputs = tf_utils.maybe_decode( 34 | vocabulary.decode_tf(example["inputs"]).numpy()) 35 | targets = tf_utils.maybe_decode(output) 36 | normalized_inputs, normalized_targets = normalize_fn(inputs, targets) 37 | results = { 38 | "inputs": 39 | inputs, 40 | "targets": 41 | targets, 42 | "normalized_inputs": 43 | normalized_inputs, 44 | "normalized_targets": 45 | normalized_targets, 46 | "generatable_surfaces": 47 | json.loads(tf_utils.maybe_decode(example["generatable_surfaces"])), 48 | } 49 | return results 50 | -------------------------------------------------------------------------------- /language/fruit/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/fruit/scripts/sample_data/pred.jsonl: -------------------------------------------------------------------------------- 1 | {"normalized_targets": "''The Lone Rider in Frontier Fury is a 1941 American Western film directed by Sam Newfield. The film stars George Houston as the \"Lone Rider\" and Al St. John as his sidekick \"Fuzzy\" Jones, with Hillary Brooke, Karl Hackett, Ted Adams and Arch Hall Sr. The film was released on August 8, 1941, by Producers Releasing Corporation. The film is also known as Frontier Fury in the United Kingdom and Rangeland Racket (American reissue title).''"} 2 | {"normalized_targets": "''William Emmanuel Bevan, known by his recording alias Burial, is a British electronic musician from South London. Initially remaining anonymous, Burial became the first artist signed to Kode9's electronic label Hyperdub in 2005. He won acclaim the following year for his self-titled debut album, which showcased a dark, emotive take on UK rave music styles such as UK garage and 2-step; it was named the album of the year by The Wire. Burial's second album, Untrue, was released to further critical acclaim in 2007.''"} 3 | {"normalized_targets": "''Primearth EV Energy Co., Ltd. (abbreviated as PEVE) is a Japanese manufacturer of prismatic nickel–metal hydride (NiMH) and lithium-ion battery packs for hybrid electric vehicles, located in Shizuoka Prefecture, Japan. 
PEVE's products had been solely based on NiMH until early 2011 when the company has started mass production of Li-ion battery.''"} 4 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_3b_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval finetuned T5 3B on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/3B.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 128 15 | # partitioning.PjitPartitioner: 16 | #   model_parallel_submesh=(1,1,1,1) 17 | 18 | # Ensure truncation 19 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 20 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 21 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_3b_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 3B on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | 6 | include 't5x/configs/runs/finetune.gin' 7 | include 't5x/examples/t5/t5_1_0/3B.gin' 8 | 9 | 10 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 512} 11 | TRAIN_STEPS = 1_030_000 12 | BATCH_SIZE = 128 13 | INITIAL_CHECKPOINT_PATH = '' 14 | USE_CACHED_TASKS = False 15 | partitioning.PjitPartitioner: 16 |   model_parallel_submesh=(4,8,1,2) 17 | 18 | # Ensure truncation during inference 19 | infer_eval/utils.DatasetConfig: 20 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 21 | 22 | trainer.Trainer: 23 |   num_microbatches = 4 24 | 25 | utils.SaveCheckpointConfig: 26 |   keep = 1 27 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_base_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Base on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/base.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_base_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Base on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/base.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = "" 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 
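17 | # That product is 1024 * 228 = 233472, which matches the value set below.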
18 | LOSS_NORMALIZING_FACTOR = 233472 19 | 20 | # Ensure truncation during inference 21 | infer_eval/utils.DatasetConfig: 22 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 23 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_large_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Large on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/large.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_large_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Large on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/large.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = %gin.REQUIRED 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 17 | LOSS_NORMALIZING_FACTOR = 233472 18 | 19 | # Ensure truncation during inference 20 | infer_eval/utils.DatasetConfig: 21 |   task_feature_lengths = %TASK_FEATURE_LENGTHS 22 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_small_eval.gin: -------------------------------------------------------------------------------- 1 | # Eval pre-trained T5 Small on WikiDiff 2 | 3 | from t5x import utils 4 | 5 | import language.fruit.tasks 6 | 7 | 8 | include "t5x/configs/runs/eval.gin" 9 | include "t5x/examples/t5/t5_1_0/small.gin" 10 | 11 | 12 | RestoreCheckpointConfig.mode = "specific" 13 | utils.DatasetConfig.split = "test" 14 | utils.DatasetConfig.batch_size = 1024 15 | 16 | # Ensure truncation 17 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 18 | utils.DatasetConfig.task_feature_lengths = %TASK_FEATURE_LENGTHS 19 | -------------------------------------------------------------------------------- /language/fruit/t5x/configs/t5_small_finetune.gin: -------------------------------------------------------------------------------- 1 | # Finetune pre-trained T5 Small on WikiDiff 2 | 3 | import language.fruit.tasks 4 | 5 | include "t5x/configs/runs/finetune.gin" 6 | include "t5x/examples/t5/t5_1_0/small.gin" 7 | 8 | 9 | TASK_FEATURE_LENGTHS = {"inputs": 1024, "targets": 512} 10 | TRAIN_STEPS = 1_030_700  # TODO(rloganiv): Enough? 11 | INITIAL_CHECKPOINT_PATH = %gin.REQUIRED 12 | USE_CACHED_TASKS = False 13 | 14 | # NOTE: When fine-tuning the public T5 checkpoints (trained in T5 MeshTF) 15 | # the loss normalizing factor should be set to 1024 * 228 (pretraining 16 | # batch_size * target_token_length). 
17 | LOSS_NORMALIZING_FACTOR = 233472 18 | 19 | # Ensure truncation during inference 20 | infer_eval/utils.DatasetConfig: 21 | task_feature_lengths = %TASK_FEATURE_LENGTHS 22 | -------------------------------------------------------------------------------- /language/fruit/testdata/test.diff.tfrecords: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/fruit/testdata/test.diff.tfrecords -------------------------------------------------------------------------------- /language/fruit/testdata/test_annotations.jsonl: -------------------------------------------------------------------------------- 1 | {"target":{"normalized_inputs": "foo", "inputs": "bar [CONTEXT] bizz", "normalized_targets": "baz"}, "prediction": {"normalized_targets": "buzz"}} 2 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_article_pairs.jsonl: -------------------------------------------------------------------------------- 1 | {"source_article": {"title": "foo", "ns": "0", "id": 0, "text": "is foo", "entities": [], "added_entities": []}, "target_article": {"title": "foo", "ns": "0", "id": 0, "text": "you is foo", "entities": [{"id": "you", "start": 0, "end": 3}], "added_entities": [{"id": "you", "start": 0, "end": 3}]}, "updated": true, "annotated_mentions": [{"mention": {"title": "you", "section": "INTRODUCTION", "text": "foo is you", "entities": [{"id": "foo", "start": 0, "end": 3}], "added_entities": [{"id": "foo", "start": 0, "end": 3}]}, "label": 1}]} 2 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_redirects.tsv: -------------------------------------------------------------------------------- 1 | foo bar 2 | bar bar 3 | baz baz 4 | qux quux 5 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_source_articles.jsonl: -------------------------------------------------------------------------------- 1 | {"title": "foo", "ns": "0", "id": 0, "redirect": "bar", "text": "not important."} 2 | {"title": "bar", "ns": "0", "id": 1, "text": "[[baz]] with an r."} 3 | {"title": "baz", "ns": "0", "id": 2, "text": "[[bar|Bar]] with a z."} 4 | {"title": "qux", "ns": "0", "id": 3, "redirect": "quux", "text": "not important"} 5 | -------------------------------------------------------------------------------- /language/fruit/testdata/test_target_articles.jsonl: -------------------------------------------------------------------------------- 1 | {"title": "foo", "ns": "0", "id": 0, "redirect": "bar", "text": "not important."} 2 | {"title": "bar", "ns": "0", "id": 1, "text": "[[baz]] with an r. [[new ent]]."} 3 | {"title": "baz", "ns": "0", "id": 2, "text": "[[bar|Bar]] with a z. [[new ent]]."} 4 | {"title": "qux", "ns": "0", "id": 3, "redirect": "quux", "text": "not important"} 5 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/model/model_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Util functions for modeling.""" 16 | 17 | import jax.numpy as jnp 18 | 19 | 20 | def shift_right(x, axis=1): 21 |   """Drop the leading element on axis 1 and zero-pad at the end (values move one step earlier).""" 22 |   pad_widths = [(0, 0)] * len(x.shape) 23 |   pad_widths[axis] = (0, 1) 24 |   padded = jnp.pad( 25 |       x[:, 1:], pad_widths, mode='constant', constant_values=x.dtype.type(0)) 26 |   return padded 27 | 28 | 29 | def shift_left(x, axis=1): 30 |   """Drop the trailing element on axis 1 and zero-pad at the end (other values keep their positions).""" 31 |   pad_widths = [(0, 0)] * len(x.shape) 32 |   pad_widths[axis] = (0, 1) 33 |   padded = jnp.pad( 34 |       x[:, :-1], pad_widths, mode='constant', constant_values=x.dtype.type(0)) 35 |   return padded 36 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/predict_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for predict.""" 16 | 17 | import tempfile 18 | 19 | from language.gscan.xattn_model import predict 20 | from language.gscan.xattn_model import test_utils 21 | import tensorflow as tf 22 | 23 | 24 | class PredictTest(tf.test.TestCase): 25 | 26 |   def setUp(self): 27 |     super().setUp() 28 |     tf.config.experimental.set_visible_devices([], 'GPU') 29 | 30 |   def test_train_and_evaluate(self): 31 |     config = test_utils.get_test_config() 32 |     # Create a temporary directory where tensorboard metrics are written. 
33 | workdir = tempfile.mkdtemp() 34 | predict.predict_and_evaluate(workdir=workdir, config=config) 35 | 36 | 37 | if __name__ == '__main__': 38 | tf.test.main() 39 | -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/train.tfrecord: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/gscan/xattn_model/testdata/train.tfrecord -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/training_input_vocab.txt: -------------------------------------------------------------------------------- 1 | { 2 | "sos_token": "", 3 | "eos_token": "", 4 | "pad_token": "", 5 | "idx_to_word": [ 6 | "", 7 | "", 8 | "", 9 | "walk", 10 | "to", 11 | "a", 12 | "red", 13 | "circle", 14 | "green", 15 | "square", 16 | "yellow", 17 | "blue", 18 | "big", 19 | "small" 20 | ], 21 | "word_to_idx": { 22 | "": 0, 23 | "": 1, 24 | "": 2, 25 | "walk": 3, 26 | "to": 4, 27 | "a": 5, 28 | "red": 6, 29 | "circle": 7, 30 | "green": 8, 31 | "square": 9, 32 | "yellow": 10, 33 | "blue": 11, 34 | "big": 12, 35 | "small": 13 36 | }, 37 | "word_frequencies": { 38 | "walk": 1531, 39 | "to": 1531, 40 | "a": 1531, 41 | "red": 65, 42 | "circle": 869, 43 | "green": 146, 44 | "square": 662, 45 | "yellow": 78, 46 | "blue": 140, 47 | "big": 336, 48 | "small": 289 49 | } 50 | } -------------------------------------------------------------------------------- /language/gscan/xattn_model/testdata/training_target_vocab.txt: -------------------------------------------------------------------------------- 1 | { 2 | "sos_token": "", 3 | "eos_token": "", 4 | "pad_token": "", 5 | "idx_to_word": [ 6 | "", 7 | "", 8 | "", 9 | "turn right", 10 | "walk", 11 | "turn left" 12 | ], 13 | "word_to_idx": { 14 | "": 0, 15 | "": 1, 16 | "": 2, 17 | "turn right": 3, 18 | "walk": 4, 19 | "turn left": 5 20 | }, 21 | "word_frequencies": { 22 | "turn right": 755, 23 | "walk": 4794, 24 | "turn left": 1361 25 | } 26 | } -------------------------------------------------------------------------------- /language/gscan/xattn_model/train_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for train.""" 16 | 17 | import tempfile 18 | 19 | from language.gscan.xattn_model import test_utils 20 | from language.gscan.xattn_model import train 21 | 22 | import tensorflow as tf 23 | 24 | 25 | class TrainTest(tf.test.TestCase): 26 | 27 | def setUp(self): 28 | super().setUp() 29 | tf.config.experimental.set_visible_devices([], 'GPU') 30 | 31 | def test_train_and_evaluate(self): 32 | config = test_utils.get_test_config() 33 | # Create a temporary directory where tensorboard metrics are written. 
34 |     workdir = tempfile.mkdtemp() 35 |     train.train_and_evaluate(workdir=workdir, config=config) 36 | 37 | 38 | if __name__ == '__main__': 39 |   tf.test.main() 40 | -------------------------------------------------------------------------------- /language/labs/README: -------------------------------------------------------------------------------- 1 | The "labs" folder contains projects that are works-in-progress but 2 | may still be useful to a broader community, for example, partial code that 3 | results from internships with the Google AI Language team. 4 | 5 | If you use any of this code in your work, please contact the contributors 6 | in the corresponding README to find the appropriate citation. 7 | -------------------------------------------------------------------------------- /language/labs/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/bin/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/data_generators/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/models/losses_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for losses.py.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | from language.labs.consistent_zero_shot_nmt.models import losses 22 | import tensorflow.compat.v1 as tf 23 | 24 | 25 | class LossesTest(tf.test.TestCase): 26 | """Tests for losses.""" 27 | 28 | def test_l2_distance(self): 29 | """Tests l2 distance.""" 30 | with tf.Graph().as_default(): 31 | x = [1.0, 2.0] 32 | y = [3.0, 4.0] 33 | dist = losses.l2_distance(x=x, y=y) 34 | normalize_dist = losses.l2_distance(x=x, y=y, normalize=True) 35 | with tf.Session("") as sess: 36 | tf_dist, tf_normalize_dist = sess.run([dist, normalize_dist]) 37 | self.assertAllClose([tf_dist, tf_normalize_dist], [8.0, 0.0322602]) 38 | 39 | 40 | if __name__ == "__main__": 41 | tf.test.main() 42 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/modules/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Base functionality ofr modules.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import abc 22 | 23 | import six 24 | import tensorflow.compat.v1 as tf 25 | 26 | 27 | __all__ = ["AbstractNMTModule"] 28 | 29 | 30 | @six.add_metaclass(abc.ABCMeta) 31 | class AbstractNMTModule(object): 32 | """Abstract base class for neural machine translation modules.""" 33 | 34 | def __init__(self, name): 35 | """Creates a new NMT module. 36 | 37 | Args: 38 | name: String used as the scope name of the module's subgraph. 
39 | """ 40 | self.name = name 41 | 42 | def __call__(self, reuse=None, **kwargs): 43 | with tf.variable_scope(self.name, reuse=reuse): 44 | outputs = self._build(**kwargs) 45 | return outputs 46 | 47 | @abc.abstractmethod 48 | def _build(self, **kwargs): 49 | """Must be implemented by a subclass.""" 50 | raise NotImplementedError("Abstract Method") 51 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_europarl.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | ORIG_DATA_PATH="${EXP_DATASET_DIR}/original" 24 | OVERLAP_DATA_PATH="${EXP_DATASET_DIR}/overlap" 25 | TFRECORD_DATA_PATH="${EXP_DATASET_DIR}/tfrecords" 26 | TMP_DIR="${EXP_DATASET_DIR}/tmp" 27 | 28 | mkdir -p $TFRECORD_DATA_PATH $TMP_DIR 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 31 | --data_dir=${TFRECORD_DATA_PATH} \ 32 | --europarl_orig_data_path=${ORIG_DATA_PATH} \ 33 | --europarl_overlap_data_path=${OVERLAP_DATA_PATH} \ 34 | --problem=${EXP_PROBLEM_NAME} \ 35 | --tmp_dir=${TMP_DIR} \ 36 | --alsologtostderr 37 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_iwslt17.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | ORIG_DATA_PATH="${EXP_DATASET_DIR}/original" 24 | OVERLAP_DATA_PATH="${EXP_DATASET_DIR}/overlap" 25 | TFRECORD_DATA_PATH="${EXP_DATASET_DIR}/tfrecords" 26 | TMP_DIR="${EXP_DATASET_DIR}/tmp" 27 | 28 | mkdir -p $TFRECORD_DATA_PATH $TMP_DIR 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 31 | --data_dir=${TFRECORD_DATA_PATH} \ 32 | --iwslt17_orig_data_path=${ORIG_DATA_PATH} \ 33 | --iwslt17_overlap_data_path=${OVERLAP_DATA_PATH} \ 34 | --problem=${EXP_PROBLEM_NAME} \ 35 | --tmp_dir=${TMP_DIR} \ 36 | --alsologtostderr 37 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/datagen_uncorpus.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | PROBLEM=translate_uncorpus_exp1_lm 18 | DATA_DIR=$1 19 | TMP_DIR=$2 20 | UNCORPUS_ORIG_DATA_EXP1=$3 21 | UNCORPUS_ORIG_DATA_EXP1_LM=$4 22 | UNCORPUS_ORIG_DATA_EXP2=$5 23 | UNCORPUS_ORIG_DATA_EXP2_LM=$6 24 | 25 | mkdir -p $DATA_DIR $TMP_DIR 26 | 27 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_datagen \ 28 | --data_dir=$DATA_DIR \ 29 | --uncorpus_orig_data_exp1=$UNCORPUS_ORIG_DATA_EXP1 \ 30 | --uncorpus_orig_data_exp1_lm=$UNCORPUS_ORIG_DATA_EXP1_LM \ 31 | --uncorpus_orig_data_exp2=$UNCORPUS_ORIG_DATA_EXP2 \ 32 | --uncorpus_orig_data_exp2_lm=$UNCORPUS_ORIG_DATA_EXP2_LM \ 33 | --problem=$PROBLEM \ 34 | --tmp_dir=$TMP_DIR \ 35 | --alsologtostderr 36 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/scripts/run_nmt_experiment.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/usr/bin/env bash 16 | 17 | set -e 18 | 19 | # Parse cmd arguments. 20 | SCRIPTS_DIR="$( dirname "${BASH_SOURCE[0]}" )" 21 | source "${SCRIPTS_DIR}/parse-args.sh" 22 | 23 | rm -rf ${EXP_OUTPUT_DIR} 24 | 25 | # Additional parameters. 26 | EXP_HPARAMS="" 27 | EXP_TRAIN_STEPS=1000000 28 | EXP_LOCAL_EVAL_FREQ=500 29 | 30 | python -m language.labs.consistent_zero_shot_nmt.bin.t2t_trainer \ 31 | --problem=${EXP_PROBLEM_NAME} \ 32 | --model=${EXP_MODEL_NAME} \ 33 | --hparams=${EXP_HPARAMS} \ 34 | --hparams_set=${EXP_CONF_NAME} \ 35 | --data_dir=${EXP_DATASET_DIR}/tfrecords \ 36 | --train_steps=${EXP_TRAIN_STEPS} \ 37 | --output_dir=${EXP_OUTPUT_DIR} \ 38 | --local_eval_frequency=${EXP_LOCAL_EVAL_FREQ} \ 39 | --schedule=train_and_evaluate \ 40 | --alsologtostderr 41 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/consistent_zero_shot_nmt/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Common utilities.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | # Attention types. 23 | ATT_LUONG = "luong" 24 | ATT_LUONG_SCALED = "luong_scaled" 25 | ATT_BAHDANAU = "bahdanau" 26 | ATT_BAHDANAU_NORM = "bahdanau_norm" 27 | ATT_TYPES = (ATT_LUONG, ATT_LUONG_SCALED, ATT_BAHDANAU, ATT_BAHDANAU_NORM) 28 | 29 | # Encoder types. 
30 | ENC_UNI = "uni" 31 | ENC_BI = "bi" 32 | ENC_GNMT = "gnmt" 33 | ENC_TYPES = (ENC_UNI, ENC_BI, ENC_GNMT) 34 | 35 | # Decoder types. 36 | DEC_BASIC = "basic" 37 | DEC_ATTENTIVE = "attentive" 38 | DEC_TYPES = (DEC_BASIC, DEC_ATTENTIVE) 39 | 40 | 41 | # Language model types. 42 | LM_L2R = "left2right" 43 | LM_TYPES = (LM_L2R,) 44 | -------------------------------------------------------------------------------- /language/labs/drkit/README.md: -------------------------------------------------------------------------------- 1 | ## Multi-Hop Reasoning over a Virtual KB 2 | 3 | This repository contains the code for running multi-hop reasoning templates 4 | against a Virtual Knowledge Base (KB). A virtual KB is an index of contextual 5 | representations of entity mentions in text (here Wikipedia). Multi-hop 6 | reasoning is done using a combination of maximum inner product search (MIPS) 7 | over the index, followed by sparse matrix operations. The templates are 8 | used to answer natural language questions from three benchmarks. 9 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/scripts/run_demo.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | TEST_DIR="data/tiny-preprocessed-corpus" 19 | DRKIT_DIR="models/multihop" 20 | BERT_CKPT="models/answer" 21 | PASSAGES="data/tiny-wiki.json" 22 | OUTPUT="/tmp/demo" 23 | WEB="language/labs/drkit/hotpotqa/web" 24 | 25 | python language.labs.drkit.hotpotqa.demo \ 26 | --vocab_file $BERT_DIR/vocab.txt \ 27 | --bert_config_file $BERT_DIR/bert_config.json \ 28 | --output_dir $OUTPUT \ 29 | --init_checkpoint $DRKIT_DIR \ 30 | --hotpot_init_checkpoint $BERT_CKPT \ 31 | --raw_passages $PASSAGES \ 32 | --train_data_dir $TEST_DIR \ 33 | --model_type "hotpotqa" \ 34 | --sparse_strategy "sparse_first" \ 35 | --num_hops 2 \ 36 | --port 8888 \ 37 | --web_path $WEB \ 38 | --logtostderr 39 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/scripts/run_hotpotqa_answer.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | HOTPOT_DIR="hotpot" 19 | OUTPUT="models/answer" 20 | 21 | # Train answer extraction model. 22 | python -m language.labs.drkit.hotpotqa.answer_extractor \ 23 | --vocab_file $BERT_DIR/vocab.txt \ 24 | --bert_config_file $BERT_DIR/bert_config.json \ 25 | --init_checkpoint $BERT_DIR/bert_model.ckpt \ 26 | --output_dir $OUTPUT \ 27 | --train_file $HOTPOT_DIR/hotpot_train_v1.1.json \ 28 | --predict_file $HOTPOT_DIR/hotpot_dev_distractor_v1.json \ 29 | --do_train=True \ 30 | --do_predict=True \ 31 | --train_batch_size 32 \ 32 | --num_train_epochs 5.0 \ 33 | --use_tpu=False \ 34 | --logtostderr 35 | -------------------------------------------------------------------------------- /language/labs/drkit/hotpotqa/web/static/drkit.css: -------------------------------------------------------------------------------- 1 | body { 2 | color: #5f6368; 3 | font-family: 'Google Sans', Arial, Helvetica, sans-serif; 4 | font-size: 16px; 5 | } 6 | 7 | div.page-background-image { 8 | padding: 20px; 9 | background-image: url(https://ai.google/static/images/about/about_hero.jpg); 10 | } 11 | 12 | /* Styling for the sub-card that holds the options controls. */ 13 | div.results { 14 | width: 800px; 15 | padding: 20px; 16 | } 17 | div.results table tr td.results-field-label { 18 | width: 220px; 19 | } 20 | div.results table tr td.results-field-textbox div { 21 | margin-right: 80px; 22 | width: 100px; 23 | } 24 | 25 | .answer table { 26 | border-spacing: 50px; 27 | border-collapse: separate; 28 | border: 1px solid grey; 29 | } 30 | 31 | -------------------------------------------------------------------------------- /language/labs/drkit/metaqa/scripts/index_metaqa_corpus.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | BERT_BASE_DIR="wwm_uncased_L-24_H-1024_A-16/" 18 | DATA_DIR="data/preprocessed" 19 | PRETRAIN_DIR="models/pretraining" 20 | 21 | for HOP in "1" "2" "3"; do 22 | 23 | # Index training corpus. 
24 | python -m language.labs.drkit.wikidata.index \ 25 | --data_dir $DATA_DIR \ 26 | --qry_dir "$DATA_DIR/$HOP-hop/" \ 27 | --vocab_file=$BERT_BASE_DIR/vocab.txt \ 28 | --bert_config_file=$BERT_BASE_DIR/bert_config.json \ 29 | --multihop_output_dir="$DATA_DIR/$HOP-hop/indexed" \ 30 | --predict_batch_size=32 \ 31 | --output_dir="$PRETRAIN_DIR/$HOP-hop" \ 32 | --projection_dim=200 \ 33 | --pretrain_dir="$PRETRAIN_DIR/$HOP-hop" \ 34 | --max_seq_length 256 \ 35 | --logtostderr 36 | 37 | done 38 | -------------------------------------------------------------------------------- /language/labs/drkit/metaqa/scripts/run_metaqa_pretraining.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | set -e 18 | 19 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 20 | INIT_CKPT="../wikidata/models/pretraining/model.ckpt-?????" 21 | DATA_DIR="data/pretraining/2-hop" 22 | OUTPUT_DIR="models/pretraining/2-hop" 23 | 24 | python -m language.labs.drkit.run_dualencoder_lsf \ 25 | --vocab_file=$BERT_DIR/vocab.txt \ 26 | --bert_config_file=$BERT_DIR/bert_config.json \ 27 | --init_checkpoint=$INIT_CKPT \ 28 | --do_train=True \ 29 | --train_file=$DATA_DIR/train.json \ 30 | --do_predict=False \ 31 | --do_test=True \ 32 | --test_file=$DATA_DIR/dev.json \ 33 | --output_dir=$OUTPUT_DIR \ 34 | --projection_dim=200 \ 35 | --train_batch_size 48 \ 36 | --num_train_epochs 12.0 \ 37 | --max_seq_length 256 \ 38 | --logtostderr 39 | -------------------------------------------------------------------------------- /language/labs/drkit/wikidata/scripts/run_wikidata_pretraining.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #!/bin/bash 16 | 17 | set -e 18 | 19 | BERT_DIR="wwm_uncased_L-24_H-1024_A-16/" 20 | DATA_DIR="data/pretraining" 21 | OUTPUT_DIR="models/pretraining" 22 | 23 | python -m language.labs.drkit.run_dualencoder_lsf \ 24 | --vocab_file=$BERT_DIR/vocab.txt \ 25 | --bert_config_file=$BERT_DIR/bert_config.json \ 26 | --init_checkpoint=$BERT_DIR/bert_model.ckpt \ 27 | --do_train=True \ 28 | --train_file=$DATA_DIR/train.json \ 29 | --do_predict=False \ 30 | --do_test=True \ 31 | --test_file=$DATA_DIR/dev.json \ 32 | --output_dir=$OUTPUT_DIR \ 33 | --projection_dim=200 \ 34 | --logtostderr 35 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/docs/giga_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dataset = "giga" 2 | use_bpe = true 3 | vocab_size = 26000 4 | use_copy = false 5 | reuse_attention = false 6 | random_neighbor = false 7 | use_cluster = false 8 | encode_neighbor = true 9 | sum_neighbor = false 10 | att_neighbor = false 11 | binary_neighbor = false 12 | binary_dim = 0 13 | neighbor_dim = 32 14 | num_neighbors = 10 15 | max_enc_steps = 1000 16 | max_dec_steps = 50 17 | max_grad_norm = 1.0 18 | num_eval_steps = 10000 19 | save_checkpoints_steps = 5000 20 | lr_schedule = 240000 21 | total_steps = 1000000 22 | beam_width = 10 23 | length_norm = 1.0 24 | coverage_penalty = 0. 
25 | batch_size = 64 26 | rnn_cell = "hyper_lstm" 27 | att_type = "luong" 28 | use_bridge = true 29 | use_residual = true 30 | trainer = "adam" 31 | num_mlp_layers = 1 32 | sampling_probability = 0.0 33 | sample_neighbor = false 34 | weight_decay = 1e-2 35 | tie_embedding = true 36 | decoder_drop = 0.0 37 | num_decoder_layers = 1 38 | sigma_norm = 16.0 39 | learning_rate = 1e-3 40 | emb_dim = 256 41 | num_encoder_layers = 3 42 | encoder_dim = 256 43 | drop = 0.15 44 | emb_drop = [0.15, 0.25, 0.35] // (one of these is optimal) 45 | out_drop = [0.15, 0.25, 0.35] // (one of these is optimal) 46 | encoder_drop = [0.0, 0.1] // (one of these is optimal) 47 | decoder_dim = 256 48 | rank = 256 49 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/docs/nyt_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dataset = "nyt" 2 | sigma_norm = 1.0 3 | weight_decay = 1e-2 4 | tie_embedding = true 5 | learning_rate = 1e-3 6 | emb_dim = 300 7 | num_encoder_layers = [1, 2] // (one of these is optimal) 8 | encoder_dim = 300 9 | drop = 0.2 10 | emb_drop = [0.15, 0.25] // (one of these is optimal) 11 | out_drop = [0.15, 0.25] // (one of these is optimal) 12 | encoder_drop = 0.0 13 | decoder_drop = 0.0 14 | num_decoder_layers = 1 15 | decoder_dim = 300 16 | rank = 300 17 | use_bpe = true 18 | vocab_size = 11000 // 11000 if using bpe, else 124500 19 | use_copy = true 20 | reuse_attention = false 21 | random_neighbor = false 22 | use_cluster = false 23 | encode_neighbor = true 24 | sum_neighbor = false 25 | att_neighbor = true 26 | binary_neighbor = false 27 | neighbor_dim = 150 28 | num_neighbors = 10 29 | max_enc_steps = 750 30 | max_dec_steps = 400 31 | beam_width = 10 32 | max_grad_norm = 0.1 33 | num_eval_steps = 10000 34 | save_checkpoints_steps = 500 35 | lr_schedule = 10000 36 | total_steps = 150000 37 | cp = 0.0 38 | length_norm = 1.0 39 | batch_size = 64 40 | rnn_cell = "hyper_lstm" 41 | att_type = "my" 42 | use_bridge = true 43 | use_residual = true 44 | trainer = "adam" 45 | sampling_probability = 0.25 46 | num_mlp_layers = 1 47 | sample_neighbor = false 48 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/labs/exemplar_decoding/utils/tensor_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tests for tensor_utils.py.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | from language.labs.exemplar_decoding.utils import tensor_utils 22 | 23 | import tensorflow.compat.v1 as tf 24 | 25 | 26 | class TensorUtilsTest(tf.test.TestCase): 27 | 28 | def test_linear_interpolation(self): 29 | with tf.Graph().as_default(): 30 | result = tensor_utils.linear_interpolation([1, 2, 3, 4, 5], 2, 10) 31 | with tf.Session("") as sess: 32 | tf_result = sess.run(result) 33 | self.assertAllEqual(tf_result, [2, 4, 6, 8, 10]) 34 | 35 | 36 | if __name__ == "__main__": 37 | tf.test.main() 38 | -------------------------------------------------------------------------------- /language/labs/memory/README: -------------------------------------------------------------------------------- 1 | This project consists of experimental code to investigate a few different 2 | memory mechanisms on synthetic baselines. 
3 | 4 | Contributors: 5 | 6 | * Jessy Lin (jessylin@) 7 | * David Weiss (djweiss@) 8 | * Eugene Ie (eugeneie@) 9 | * Zora Tung (gatoatigrado@) 10 | -------------------------------------------------------------------------------- /language/labs/memory/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/README.md: -------------------------------------------------------------------------------- 1 | # MentionMemory model 2 | 3 | This repository contains the code for the MentionMemory project. 4 | 5 | ## Requirements 6 | 7 | ``` 8 | git clone https://github.com/google-research/language 9 | pip install -r language/mentionmemory/requirements.txt 10 | ``` 11 | 12 | Unit tests can be run via: 13 | 14 | ```bash 15 | python -m language.mentionmemory.run_tests 16 | ``` 17 | 18 | Note that these tests might need to be run independently: 19 | 20 | ```bash 21 | python -m language.mentionmemory.encoders.mention_memory_encoder_test 22 | python -m language.mentionmemory.encoders.readtwice_encoder_test 23 | python -m language.mentionmemory.modules.kmeans_test 24 | python -m language.mentionmemory.modules.memory_attention_layer_test 25 | python -m language.mentionmemory.modules.memory_extraction_layer_test 26 | python -m language.mentionmemory.modules.mention_losses_test 27 | python -m language.mentionmemory.tasks.mention_memory_task_test 28 | python -m language.mentionmemory.tasks.readtwice_task_test 29 | python -m language.mentionmemory.training.trainer_test 30 | python -m language.mentionmemory.utils.data_utils_test 31 | ``` 32 | 33 | When running the unit tests and all Python commands mentioned later, the current working directory must be the root of the git project. 34 | 35 | ## Pre-trained models. 36 | -------------------------------------------------------------------------------- /language/mentionmemory/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Find and register unittests.
16 | 17 | See https://docs.python.org/3/library/unittest.html#load-tests-protocol 18 | for details or 19 | https://github.com/python/cpython/blob/main/Lib/unittest/test/__main__.py 20 | for a sample implementation. 21 | """ 22 | 23 | import os 24 | 25 | 26 | def load_tests(loader, standard_tests, unused_pattern): 27 | """Our tests end in `_test.py`, so we need to override the test discovery.""" 28 | this_dir = os.path.dirname(__file__) 29 | package_tests = loader.discover(start_dir=this_dir, pattern="*_test.py") 30 | standard_tests.addTests(package_tests) 31 | return standard_tests 32 | -------------------------------------------------------------------------------- /language/mentionmemory/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/encoders/import_encoders.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Import encoders so that decorated encoders are added to registry.""" 16 | 17 | # pylint: disable=unused-import 18 | from language.mentionmemory.encoders import bert_encoder 19 | from language.mentionmemory.encoders import eae_encoder 20 | from language.mentionmemory.encoders import mauto_encoder 21 | from language.mentionmemory.encoders import mention_memory_encoder 22 | from language.mentionmemory.encoders import readtwice_encoder 23 | 24 | # pylint: enable=unused-import 25 | -------------------------------------------------------------------------------- /language/mentionmemory/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py>=0.10.0 2 | clu>=0.0.3 3 | flax>=0.3.4 4 | jax>=0.2.14 5 | ml_collections>=0.1 6 | numpy>=1.16 7 | spacy>=3.1.2 8 | scikit-learn>=0.24.2 9 | scipy>=1.2.1 10 | tensorflow>=1.15.0 11 | -------------------------------------------------------------------------------- /language/mentionmemory/run.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | # Install dependencies and run tests. 20 | 21 | virtualenv -p python3 . 22 | source ./bin/activate 23 | 24 | pip install tensorflow 25 | pip install -r language/mentionmemory/requirements.txt 26 | python -m language.mentionmemory.run_tests 27 | -------------------------------------------------------------------------------- /language/mentionmemory/run_tests.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Find and run the tests. 16 | 17 | Run as: python -m language.mentionmemory.run_tests 18 | """ 19 | from absl.testing import absltest 20 | import language.mentionmemory 21 | 22 | absltest.main(module=language.mentionmemory) 23 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/import_tasks.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Import tasks so that decorated tasks are added to registry.""" 16 | 17 | # pylint: disable=unused-import 18 | # Block of imports needed to allow different tasks to get registered 19 | # with task registry. 
20 | from language.mentionmemory.tasks import eae_task 21 | from language.mentionmemory.tasks import embedding_based_entity_qa_task 22 | from language.mentionmemory.tasks import example_task 23 | from language.mentionmemory.tasks import mauto_task 24 | from language.mentionmemory.tasks import mention_based_entity_qa_task 25 | from language.mentionmemory.tasks import mention_memory_task 26 | from language.mentionmemory.tasks import readtwice_task 27 | from language.mentionmemory.tasks import relation_classifier_task 28 | from language.mentionmemory.tasks import text_classifier 29 | from language.mentionmemory.tasks import ultra_fine_entity_typing_task 30 | -------------------------------------------------------------------------------- /language/mentionmemory/tasks/testdata/tacred/README.md: -------------------------------------------------------------------------------- 1 | Sample predictions (test set) for SpanBERT were downloaded from 2 | https://github.com/DFKI-NLP/tacrev/blob/master/results/test_results/spanbert_tacred_test.txt 3 | -------------------------------------------------------------------------------- /language/mentionmemory/training/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/custom_types.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Contains custom type definitions.""" 16 | from typing import Any, Callable, Dict, Sequence 17 | 18 | import jax.numpy as jnp 19 | 20 | Array = jnp.ndarray 21 | PRNGKey = jnp.ndarray 22 | Dtype = Any 23 | Shape = Sequence[int] 24 | InitType = Callable[[PRNGKey, Shape, Dtype], Array] 25 | MetricGroups = Dict[str, Dict[str, Array]] 26 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/default_values.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Gather default values in central location.""" 16 | 17 | import flax.linen as nn 18 | 19 | from language.mentionmemory.utils import initializers 20 | 21 | kernel_init = initializers.truncated_normal(stddev=0.02) 22 | bias_init = nn.initializers.zeros 23 | layer_norm_epsilon = 1e-12 24 | 25 | CLS_TOKEN = 101 26 | SEP_TOKEN = 102 27 | MASK_TOKEN = 103 28 | ENTITY_START_TOKEN = 1 29 | ENTITY_END_TOKEN = 2 30 | 31 | # Value typically used to prevent division by zero. 32 | SMALL_NUMBER = 1e-8 33 | # Large value used as an effective infinity (e.g., for masking). 34 | LARGE_NUMBER = 1e10 35 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/initializers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Contains custom parameter initializers.""" 16 | 17 | import jax 18 | import jax.numpy as jnp 19 | 20 | from language.mentionmemory.utils.custom_types import Array, Dtype, InitType, PRNGKey, Shape # pylint: disable=g-multiple-import 21 | 22 | 23 | def truncated_normal(stddev: float) -> InitType: 24 | """Truncated normal initializer.""" 25 | 26 | def init(key: PRNGKey, shape: Shape, dtype: Dtype = jnp.float32) -> Array: 27 | return jax.random.truncated_normal( 28 | key=key, lower=-2., upper=2., shape=shape, dtype=dtype) * stddev 29 | 30 | return init 31 | -------------------------------------------------------------------------------- /language/mentionmemory/utils/testdata/eae_paper-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/mentionmemory/utils/testdata/eae_paper-00000-of-00001 -------------------------------------------------------------------------------- /language/mentionmemory/utils/testdata/mtb.v5-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/mentionmemory/utils/testdata/mtb.v5-00000-of-00001 -------------------------------------------------------------------------------- /language/multivec/requirements.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | tf-hub-nightly 3 | scann 4 | -------------------------------------------------------------------------------- /language/multivec/utils/download.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | mkdir $DATA_DIR 17 | cd $DATA_DIR 18 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz 19 | wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz 20 | tar -xf collectionandqueries.tar.gz 21 | tar -xf top1000.dev.tar.gz 22 | 23 | -------------------------------------------------------------------------------- /language/nqg/README.md: -------------------------------------------------------------------------------- 1 | This directory has moved to `../compgen/nqg/`. 2 | 3 | https://github.com/google-research/language/tree/master/language/compgen/nqg 4 | -------------------------------------------------------------------------------- /language/nql/demos/data/royal92/README.md: -------------------------------------------------------------------------------- 1 | # royal92 data 2 | 3 | This file contains instances of 12 familial relations which were extracted from royal92.ged, a widely-distributed public domain GEDCOM file containing 4 | information on 3010 individuals and 1422 families of European royalty. 
The 5 | 12 relations are those used originally in (Hinton, 1986). The parser used to convert the data was modified from code distributed with (Yang et al., 2017). 6 | 7 | Comment from original data source on http://www.daml.org/2001/01/gedcom/: _royal92.ged is a public domain GEDCOM file containing information on 3010 8 | individuals and 1422 families of European royalty. royal92.daml was produced 9 | using ged2daml._ 10 | 11 | ## Bibliography 12 | 13 | * Hinton, G.E. (1986). _Learning distributed representations of concepts._ Proceedings of the Eighth Annual Conference of the Cognitive Science Society. 14 | * Paper URL: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.408.7684&rep=rep1&type=pdf 15 | * Yang, Fan, Zhilin Yang, and William W. Cohen (2017). _Differentiable 16 | learning of logical rules for knowledge base reasoning_, in NeurIPS 2017. 17 | * Paper URL: http://papers.nips.cc/paper/6826-differentiable-learning-of-logical-rules-for-knowledge-base-reasoning 18 | * GitHub URL: https://github.com/fanyangxyz/Neural-LP 19 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/README.txt: -------------------------------------------------------------------------------- 1 | Scalability experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To create something very similar to the figure 1 graphic: 5 | 6 | % cd [this directory] 7 | % bash figure1.bash 8 | 9 | Output will be in $HOME/new-results/figure1.png. 10 | 11 | 12 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/nql/demos/gridworld_scaling/figure1.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # where experimental data will be stored 4 | DATA_DIR=${HOME}/new-results 5 | DATA_STEM=${DATA_DIR}/nql 6 | SOURCE_DIR=`pwd` 7 | 8 | # generate data 9 | 10 | bash ${SOURCE_DIR}/gendata_figure1.bash ${SOURCE_DIR} ${DATA_STEM} ${DATA_DIR} 11 | 12 | # generate plots from data 13 | 14 | python ${SOURCE_DIR}/plot_figure1.py ${DATA_STEM} ${DATA_DIR} 15 | 16 | -------------------------------------------------------------------------------- /language/nql/demos/metaqa/README.txt: -------------------------------------------------------------------------------- 1 | MetaQA experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To reproduce experiment results in Table 3: 5 | 6 | 0. cd [this directory] 7 | 8 | 1.
Download MetaQA datasets from 9 | 10 | https://github.com/yuyuz/MetaQA 11 | 12 | 2. Preprocess data 13 | 14 | python preprocess_data.py 15 | 16 | 3. Run TensorFlow experiments 17 | 18 | MetaQA-2hop: 19 | python metaqa.py --rootdir /home/haitiansun/metaqa --num_hops=2 \ 20 | --train_file=qa_van2_train.exam --dev_file=qa_van2_dev.exam \ 21 | --test_file=qa_van2_test.exam --mask_seeds=False 22 | 23 | MetaQA-3hop: 24 | python metaqa.py --rootdir /home/haitiansun/metaqa --num_hops=3 \ 25 | --train_file=qa_van3_train.exam --dev_file=qa_van3_dev.exam \ 26 | --test_file=qa_van3_test.exam --mask_seeds=False 27 | 28 | Note: you may set --mask_seeds=True for results with "ReifKB + mask" 29 | -------------------------------------------------------------------------------- /language/nql/demos/nell995/README.txt: -------------------------------------------------------------------------------- 1 | Nell995 experiments with NQL for the paper Scalable Neural Methods 2 | for Reasoning With a Symbolic Knowledge Base (ICLR 2020) 3 | 4 | To reproduce experiment results in Table 4: 5 | 6 | 0. cd [this directory] 7 | 8 | 1. Download NELL995 datasets from 9 | 10 | git clone https://github.com/shehzaadzd/MINERVA.git 11 | 12 | 2. Preprocess data 13 | 14 | python preprocess_data.py 15 | 16 | 3. Run TensorFlow experiments 17 | 18 | python nell995.py --rootdir=nell995/ --task=concept_athletehomestadium \ 19 | --num_hops=5 --epochs=50 20 | 21 | Note: you may change --task to run experiments on other queries. 22 | 23 | Available tasks: 24 | concept_agentbelongstoorganization 25 | concept_athletehomestadium 26 | concept_athleteplaysforteam 27 | concept_athleteplaysinleague 28 | concept_athleteplayssport 29 | concept_organizationheadquarteredincity 30 | concept_organizationhiredperson 31 | concept_personborninlocation 32 | concept_personleadsorganization 33 | concept_teamplaysinleague 34 | concept_teamplayssport 35 | concept_worksfor 36 | -------------------------------------------------------------------------------- /language/nql/setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Install Neural Query Language.""" 16 | 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | setup( 21 | name="nql", 22 | version="0.0.1.dev", 23 | packages=find_packages(), 24 | description="Neural Query Language", 25 | author="Google Inc.", 26 | author_email="no-reply@google.com", 27 | url="https://github.com/google-research/language/tree/master/language/nql", 28 | license="Apache 2.0", 29 | install_requires=[ 30 | "tensorflow-gpu", 31 | "scipy", 32 | "mock", 33 | "numpy", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /language/orqa/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ORQA ops.""" 16 | import os 17 | import tensorflow.compat.v1 as tf 18 | 19 | try: 20 | orqa_ops 21 | except NameError: 22 | orqa_ops = tf.load_op_library( 23 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "orqa_ops.so")) 24 | 25 | has_answer = orqa_ops.has_answer 26 | reader_inputs = orqa_ops.reader_inputs 27 | -------------------------------------------------------------------------------- /language/orqa/predict/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/orqa/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow~=2.1.0 2 | tensorflow-text~=2.1.0 3 | tf-models-official==2.1.0.dev2 4 | bert-tensorflow==1.0.4 5 | tf-hub-nightly 6 | Jinja2~=2.11.2 7 | tornado~=4.5.1 8 | wikiextractor==0.1 9 | sentencepiece==0.1.91 10 | beautifulsoup4==4.9.3 11 | lxml==4.6.3 12 | https://storage.googleapis.com/scann/releases/1.0.0/scann-1.0.0-cp37-cp37m-linux_x86_64.whl 13 | -------------------------------------------------------------------------------- /language/orqa/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/orqa/utils/scann_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for scann_utils.py.""" 16 | import os 17 | 18 | from language.orqa.utils import scann_utils 19 | import numpy as np 20 | import tensorflow.compat.v1 as tf 21 | 22 | 23 | class ScannUtilsTest(tf.test.TestCase): 24 | 25 | def test_scann_searcher(self): 26 | temp_dir = self.create_tempdir().full_path 27 | checkpoint_path = os.path.join(temp_dir, "dummy_db.ckpt") 28 | 29 | dummy_db = np.random.uniform(size=[1024, 32]).astype(np.float32) 30 | scann_utils.write_array_to_checkpoint("dummy_db", dummy_db, checkpoint_path) 31 | 32 | dummy_queries = np.random.uniform(size=[4, 32]).astype(np.float32) 33 | _, searcher = scann_utils.load_scann_searcher( 34 | var_name="dummy_db", checkpoint_path=checkpoint_path, num_neighbors=10) 35 | distance, index = searcher.search_batched(dummy_queries) 36 | self.assertAllEqual(distance.numpy().shape, [4, 10]) 37 | self.assertAllEqual(index.numpy().shape, [4, 10]) 38 | 39 | 40 | if __name__ == "__main__": 41 | tf.test.main() 42 | -------------------------------------------------------------------------------- /language/qa_counterfactuals/figure1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/language/865fae65f63ef7e6b2989d4ff8b47f61750415a8/language/qa_counterfactuals/figure1.jpeg -------------------------------------------------------------------------------- /language/qresp/README.md: -------------------------------------------------------------------------------- 1 | Code for [Entity-Centric Query Refinement] (https://arxiv.org/abs/2204.00743) will be released here. 2 | -------------------------------------------------------------------------------- /language/quest/common/document_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for reading and writing documents files.""" 16 | 17 | import dataclasses 18 | 19 | from language.quest.common import jsonl_utils 20 | 21 | 22 | @dataclasses.dataclass(frozen=True) 23 | class Document: 24 | """Represents a document with its title and text.""" 25 | # Document title (should be unique in corpus). 26 | title: str 27 | # Document text. 28 | text: str 29 | 30 | 31 | def read_documents(filepath, limit=None): 32 | documents_json = jsonl_utils.read(filepath, limit=limit, verbose=True) 33 | return [Document(**document) for document in documents_json] 34 | 35 | 36 | def write_documents(filepath, documents): 37 | documents_json = [dataclasses.asdict(document) for document in documents] 38 | jsonl_utils.write(filepath, documents_json) 39 | -------------------------------------------------------------------------------- /language/quest/common/vocab_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Utilities for dealing with T5 sentence piece model.""" 16 | 17 | from sentencepiece import SentencePieceProcessor 18 | 19 | 20 | class T5SpmWrapper(object): 21 | """Wrapper for T5 sentence piece model.""" 22 | 23 | def __init__(self, sp_model): 24 | self.sp = SentencePieceProcessor() 25 | self.sp.Load(sp_model) 26 | 27 | def tokenize(self, input_string): 28 | """Return list of tokens for input.""" 29 | return self.sp.EncodeAsPieces(input_string) 30 | 31 | def truncate(self, input_string, num_tokens): 32 | """Truncate input to be `num_tokens`.""" 33 | tokens = self.sp.EncodeAsPieces(input_string) 34 | truncated_tokens = tokens[:num_tokens] 35 | return self.sp.DecodePieces(truncated_tokens) 36 | -------------------------------------------------------------------------------- /language/quest/eval/README.md: -------------------------------------------------------------------------------- 1 | These scripts expect that systems have produced predictions 2 | following the same jsonl format as the original examples files. Only the `query` and `docs` fields need to be populated 3 | for predictions. 4 | 5 | Use `run_eval.py` to compute average precision, recall, and F1. 6 | 7 | To analyze the average recall and MRecall of a candidate set produced by a retriever prior to thresholding or classifying candidates to produce a final set, use `analyze_retriever.py`. 8 | -------------------------------------------------------------------------------- /language/quest/t5xr/README.md: -------------------------------------------------------------------------------- 1 | We provide data preprocessing scripts to help set up dual 2 | encoder experiments. To run fine-tuning and inference, follow 3 | the instructions in the `t5x_retrieval` library: 4 | 5 | https://github.com/google-research/t5x_retrieval 6 | 7 | You can use `write_doc_idx_maps.py` and `convert_examples.py` to 8 | convert examples and documents jsonl files to the indexed format used by the `t5x_retrieval` library. 9 | -------------------------------------------------------------------------------- /language/quest/xattn/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scripts to train a T5-based cross-attention classifier. 2 | The codebase relies upon the [t5x repository](https://github.com/google-research/t5x). 3 | Follow instructions from that library to define a task, run fine-tuning, and 4 | generate scores at inference time. 5 | 6 | To generate training examples, you can run `gen_training_examples.py`. 7 | This script can also be run on the validation set to generate an evaluation set to efficiently 8 | evaluate model performance during fine-tuning. 9 | 10 | To generate predictions, you should first run `gen_inference_inputs.py`. Then, generate scores following the inference instructions from the `t5x` library with `--gin.infer.mode="'score'"`.
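For reference, the text format of each example follows `INPUT_FORMAT`, `POS_LABEL`, and `NEG_LABEL` from `xattn_utils.py` (shown below). Here is a minimal sketch of how a single example is rendered; the query and document are hypothetical and used only to illustrate the format:

```python
from language.quest.xattn import xattn_utils

# Hypothetical query/document pair, shown only to illustrate the format.
input_string = xattn_utils.INPUT_FORMAT.format(
    query="novels about whaling",
    doc="Moby-Dick Moby-Dick is an 1851 novel by Herman Melville.")
# input_string == "query: novels about whaling, doc: Moby-Dick Moby-Dick is an 1851 novel by Herman Melville."
target_string = xattn_utils.POS_LABEL  # "relevant"; irrelevant pairs use NEG_LABEL.
```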
11 | You can determine a threshold by running `determine_threshold.py` on the validation 12 | set. 13 | Then, you can run `filter_predictions.py` to filter a set of retrieved documents based on the cross-attention classifier. 14 | -------------------------------------------------------------------------------- /language/quest/xattn/xattn_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Common constants and functions for xattn model.""" 16 | 17 | # Labels used for T5 output. 18 | POS_LABEL = "relevant" 19 | NEG_LABEL = "not relevant" 20 | 21 | # Input format for T5. 22 | INPUT_FORMAT = "query: {query}, doc: {doc}" 23 | 24 | 25 | def get_example( 26 | query, 27 | doc_title, 28 | doc_title_to_text, 29 | spm_wrapper, 30 | is_relevant, 31 | context_size 32 | ): 33 | """Returns an (input, output) example tuple for a query/document pair.""" 34 | if doc_title not in doc_title_to_text: 35 | raise Exception("Missing document title: %s" % doc_title) 36 | 37 | doc_text = doc_title + " " + doc_title_to_text[doc_title] 38 | truncated_text = spm_wrapper.truncate(doc_text, context_size) 39 | input_string = INPUT_FORMAT.format( 40 | query=query, doc=truncated_text) 41 | output_string = ( 42 | POS_LABEL if is_relevant else NEG_LABEL) 43 | return (input_string, output_string) 44 | -------------------------------------------------------------------------------- /language/question_answering/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/b2t2/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tensorflow 3 | bert-tensorflow 4 | -------------------------------------------------------------------------------- /language/question_answering/bert_joint/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors.
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /language/question_answering/decatt_docreader/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/realm/preprocessing.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | import "tensorflow/core/example/example.proto"; 4 | 5 | package language.realm; 6 | 7 | service Preprocessing { 8 | // Return a tf.Example given an unused (usually empty) input tf.Example. 9 | rpc PopExample(tensorflow.Example) returns (tensorflow.Example) { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /language/relation_learning/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/relation_learning/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/relation_learning/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /language/search_agents/demo.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Demo for testing the environment server.""" 16 | 17 | from absl import app 18 | from absl import flags 19 | from absl import logging 20 | import grpc 21 | 22 | from language.search_agents import environment_pb2 23 | from language.search_agents import environment_pb2_grpc 24 | 25 | flags.DEFINE_string('server_address', 'localhost:50055', 26 | 'Address of the Environment Server.') 27 | FLAGS = flags.FLAGS 28 | 29 | 30 | def main(_): 31 | channel_creds = grpc.local_channel_credentials() 32 | channel = grpc.secure_channel(FLAGS.server_address, channel_creds) 33 | grpc.channel_ready_future(channel).result(timeout=10) 34 | stub = environment_pb2_grpc.EnvironmentServiceStub(channel) 35 | 36 | request = environment_pb2.GetQueryRequest() 37 | response = stub.GetQuery(request, timeout=10) 38 | logging.info('\n\nReceived GetQueryResponse:\n%s\n', response) 39 | 40 | 41 | if __name__ == '__main__': 42 | app.run(main) 43 | -------------------------------------------------------------------------------- /language/search_agents/muzero/utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tests for language.search_agents.muzero.utils.py.""" 16 | 17 | from language.search_agents.muzero import utils 18 | 19 | import tensorflow as tf 20 | 21 | 22 | class UtilsTest(tf.test.TestCase): 23 | 24 | def test_escape_for_lucene(self): 25 | self.assertEqual(utils.escape_for_lucene("foo:bar-baz"), "foo\\:bar\\-baz") 26 | 27 | 28 | if __name__ == "__main__": 29 | tf.test.main() 30 | -------------------------------------------------------------------------------- /language/search_agents/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | apache_beam 3 | attrs 4 | cloudpickle==1.3.0 5 | grpcio>=1.32.0 6 | grpcio-tools 7 | gym 8 | keras 9 | nltk 10 | numpy 11 | pygtrie 12 | tensorflow==2.4.1 13 | tensorflow-addons 14 | tensorflow-probability==0.11.0 15 | tensorflow-serving-api 16 | tensorflow_text 17 | transformers 18 | -------------------------------------------------------------------------------- /language/serene/constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants for fever data.""" 16 | 17 | 18 | VERIFIABLE = 'VERIFIABLE' 19 | NOT_VERIFIABLE = 'NOT VERIFIABLE' 20 | 21 | # Classes used for claim classification and labeling which evidence 22 | # support/refute the claim 23 | NOT_ENOUGH_INFO = 'NOT ENOUGH INFO' 24 | REFUTES = 'REFUTES' 25 | SUPPORTS = 'SUPPORTS' 26 | FEVER_CLASSES = [REFUTES, SUPPORTS, NOT_ENOUGH_INFO] 27 | 28 | # Classes used for scoring candidate evidence relevance 29 | MATCHING = 'MATCHING' 30 | NOT_MATCHING = 'NOT_MATCHING' 31 | EVIDENCE_MATCHING_CLASSES = [NOT_MATCHING, MATCHING] 32 | 33 | UKP_WIKI = 'ukp_wiki' 34 | UKP_PRED = 'ukp_pred' 35 | UKP_TYPES = [UKP_PRED, UKP_WIKI] 36 | DRQA = 'drqa' 37 | LUCENE = 'lucene' 38 | DOC_TYPES = [UKP_WIKI, UKP_PRED, DRQA, LUCENE] 39 | -------------------------------------------------------------------------------- /language/serene/fever.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package language.google.fact_check; 4 | 5 | message WikipediaDump { 6 | message Entity { 7 | optional string mention = 1; 8 | optional string entity = 2; 9 | } 10 | 11 | message Sentence { 12 | optional string text = 1; 13 | repeated Entity entities = 2; 14 | } 15 | 16 | optional string id = 1; 17 | optional string text = 2; 18 | optional string title = 3; 19 | map sentences = 4; 20 | } 21 | 22 | message FeverExample { 23 | // Format is 24 | // [Annotation ID, Evidence ID, Wikipedia URL, sentence ID] 25 | // see http://fever.ai/2018/task.html#TrainingDevelopment_Data_format_30 26 | // We do not care about the unused Annotation and Evidence IDs, and optionally 27 | // add `text' which represents the actual sentence contents. 
28 | message Evidence { 29 | optional string wikipedia_url = 1; 30 | optional string sentence_id = 2; 31 | 32 | // Not populated in gold data. 33 | optional string sentence = 3; 34 | 35 | optional string page_title = 4; 36 | } 37 | 38 | message EvidenceSet { 39 | repeated Evidence evidence = 1; 40 | } 41 | 42 | enum Label { 43 | UNKNOWN_LABEL = 0; 44 | SUPPORTS = 1; 45 | REFUTES = 2; 46 | NOT_ENOUGH_INFO = 3; 47 | } 48 | 49 | optional string id = 1; 50 | optional Label label = 2; 51 | 52 | optional string claim = 3; 53 | 54 | repeated EvidenceSet evidences = 4; 55 | } 56 | -------------------------------------------------------------------------------- /language/serene/retrieval.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package language.google.fact_check; 4 | 5 | message Document { 6 | optional string doc_id = 1; 7 | optional string content = 2; 8 | optional double ir_score = 3; 9 | } 10 | 11 | message GetDocumentsResponse { 12 | repeated Document documents = 1; 13 | } 14 | 15 | message GetDocumentsRequest { 16 | optional string query = 1; 17 | optional int32 max_num_results = 2; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /language/serene/serene.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Minimal build target for initial checkin.""" 16 | -------------------------------------------------------------------------------- /language/serene/types.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Types for fever data.""" 16 | 17 | import dataclasses 18 | from typing import Any, Dict, List, Optional, Text, Tuple 19 | 20 | Json = Dict[Text, Any] 21 | 22 | 23 | # An evidence set contains a list of tuples, each representing one line 24 | # of evidence. 25 | # First two ints are IDs competition runners use, then the wiki page, then the 26 | # sentence number.
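27 | # For example, a raw json entry like [12345, 67890, "Some_Page", 3] (these 28 | # values are illustrative, not taken from the dataset) corresponds to 29 | # Evidence(annotation_id=12345, evidence_id=67890, wikipedia_url="Some_Page", 30 | # sentence_id=3).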
31 | @dataclasses.dataclass 32 | class Evidence: 33 | annotation_id: Optional[int] 34 | evidence_id: int 35 | # fever_identifier: not actually a url, but page title 36 | wikipedia_url: Optional[Text] 37 | sentence_id: Optional[int] 38 | 39 | 40 | # This must go after Evidence, otherwise python cannot parse it 41 | EvidenceSet = List[Evidence] 42 | EvidenceFromJson = Tuple[Optional[int], int, Optional[Text], Optional[int]] 43 | 44 | 45 | @dataclasses.dataclass 46 | class FeverMetrics: 47 | strict_score: float 48 | accuracy_score: float 49 | precision: float 50 | recall: float 51 | f1: float 52 | n_examples: int 53 | -------------------------------------------------------------------------------- /language/serene/web_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """A simple web api wrapper around the wikipedia sql db. 16 | 17 | This is helpful when trying to query the fever wikipedia dump without needing 18 | to directly access the protobufs. 19 | """ 20 | from absl import app 21 | from absl import flags 22 | import flask 23 | from language.serene import wiki_db 24 | 25 | FLAGS = flags.FLAGS 26 | flags.DEFINE_string('wiki_db_path', None, 'Path to the wikipedia database.') 27 | 28 | 29 | def main(_): 30 | db = wiki_db.WikiDatabase.from_local(FLAGS.wiki_db_path) 31 | flask_app = flask.Flask(__name__) 32 | 33 | @flask_app.route('/wiki_page_sentence', methods=['POST']) 34 | def get_page_sentence(): # pylint: disable=unused-variable 35 | request = flask.request.json 36 | maybe_sentence = db.get_page_sentence(request['wikipedia_url'], 37 | int(request['sentence_id'])) 38 | return flask.jsonify({'text': maybe_sentence}) 39 | 40 | flask_app.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | app.run(main) 45 | -------------------------------------------------------------------------------- /language/table_text_eval/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | -------------------------------------------------------------------------------- /language/templama/install.sh: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #!/bin/bash 16 | 17 | SLING_BASE="$1" 18 | 19 | cd "$SLING_BASE" 20 | 21 | # Install SLING via pip. 22 | sudo -H pip3 install https://ringgaard.com/data/dist/sling-3.0.0-py3-none-linux_x86_64.whl 23 | 24 | # Download SLING KB and en wikipedia mapping. 25 | sling fetch --dataset kb,mapping 26 | -------------------------------------------------------------------------------- /language/templama/templates.csv: -------------------------------------------------------------------------------- 1 | Wikidata ID,Relation,Template 2 | P54,member of sports team,<subject> plays for <object>. 3 | P39,position held,<subject> holds the position of <object>. 4 | P108,employer,<subject> works for <object>. 5 | P102,political party,<subject> is a member of the <object>. 6 | P286,head coach,<object> is the head coach of <subject>. 7 | P69,educated at,<subject> attended <object>. 8 | P488,chairperson,<object> is the chair of <subject>. 9 | P6,head of government,<object> is the head of the government of <subject>. 10 | P127,owned by,<subject> is owned by <object>. 11 | -------------------------------------------------------------------------------- /language/totto/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | -------------------------------------------------------------------------------- /language/totto/eval_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | pytest 3 | sacrebleu 4 | six 5 | wheel 6 | -------------------------------------------------------------------------------- /language/totto/sample/example-3.html: -------------------------------------------------------------------------------- 1 |

[Rendered ToTTo sample page; HTML markup omitted. Recoverable content:]
Demetrius the Fair
Section Title: Sources
Table Section Text: "smith-bio/0198". ancientlibrary.com.
Table:
Demetrius the Fair | Died: 249 BC
Regnal titles
Preceded by Magas | King of Cyrene, 250 BC – 249 BC | Vacant (Republic, under Ptolemaic rule from 246 BC); title next held by Ptolemy VIII Physcon
Sentence(s):
Demetrius was a King of Cyrene.
Demetrius the Fair was a King of Cyrene.
Demetrius the Fair (250 BC) was the king of Cyrene. 25 | -------------------------------------------------------------------------------- /language/totto/sample/output_sample.txt: -------------------------------------------------------------------------------- 1 | Colin Hanlon starred as Pete in The 12 in 2015. 2 | École Polytechnique has 4 Fields Medal winners. 3 | The New Hampshire census of 2010 reported that there were 7,230 people living in Swanzey. 4 | King Demetrius reigned over Cyrene. 5 | On October 10, 2012, The Nashville series premiered to over 8.93 million viewers. 6 | -------------------------------------------------------------------------------- /language/wino_dict/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /language/wino_dict/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py~=1.1.0 2 | nltk~=3.7 3 | spacy~=3.3.1 4 | tensorflow~=2.8.2 5 | tfds_nightly~=4.6.0 6 | -------------------------------------------------------------------------------- /language/xsp/data_utils/academic-prefix.txt: -------------------------------------------------------------------------------- 1 | create database academic; 2 | use academic; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/add_indices.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Adds indices to databases which require them.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import argparse 21 | import sqlite3 22 | 23 | 24 | def main(db_name): 25 | with open('data_utils/extra_' + db_name + '_indices.txt') as infile: 26 | indices = infile.read().split('\n') 27 | 28 | db = sqlite3.connect('databases/' + db_name + '.db') 29 | c = db.cursor() 30 | 31 | for index in indices: 32 | print('Adding index:') 33 | print(index) 34 | q = index 35 | c.execute(q) 36 | db.commit() 37 | 38 | db.close() 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--database_name', type=str, help='The database to add indices to.') 45 | args = parser.parse_args() 46 | main(args.database_name) 47 | -------------------------------------------------------------------------------- /language/xsp/data_utils/advising-prefix.txt: -------------------------------------------------------------------------------- 1 | create database advising; 2 | use advising; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/atis-prefix.txt: -------------------------------------------------------------------------------- 1 | create database atis; 2 | use atis; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_academic_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "author_oid" ON "author" ("oid"); 2 | CREATE INDEX IF NOT EXISTS "cite_cited" ON "cite" ("cited"); 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_imdb_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "cast_msid" ON "cast" ("msid"); 2 | CREATE INDEX IF NOT EXISTS "directed_by_did" ON "directed_by" ("did"); 3 | CREATE INDEX IF NOT EXISTS "directed_by_msid" ON "directed_by" ("msid"); 4 | CREATE INDEX IF NOT EXISTS "made_by_msid" ON "made_by" ("msid"); 5 | CREATE INDEX IF NOT EXISTS "made_by_pid" ON "made_by" ("pid"); 6 | CREATE INDEX IF NOT EXISTS "cast_aid" ON "cast" ("aid"); 7 | CREATE INDEX IF NOT EXISTS "actor_aid" ON "actor" ("aid"); 8 | CREATE INDEX IF NOT EXISTS "actor_gender" ON "actor" ("gender"); 9 | CREATE INDEX IF NOT EXISTS "movie_mid" ON "movie" ("mid"); 10 | -------------------------------------------------------------------------------- /language/xsp/data_utils/extra_scholar_indices.txt: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS "writes_authorId" ON "writes" ("authorId"); 2 | CREATE INDEX IF NOT EXISTS "writes_paperId" ON "writes" ("paperId"); 3 | DROP INDEX IF EXISTS "author_authorName"; 4 | CREATE INDEX IF NOT EXISTS "author_authorName" ON "author" ("authorName" collate nocase); 5 | DROP INDEX IF EXISTS "dataset_datasetName"; 6 | CREATE INDEX IF NOT EXISTS "dataset_datasetName" ON "dataset" ("datasetName" collate nocase); 7 | DROP INDEX IF EXISTS "journal_journalName"; 8 | CREATE INDEX IF NOT EXISTS "journal_journalName" ON "journal" ("journalName" collate nocase); 9 | DROP INDEX IF EXISTS "keyphrase_keyphraseName"; 10 | CREATE INDEX IF NOT EXISTS "keyphrase_keyphraseName" ON "keyphrase" ("keyphraseName" collate nocase); 11 | CREATE INDEX IF NOT EXISTS "paper_title" ON "paper" 
("title" collate nocase); 12 | CREATE INDEX IF NOT EXISTS "venue_venueName" ON "venue" ("venueName" collate nocase); 13 | -------------------------------------------------------------------------------- /language/xsp/data_utils/geoquery-prefix.txt: -------------------------------------------------------------------------------- 1 | create database geoquery; 2 | use geoquery; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/imdb-prefix.txt: -------------------------------------------------------------------------------- 1 | create database imdb; 2 | use imdb; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/scholar-prefix.txt: -------------------------------------------------------------------------------- 1 | create database scholar; 2 | use scholar; 3 | -------------------------------------------------------------------------------- /language/xsp/data_utils/yelp-prefix.txt: -------------------------------------------------------------------------------- 1 | create database yelp; 2 | use yelp; 3 | -------------------------------------------------------------------------------- /language/xsp/model/local_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_options": { 3 | "bert_vocab_path": "", 4 | "max_num_tokens": 512, 5 | "max_decode_length": 100 6 | }, 7 | "model_parameters": { 8 | "use_segment_ids": false, 9 | "use_foreign_key_features": false, 10 | "use_alignment_features": false, 11 | "pretrained_bert_dir": "", 12 | "source_embedding_dims": 128, 13 | "target_embedding_dims": 128, 14 | "encoder_dims": 128, 15 | "decoder_dims": 128, 16 | "max_decoder_relative_distance": 8, 17 | "num_decoder_layers": 2, 18 | "num_heads": 8, 19 | "decoder_ff_layer_hidden_size": 512 20 | }, 21 | "training_options": { 22 | "tpu_iterations_per_loop": 1000, 23 | "batch_size": 2, 24 | "training_steps": 1000, 25 | "layer_dropout_rate": 0.3, 26 | "optimizer_learning_rate": 0.00008, 27 | "optimizer_warmup_steps": 200, 28 | "freeze_pretrained_steps": 0, 29 | "after_restart_learning_rate": 0.00008 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /language/xsp/model/model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_options": { 3 | "bert_vocab_path": "", 4 | "max_num_tokens": 512, 5 | "max_decode_length": 100 6 | }, 7 | "model_parameters": { 8 | "use_segment_ids": false, 9 | "use_foreign_key_features": false, 10 | "use_alignment_features": false, 11 | "pretrained_bert_dir": "", 12 | "source_embedding_dims": 128, 13 | "target_embedding_dims": 128, 14 | "encoder_dims": 128, 15 | "decoder_dims": 128, 16 | "max_decoder_relative_distance": 8, 17 | "num_decoder_layers": 2, 18 | "num_heads": 8, 19 | "decoder_ff_layer_hidden_size": 512 20 | }, 21 | "training_options": { 22 | "tpu_iterations_per_loop": 1000, 23 | "batch_size": 32, 24 | "training_steps": 30000, 25 | "layer_dropout_rate": 0.3, 26 | "optimizer_learning_rate": 0.00008, 27 | "optimizer_warmup_steps": 5625, 28 | "freeze_pretrained_steps": 2100, 29 | "after_restart_learning_rate": 0.00008 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /language/xsp/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam==2.22.0 2 | sqlparse==0.3.1 3 | tf-slim==1.1.0 4 | 
timeout-decorator==0.4.1 5 | tqdm==4.49.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Install projects from the Language Team.""" 16 | import os 17 | from setuptools import find_packages 18 | from setuptools import setup 19 | 20 | 21 | def read(fname): 22 | with open(os.path.join(os.path.dirname(__file__), fname)) as f: 23 | return f.read() 24 | 25 | 26 | setup( 27 | name="language", 28 | version="0.0.1.dev", 29 | packages=find_packages(), 30 | description="Google AI Language.", 31 | long_description=read("README.md"), 32 | author="Google Inc.", 33 | url="https://github.com/google-research/language", 34 | license="Apache 2.0", 35 | install_requires=[ 36 | "tensorflow-gpu~=1.15.0", 37 | ], 38 | extras_require={ 39 | "consistent-zero-shot-nmt": [ 40 | "tensorflow-probability==0.6.0", 41 | "tensor2tensor==1.11.0", 42 | ], 43 | }) 44 | --------------------------------------------------------------------------------