├── .gitignore ├── README.md ├── data ├── atis │ ├── atis_dev.tsv │ ├── atis_extra.tsv │ ├── atis_lexicon.txt │ ├── atis_ontology.txt │ ├── atis_test.tsv │ └── atis_train.tsv ├── geo │ ├── geo_dev.tsv │ ├── geo_extra.tsv │ ├── geo_lexicon.txt │ ├── geo_test.tsv │ ├── geo_train.tsv │ └── train600.tsv └── overnight │ ├── basketball_extra.tsv │ ├── basketball_lexicon.txt │ ├── basketball_test.tsv │ ├── basketball_train.tsv │ ├── blocks_extra.tsv │ ├── blocks_lexicon.txt │ ├── blocks_test.tsv │ ├── blocks_train.tsv │ ├── calendar_extra.tsv │ ├── calendar_lexicon.txt │ ├── calendar_test.tsv │ ├── calendar_train.tsv │ ├── calendarplus_test.tsv │ ├── calendarplus_train.tsv │ ├── housing_extra.tsv │ ├── housing_lexicon.txt │ ├── housing_test.tsv │ ├── housing_train.tsv │ ├── publications_extra.tsv │ ├── publications_lexicon.txt │ ├── publications_test.tsv │ ├── publications_train.tsv │ ├── recipes_extra.tsv │ ├── recipes_lexicon.txt │ ├── recipes_test.tsv │ ├── recipes_train.tsv │ ├── restaurants_extra.tsv │ ├── restaurants_lexicon.txt │ ├── restaurants_test.tsv │ ├── restaurants_train.tsv │ ├── socialnetwork_extra.tsv │ ├── socialnetwork_lexicon.txt │ ├── socialnetwork_test.tsv │ └── socialnetwork_train.tsv ├── models ├── Beam.py ├── attention │ └── attention_rnn.py ├── construct_models.py ├── decoder │ ├── decoder_rnn.py │ └── decoder_rnn_pointer.py ├── dual_learning.py ├── embedding │ └── embedding_rnn.py ├── enc2dec │ └── state_transition.py ├── encoder │ └── encoder_rnn.py ├── encoder_decoder.py ├── generator │ ├── generator_naive.py │ └── generator_pointer.py ├── language_model.py ├── model_attn.py ├── model_attnptr.py ├── model_utils.py ├── penalties.py └── reward.py ├── pull_dependency.sh ├── requirements.txt ├── run ├── run_dual_learning.sh ├── run_language_model.sh ├── run_pseudo_method.sh ├── run_question_generation.sh └── run_semantic_parsing.sh ├── scripts ├── dual_learning.py ├── language_model.py ├── pseudo_method.py ├── question_generation.py └── 
semantic_parsing.py └── utils ├── batch.py ├── bleu.py ├── constants.py ├── domain ├── atis_evaluator.py ├── domain_atis.py ├── domain_base.py ├── domain_geo.py └── domain_overnight.py ├── example.py ├── gpu.py ├── hyperparam.py ├── lexicon.py ├── logger.py ├── loss.py ├── optimizer.py ├── seed.py ├── solver ├── solver_base.py ├── solver_dual_learning.py ├── solver_language_model.py ├── solver_pseduo_method.py ├── solver_question_generation.py └── solver_semantic_parsing.py ├── statistics.py ├── vocab.py └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | exp 2 | tmp 3 | log 4 | lib 5 | state 6 | evaluator 7 | data/.cache 8 | data/*/*_vocab.* 9 | */*/*/__pycache__ 10 | */*/__pycache__ 11 | */__pycache__ 12 | */*/*/*.pyc 13 | */*/*.pyc 14 | */*.pyc 15 | module-classes.txt 16 | .ipynb_checkpoints 17 | *.ipynb 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semantic Parsing with Dual Learning 2 | 3 | This repository contains source code and data for the ACL 2019 Long Paper ["Semantic Parsing with Dual Learning"](https://www.aclweb.org/anthology/P19-1007.pdf). 
4 | 5 | If you use our framework in your work, please cite it as follows: 6 | 7 | @inproceedings{cao-etal-2019-semantic, 8 | title = "Semantic Parsing with Dual Learning", 9 | author = "Cao, Ruisheng and 10 | Zhu, Su and 11 | Liu, Chen and 12 | Li, Jieyu and 13 | Yu, Kai", 14 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", 15 | month = jul, 16 | year = "2019", 17 | address = "Florence, Italy", 18 | publisher = "Association for Computational Linguistics", 19 | url = "https://www.aclweb.org/anthology/P19-1007", 20 | doi = "10.18653/v1/P19-1007", 21 | pages = "51--64" 22 | } 23 | 24 | ---- 25 | 26 | ## Setup 27 | 28 | * First, create the environment 29 | 30 | conda create -n sp python=3.6 31 | source activate sp 32 | pip3 install -r requirements.txt 33 | 34 | * Second, pull all the dependencies from remote repository, including `evaluator`, `lib` and `glove6B word embeddings`. 35 | 36 | ./pull_dependency.sh 37 | 38 | * Construct vocabulary for all datasets in advance under corresponding directory `data`, run 39 | 40 | python3 utils/statistics.py 41 | 42 | ---- 43 | 44 | ## **Dataset** 45 | 46 | ---- 47 | 48 | Experiments are conducted on two semantic parsing datasets **ATIS** and **OVERNIGHT**, including traditional __train__, __dev__ and __test__ files, plus elaborated __lexicon__ files for *entity mapping* and *reverse entity mapping* techniques, and __extra__ files for synthesized unlabeled logical forms. An additional ontology file is created for dataset **ATIS** since there is no evaluator available. 49 | 50 | ---- 51 | 52 | ### **ATIS** 53 | 54 | Files: 55 | 56 | - *atis_train.tsv*: training dataset, 4433 samples. 57 | - *atis_dev.tsv*: validation dataset, 491 samples. 58 | - *atis_test.tsv*: test dataset, 448 samples. 59 | - *atis_extra.tsv*: synthesized logical forms (Lambda Calculus), 3797 samples. 
60 | - *atis_lexicon.txt*: each line specifies a one-to-one mapping between a natural language noun phrase and its corresponding entity representation in knowledge base, such as pair `(first class, first:cl)`. 61 | - *atis_ontology.txt*: specify all the entity types, unary, and binary predicates used in the logical form. 62 | 63 | **Attention**: Since there is no evaluator for this domain, we provide a simple type consistency checker for the target logical form (`utils/domain/atis_evaluator.py`). *atis_train.tsv*, *atis_dev.tsv* and *atis_test.tsv* are the preprocessed versions provided by [Dong and Lapata (2016)](https://www.aclweb.org/anthology/P16-1004.pdf), where natural language queries are lowercased and stemmed with NLTK, and entity mentions are replaced by numbered markers. For example: 64 | 65 | flight from ci0 to ci1 ( lambda $0 e ( and ( flight $0 ) ( from $0 ci0 ) ( to $0 ci1 ) ) ) 66 | 67 | ---- 68 | 69 | ### **OVERNIGHT** 70 | 71 | It contains eight sub-domains in total, namely *basketball*, *blocks*, *calendar*, *housing*, *publications*, *recipes*, *restaurants* and *socialnetwork*. 72 | 73 | - *[domain]_train.tsv*: training and dev dataset. There is no isolated validation dataset in **OVERNIGHT**. We follow the traditional 80%/20% (train/dev) split in experiments. 74 | - *[domain]_test.tsv*: test dataset. 75 | - *[domain]_extra.tsv*: synthesized logical forms (Lambda DCS). We revise the template rules in [SEMPRE](https://github.com/percyliang/sempre) to generate new instances. 76 | - *[domain]_lexicon.txt*: each line specifies a one-to-one mapping between a natural language noun phrase and its corresponding entity representation in knowledge base, such as pair `(kobe bryant, en.player.kobe_bryant 77 | )`. 78 | 79 | **Attention**: There is also an evaluator program provided by [Jia and Liang (2016)](https://www.aclweb.org/anthology/P16-1002.pdf) in each domain to obtain denotations (`utils/domain/domain_overnight.py`). 
Each sample in *[domain]_train.tsv* and *[domain]_test.tsv* is of the form: 80 | 81 | what player did not play point guard ( call SW.listValue ( call SW.getProperty ( ( lambda s ( call SW.filter ( var s ) ( string position ) ( string ! = ) en.position.point_guard ) ) ( call SW.domain ( string player ) ) ) ( string player ) ) ) 82 | 83 | ---- 84 | 85 | ## Experiments 86 | 87 | ---- 88 | 89 | ### Semantic Parsing (Supervised|Pretrain) 90 | 91 | Refer to script in `run/run_semantic_parsing.sh`, for example 92 | 93 | ./run/run_semantic_parsing.sh dataset_name [attn|attnptr] labeled 94 | 95 | `dataset_name` must be in choices `[atis, basketball, blocks, calendar, housing, publications, recipes, restaurants, socialnetwork]` and `labeled` denotes the ratio of labeled examples in training set we are going to use. 96 | 97 | ---- 98 | 99 | ### Question Generation (Supervised|Pretrain) 100 | 101 | The procedure is similar to that of Semantic Parsing since we use similar model architecture. 102 | 103 | ./run/run_question_generation.sh dataset_name [attn|attnptr] labeled 104 | 105 | ---- 106 | 107 | ### Language Model (Unsupervised|Pretrain) 108 | 109 | Language model is used to calculate the validity reward during the closed cycle. 110 | 111 | ./run/run_language_model.sh dataset_name [question|logical_form] 112 | 113 | ---- 114 | 115 | ### Pseudo Method (Semi-supervised) 116 | 117 | Use pretrained models of Semantic Parsing and Question Generation to generate pseudo samples. Then shuffle these pseudo samples with labeled samples together to train an improved Semantic Parsing and Question Generation Model. 118 | 119 | ./run/run_pseudo_method.sh dataset_name [attn|attnptr] labeled 120 | 121 | **Attention:** in the script `run/run_pseudo_method.sh`, `read_sp_model_path` and `read_qg_model_path` are paths to the pretrained models(semantic parsing and question generation). `labeled` and `seed` should be kept the same for both the pretraining phases and pseudo method. 
By default, model type (attn/attnptr) is the same for both semantic parsing and question generation models. 122 | 123 | ---- 124 | 125 | ### Dual Learning (Semi-supervised) 126 | 127 | Use pretrained models of semantic parsing, question generation and language models to form two closed cycles with different starting points. Combine dual reinforcement learning algorithm and supervised training together. Running script: 128 | 129 | ./run/run_dual_learning.sh dataset_name [attn|attnptr] labeled 130 | 131 | **Attention:** in the script `run/run_dual_learning.sh`, `read_sp_model_path`, `read_qg_model_path`, `read_qlm_path` and `read_lflm_path` are paths to the pretrained models(semantic parsing, question generation, question language model and logical form language model). `labeled` and `seed` should be kept the same for both the pretraining phases and dual learning framework. By default, model type (attn/attnptr) is the same for both semantic parsing and question generation models. 132 | -------------------------------------------------------------------------------- /data/atis/atis_lexicon.txt: -------------------------------------------------------------------------------- 1 | ci0 :- NP : ci0 2 | ci1 :- NP : ci1 3 | ci2 :- NP : ci2 4 | ap0 :- NP : ap0 5 | ap1 :- NP : ap1 6 | da0 :- NP : da0 7 | da1 :- NP : da1 8 | da2 :- NP : da2 9 | da3 :- NP : da3 10 | da4 :- NP : da4 11 | al0 :- NP : al0 12 | al1 :- NP : al1 13 | al2 :- NP : al2 14 | ti0 :- NP : ti0 15 | ti1 :- NP : ti1 16 | dn0 :- NP : dn0 17 | dn1 :- NP : dn1 18 | mn0 :- NP : mn0 19 | mn1 :- NP : mn1 20 | fn0 :- NP : fn0 21 | fn1 :- NP : fn1 22 | ac0 :- NP : ac0 23 | fb0 :- NP : fb0 24 | fb1 :- NP : fb1 25 | yr0 :- NP : yr0 26 | st0 :- NP : st0 27 | morn :- NP : morning:pd 28 | am :- NP : morning:pd 29 | afternoon :- NP : afternoon:pd 30 | even :- NP : evening:pd 31 | earli :- NP : early:pd 32 | late :- NP : late:pd 33 | night :- NP : late:pd 34 | late night :- NP : late_night:pd 35 | late even :- NP : 
late_evening:pd 36 | late in the even :- NP : late_evening:pd 37 | pm :- NP : pm:pd 38 | day time :- NP : daytime:pd 39 | mealtim :- NP : mealtime:pd 40 | breakfast :- NP : breakfast:me 41 | dinner :- NP : dinner:me 42 | lunch :- NP : lunch:me 43 | snack :- NP : snack:me 44 | dure breakfast :- NP : breakfast:pd 45 | ground transport :- NP : ground_transport 46 | rental car :- NP : rental_car 47 | rental a car :- NP : rental_car 48 | car rental :- NP : rental_car 49 | car :- NP : rental_car 50 | limousin :- NP : limousine 51 | limo :- NP : limousine 52 | air taxi oper :- NP : air_taxi_operation 53 | first class :- NP : first:cl 54 | coach :- NP : coach:cl 55 | busi :- NP : business:cl 56 | busi class :- NP : business:cl 57 | thrift :- NP : thrift:cl 58 | thrift economi :- NP : thrift:cl 59 | economi thrift :- NP : thrift:cl 60 | daili :- NP : daily 61 | economi :- NP : economy 62 | cheap :- NP : economy 63 | one way :- NP : oneway 64 | oneway :- NP : oneway 65 | round trip :- NP : round_trip 66 | no stopov :- NP : nonstop 67 | direct :- NP : nonstop 68 | connect :- NP : connecting 69 | boe :- NP : boeing:mf 70 | one :- NP : 1:i 71 | two :- NP : 2:i 72 | 3 :- NP : 3:i 73 | 9 hour :- NP : 9:hr 74 | minimum connect time :- NP : minimum_connection_time 75 | day after tomorrow :- NP : day_after_tomorrow 76 | distanc :- NP : miles_distant 77 | discount :- NP : discounted 78 | time zone :- NP : time_zone_code -------------------------------------------------------------------------------- /data/atis/atis_ontology.txt: -------------------------------------------------------------------------------- 1 | entity: dn no 2 | entity: fb yes 3 | entity: me no 4 | entity: pd no 5 | entity: mf no 6 | entity: ti yes 7 | entity: st no 8 | entity: rc yes 9 | entity: al no 10 | entity: ci no 11 | entity: do yes 12 | entity: dc no 13 | entity: fn no 14 | entity: cl no 15 | entity: ac no 16 | entity: yr no 17 | entity: i yes 18 | entity: hr yes 19 | entity: mn no 20 | entity: ap no 21 | 
entity: da no 22 | unary: has_stops 23 | unary: air_taxi_operation 24 | cat: aircraft_code:t ac 25 | unary: connecting 26 | unary: jet 27 | unary: nonstop 28 | unary: day_after_tomorrow 29 | unary: rapid_transit 30 | unary: tomorrow 31 | unary: class_of_service 32 | unary: rental_car 33 | unary: overnight 34 | unary: airport 35 | cat: airport ap 36 | unary: fare_basis_code 37 | unary: fare 38 | unary: tomorrow_arrival 39 | unary: airline 40 | cat: airline al 41 | unary: time_zone_code 42 | cat: time_zone_code fb 43 | cat: time_zone_code rc 44 | unary: miles_distant 45 | unary: has_meal 46 | unary: economy 47 | unary: taxi 48 | unary: city 49 | cat: city ci 50 | unary: discounted 51 | unary: airline_name 52 | unary: meal:t 53 | unary: today 54 | unary: limousine 55 | unary: restriction_code 56 | cat: restriction_code fb 57 | cat: restriction_code rc 58 | unary: meal_code 59 | unary: ground_transport 60 | unary: aircraft 61 | unary: turboprop 62 | unary: tonight 63 | unary: daily 64 | unary: round_trip 65 | unary: weekday 66 | unary: flight 67 | unary: booking_class:t 68 | unary: oneway 69 | binary: type:al services type:ci 70 | binary: type:al services type:ap 71 | binary: type:flight fare type:do 72 | binary: type:flight fare type:fb 73 | binary: type:fb fare type:do 74 | binary: type:flight cost type:do 75 | binary: type:flight day_return type:da 76 | binary: type:flight approx_return_time type:ti 77 | binary: type:flight day_number_return type:dn 78 | binary: type:flight class_type type:cl 79 | binary: type:flight month_arrival type:mn 80 | binary: type:flight stop type:ci 81 | binary: type:flight stop type:ap 82 | binary: type:flight flight_number type:fn 83 | binary: type:flight month_return type:mn 84 | binary: type:flight approx_arrival_time type:ti 85 | binary: type:flight stop_arrival_time type:ti 86 | binary: type:flight day_arrival type:da 87 | binary: type:flight aircraft_code type:ac 88 | binary: type:flight after_day type:da 89 | binary: type:flight 
meal type:me 90 | binary: type:flight arrival_month type:mn 91 | binary: type:flight day_number_arrival type:dn 92 | binary: type:flight arrival_time type:ti 93 | binary: type:flight next_days type:i 94 | binary: type:flight manufacturer type:mf 95 | binary: type:flight before_day type:da 96 | binary: type:flight minutes_distant type:i 97 | binary: type:flight capacity type:i 98 | binary: type:ac capacity type:i 99 | binary: type:mf capacity type:i 100 | binary: type:flight stops type:i 101 | binary: type:flight to type:ci 102 | binary: type:flight to type:ap 103 | binary: type:flight to type:st 104 | binary: type:flight time_elapsed type:hr 105 | binary: type:flight year type:yr 106 | binary: type:flight booking_class type:fb 107 | binary: type:flight booking_class type:cl 108 | binary: type:flight from type:ci 109 | binary: type:flight from type:ap 110 | binary: type:flight airport type:ap 111 | binary: type:flight month type:mn 112 | binary: type:flight day_number type:dn 113 | binary: type:flight fare_basis_code type:fb 114 | binary: type:flight ground_fare type:do 115 | binary: type:ap loc:t type:ci 116 | binary: type:ci loc:t type:fb 117 | binary: type:ap loc:t type:st 118 | binary: type:flight approx_departure_time type:ti 119 | binary: type:al named type:al 120 | binary: type:flight to_city type:ci 121 | binary: type:flight minimum_connection_time type:ti 122 | binary: type:ap minimum_connection_time type:ti 123 | binary: type:flight departure_time type:ti 124 | binary: type:flight airline type:al 125 | binary: type:flight airline:e type:al 126 | binary: type:flight from_airport type:ap 127 | binary: type:flight day type:da 128 | binary: type:flight during_day_arrival type:pd 129 | binary: type:flight during_day type:pd 130 | binary: type:flight days_from_today type:i 131 | binary: type:flight aircraft type:ac 132 | binary: type:al abbrev type:al 133 | -------------------------------------------------------------------------------- /data/geo/geo_extra.tsv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhythmcao/semantic-parsing-dual/bd419acf391a9f5fe9eeedbcf7799b350138a6da/data/geo/geo_extra.tsv -------------------------------------------------------------------------------- /data/geo/geo_lexicon.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhythmcao/semantic-parsing-dual/bd419acf391a9f5fe9eeedbcf7799b350138a6da/data/geo/geo_lexicon.txt -------------------------------------------------------------------------------- /data/overnight/basketball_lexicon.txt: -------------------------------------------------------------------------------- 1 | kobe bryant :- NP : en.player.kobe_bryant 2 | kobe :- NP : en.player.kobe_bryant 3 | kobe bryants :- NP : en.player.kobe_bryant 4 | kob bryant :- NP : en.player.kobe_bryant 5 | kobe bryan :- NP : en.player.kobe_bryant 6 | lebron james :- NP : en.player.lebron_james 7 | la laker :- NP : en.team.lakers 8 | la lakers :- NP : en.team.lakers 9 | los angeles lakers :- NP : en.team.lakers 10 | lakers :- NP : en.team.lakers 11 | laker :- NP : en.team.lakers 12 | cavaliers :- NP : en.team.cavaliers 13 | cleveland cavaliers :- NP : en.team.cavaliers 14 | point guards :- NP : en.position.point_guard 15 | point guard :- NP : en.position.point_guard 16 | forward :- NP : en.position.forward 17 | forwards :- NP : en.position.forward 18 | -------------------------------------------------------------------------------- /data/overnight/blocks_lexicon.txt: -------------------------------------------------------------------------------- 1 | block :- NP : en.block 2 | block 1 :- NP : en.block.block1 3 | brick 1 :- NP : en.block.block1 4 | block 1s :- NP : en.block.block1 5 | block one :- NP : en.block.block1 6 | block 2 :- NP : en.block.block2 7 | brick 2 :- NP : en.block.block2 8 | block two :- NP : en.block.block2 9 | pyramid :- NP : en.shape.pyramid 10 | 
pyramidshaped :- NP : en.shape.pyramid 11 | pyramidshape :- NP : en.shape.pyramid 12 | cube :- NP : en.shape.cube 13 | cube shaped :- NP : en.shape.cube 14 | inch :- NP : en.inch 15 | inches :- NP : en.inch 16 | 17 | -------------------------------------------------------------------------------- /data/overnight/calendar_lexicon.txt: -------------------------------------------------------------------------------- 1 | meeting :- NP : en.meeting 2 | meetings :- NP : en.meeting 3 | weekly standup :- NP : en.meeting.weekly_standup 4 | weekly stand up :- NP : en.meeting.weekly_standup 5 | weekly roundup :- NP : en.meeting.weekly_standup 6 | weekly startup :- NP : en.meeting.weekly_standup 7 | week startup :- NP : en.meeting.weekly_standup 8 | annual review :- NP : en.meeting.annual_review 9 | person :- NP : en.person 10 | alice :- NP : en.person.alice 11 | bob :- NP : en.person.bob 12 | greenberg cafe :- NP : en.location.greenberg_cafe 13 | central office :- NP : en.location.central_office 14 | -------------------------------------------------------------------------------- /data/overnight/housing_lexicon.txt: -------------------------------------------------------------------------------- 1 | 123 sesame street :- NP : en.housing_unit.123_sesame_street 2 | 123 sesame st :- NP : en.housing_unit.123_sesame_street 3 | 900 mission avenue :- NP : en.housing_unit.900_mission_ave 4 | 900 mission ave :- NP : en.housing_unit.900_mission_ave 5 | dollar :- NP : en.dollar 6 | dollars :- NP : en.dollar 7 | square :- NP : en.square_feet 8 | square feet :- NP : en.square_feet 9 | square foot :- NP : en.square_feet 10 | midtown west :- NP : en.neighborhood.midtown_west 11 | midtown east :- NP : en.neighborhood.midtown_west 12 | west midtown :- NP : en.neighborhood.midtown_west 13 | chelsea :- NP : en.neighborhood.chelsea 14 | apartment :- NP : en.housing.apartment 15 | aprtment :- NP : en.housing.apartment 16 | apartments :- NP : en.housing.apartment 17 | condo :- NP : en.housing.condo 
18 | condos :- NP : en.housing.condo 19 | -------------------------------------------------------------------------------- /data/overnight/publications_lexicon.txt: -------------------------------------------------------------------------------- 1 | article :- NP : en.article 2 | articles :- NP : en.article 3 | multivariate data analysis :- NP : en.article.multivariate_data_analysis 4 | person :- NP : en.person 5 | persons :- NP : en.person 6 | efron :- NP : en.person.efron 7 | efrons :- NP : en.person.efron 8 | lakoff :- NP : en.person.lakoff 9 | annals of statistics :- NP : en.venue.annals_of_statistics 10 | computational linguistics :- NP : en.venue.computational_linguistics 11 | -------------------------------------------------------------------------------- /data/overnight/recipes_lexicon.txt: -------------------------------------------------------------------------------- 1 | recipe :- NP : en.recipe 2 | recipes :- NP : en.recipe 3 | rice pudding :- NP : en.recipe.rice_pudding 4 | rice puddings :- NP : en.recipe.rice_pudding 5 | quiche :- NP : en.recipe.quiche 6 | quice :- NP : en.recipe.quiche 7 | ingredient :- NP : en.ingredient 8 | ingredients :- NP : en.ingredient 9 | milk :- NP : en.ingredient.milk 10 | spinach :- NP : en.ingredient.spinach 11 | lunch :- NP : en.meal.lunch 12 | dinner :- NP : en.meal.dinner 13 | supper :- NP : en.meal.dinner 14 | -------------------------------------------------------------------------------- /data/overnight/restaurants_lexicon.txt: -------------------------------------------------------------------------------- 1 | restaurant :- NP : en.restaurant 2 | restaurants :- NP : en.restaurant 3 | thai cafe :- NP : en.restaurant.thai_cafe 4 | pizzeria juno :- NP : en.restaurant.pizzeria_juno 5 | pizzeria :- NP : en.restaurant.pizzeria_juno 6 | stars :- NP : en.star 7 | star :- NP : en.star 8 | dollar sign :- NP : en.dollar_sign 9 | dollar signs :- NP : en.dollar_sign 10 | reviews :- NP : en.review 11 | midtown west :- NP : 
en.neighborhood.midtown_west 12 | west midtown :- NP : en.neighborhood.midtown_west 13 | chelsea :- NP : en.neighborhood.chelsea 14 | thai :- NP : en.cuisine.thai 15 | italian :- NP : en.cuisine.italian 16 | lunch :- NP : en.food.lunch 17 | dinner :- NP : en.food.dinner 18 | -------------------------------------------------------------------------------- /data/overnight/socialnetwork_lexicon.txt: -------------------------------------------------------------------------------- 1 | person :- NP : en.person 2 | persons :- NP : en.person 3 | alice :- NP : en.person.alice 4 | bob :- NP : en.person.bob 5 | alices :- NP : en.person.alice 6 | bobs :- NP : en.person.bob 7 | male :- NP : en.gender.male 8 | males :- NP : en.gender.male 9 | female :- NP : en.gender.female 10 | females :- NP : en.gender.female 11 | single :- NP : en.relationship_status.single 12 | singles :- NP : en.relationship_status.single 13 | singlestatus :- NP : en.relationship_status.single 14 | married :- NP : en.relationship_status.married 15 | city :- NP : en.city 16 | cities :- NP : en.city 17 | citys :- NP : en.city 18 | new york :- NP : en.city.new_york 19 | new yorks :- NP : en.city.new_york 20 | newyork :- NP : en.city.new_york 21 | beijing :- NP : en.city.beijing 22 | cm :- NP : en.cm 23 | brown university :- NP : en.university.brown 24 | ucla :- NP : en.university.ucla 25 | ucla university :- NP : en.university.ucla 26 | computer science :- NP : en.field.computer_science 27 | mckinsey :- NP : en.company.mckinsey 28 | mckinseys :- NP : en.company.mckinsey 29 | google :- NP : en.company.google 30 | software engineer :- NP : en.job_title.software_engineer 31 | software engineers :- NP : en.job_title.software_engineer 32 | program manager :- NP : en.job_title.program_manager 33 | program managers :- NP : en.job_title.program_manager 34 | -------------------------------------------------------------------------------- /models/Beam.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | from models import penalties 4 | from utils.constants import * 5 | 6 | class Beam(object): 7 | """ 8 | Class for managing the internals of the beam search process. 9 | Takes care of beams, back pointers, and scores. (Revised from OpenNMT.) 10 | @args: 11 | size (int): beam size 12 | vocab (dict): contains indices of padding, beginning, and ending. 13 | min_length (int): minimum length to generate 14 | global_scorer (:obj:`GlobalScorer`) 15 | device (torch.device) 16 | """ 17 | 18 | def __init__(self, size, vocab, min_length=2, 19 | global_scorer=None, device=None): 20 | 21 | self.size = size 22 | self.device = device 23 | # The score for each translation on the beam. 24 | self.scores = torch.zeros(size, dtype=torch.float, device=self.device) 25 | 26 | # The backpointers at each time-step. 27 | self.prev_ks = [] 28 | 29 | # The outputs at each time-step. 30 | self.next_ys = [torch.zeros(size, dtype=torch.long, device=self.device).fill_(vocab[PAD])] 31 | self.next_ys[0][0] = vocab[BOS] 32 | 33 | # Has EOS topped the beam yet. 34 | self._eos = vocab[EOS] 35 | self.eos_top = False 36 | 37 | # Other special symbols 38 | self._bos = vocab[BOS] 39 | self._pad = vocab[PAD] 40 | 41 | # Time and k pair for finished. 42 | self.finished = [] 43 | 44 | # Information for global scoring. 45 | self.global_scorer = global_scorer 46 | 47 | # Minimum prediction length 48 | self.min_length = min_length 49 | 50 | def get_current_state(self): 51 | "Get the outputs for the current timestep." 52 | return self.next_ys[-1] 53 | 54 | def get_current_origin(self): 55 | "Get the backpointers for the current timestep." 56 | return self.prev_ks[-1] 57 | 58 | def advance(self, word_probs): 59 | """ 60 | Given prob over words for every last beam `K x vocab` and update the beam. 
61 | 62 | Parameters: 63 | 64 | * `word_probs`- probs of advancing from the last step (K x words) 65 | 66 | Returns: True if beam search is complete. 67 | """ 68 | num_words = word_probs.size(1) 69 | # force the output to be longer than self.min_length 70 | cur_len = len(self.next_ys) 71 | masks = torch.zeros(word_probs.size(), requires_grad=False, dtype=torch.float, device=self.device) 72 | masks[:, self._bos] = 1e20 73 | masks[:, self._pad] = 1e20 # prevent generate symbol 74 | if cur_len < self.min_length: 75 | masks[:, self._eos] = 1e20 # prevent terminate too early 76 | word_probs = word_probs - masks 77 | 78 | # Sum the previous scores. 79 | if len(self.prev_ks) > 0: 80 | beam_scores = word_probs + self.scores.unsqueeze(1) 81 | # Don't let EOS have children. 82 | masks = torch.zeros(beam_scores.size(), requires_grad=False, dtype=torch.float, device=self.device) 83 | for i in range(self.next_ys[-1].size(0)): 84 | if self.next_ys[-1][i] == self._eos: 85 | masks[i] = 1e20 86 | beam_scores = beam_scores - masks 87 | else: 88 | beam_scores = word_probs[0] # only start from , not 89 | flat_beam_scores = beam_scores.contiguous().view(-1) 90 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, True, True) 91 | 92 | self.scores = best_scores 93 | 94 | # best_scores_id is flattened beam x word array, so calculate which 95 | # word and beam each score came from 96 | prev_k = best_scores_id / num_words 97 | self.prev_ks.append(prev_k) 98 | self.next_ys.append((best_scores_id - prev_k * num_words)) 99 | 100 | # check whether some sequence has terminated 101 | for i in range(self.next_ys[-1].size(0)): 102 | if self.next_ys[-1][i] == self._eos: 103 | global_scores = self.global_scorer.score(self, self.scores) # normalize score by length penalty 104 | rank_s, s = global_scores[i], self.scores[i] 105 | self.finished.append(([rank_s, s], len(self.next_ys) - 1, i)) 106 | 107 | # End condition is when top-of-beam is EOS and no global score. 
108 | if self.next_ys[-1][0] == self._eos: 109 | self.eos_top = True 110 | return self.done() 111 | 112 | def done(self): 113 | return self.eos_top and len(self.finished) >= self.size 114 | 115 | def sort_best(self): 116 | """ 117 | Sort the current beam. 118 | """ 119 | return torch.sort(self.scores, 0, True) # beam size 120 | 121 | def sort_finished(self, minimum=None): 122 | if minimum is not None: 123 | i = 0 124 | # Add from beam until we have minimum outputs. 125 | while len(self.finished) < minimum: 126 | global_scores = self.global_scorer.score(self, self.scores) 127 | rank_s, s = global_scores[i], self.scores[i] 128 | self.finished.append(([rank_s, s], len(self.next_ys) - 1, i)) 129 | i += 1 130 | 131 | self.finished.sort(key=lambda a: -a[0][0]) 132 | scores = [sc[1] for sc, _, _ in self.finished] 133 | ks = [(t, k) for _, t, k in self.finished] 134 | return scores, ks 135 | 136 | def get_temporary_hyp(self, k): 137 | """ 138 | Get current hypotheses of rank k ( 0 <= rank <= beam_size-1 ). 139 | """ 140 | hyp = [] 141 | for j in range(len(self.prev_ks) - 1, -1, -1): 142 | hyp.append(self.next_ys[j + 1][k]) 143 | k = self.prev_ks[j][k] 144 | return torch.stack(hyp[::-1]) 145 | 146 | def get_hyp(self, timestep, k): 147 | """ 148 | Walk back to construct the full hypothesis. 
class Attention(nn.Module):
    """
    Attention over encoder hidden states, either bilinear ('general',
    score = dec^T W enc) or additive ('feedforward',
    score = v^T tanh(W [dec; enc])).
    """

    METHODS = ['general', 'feedforward']

    def __init__(self, enc_dim, dec_dim, method='feedforward'):
        super(Attention, self).__init__()
        self.enc_dim, self.dec_dim = enc_dim, dec_dim
        assert method in Attention.METHODS
        self.method = method
        if self.method == 'general':
            self.Wa = nn.Linear(self.enc_dim, self.dec_dim, bias=False)
        else:
            self.Wa = nn.Linear(self.enc_dim + self.dec_dim, self.dec_dim, bias=False)
            self.Va = nn.Linear(self.dec_dim, 1, bias=False)

    def forward(self, hiddens, decoder_state, masks):
        '''
        hiddens : bsize x src_lens x enc_dim
        decoder_state : bsize x dec_dim
        masks : bsize x src_lens, ByteTensor (nonzero = real token, 0 = padding)
        @return:
            context : bsize x 1 x enc_dim
            a : normalized coefficient, bsize x src_lens
        '''
        if self.method == 'general':
            keys = self.Wa(hiddens)  # bsize x src_len x dec_dim
            scores = torch.bmm(keys, decoder_state.unsqueeze(-1)).squeeze(dim=-1)
        else:
            # broadcast the decoder state along the source axis (view, no copy)
            query = decoder_state.unsqueeze(dim=1).expand(-1, hiddens.size(1), -1)
            scores = self.Va(torch.tanh(self.Wa(torch.cat([query, hiddens], dim=-1)))).squeeze(dim=-1)
        # padded positions receive -inf so softmax gives them zero weight
        scores = scores.masked_fill(masks == 0, -float('inf'))
        weights = F.softmax(scores, dim=1)
        context = torch.bmm(weights.unsqueeze(dim=1), hiddens)
        return context, weights
def construct_attnptr(
    src_vocab=None, tgt_vocab=None, src_unk_idx=1, tgt_unk_idx=1, pad_src_idxs=[0], pad_tgt_idxs=[0],
    src_emb_size=100, tgt_emb_size=100, hidden_dim=200, bidirectional=True, num_layers=1,
    cell='lstm', dropout=0.5, init=None, **kargs
):
    """
    Construct Seq2Seq model with attention mechanism and pointer network.

    Returns an AttnPtrModel; when `init` is truthy, every parameter is drawn
    from U(-init, init) and the padding embedding rows are reset to zero.
    """
    directions = 2 if bidirectional else 1
    # sub-modules are built in the same fixed order as construct_attn so that
    # seeded random initialization stays reproducible
    transition = StateTransition(num_layers, cell=cell, bidirectional=bidirectional, hidden_dim=hidden_dim)
    attention = Attention(hidden_dim * directions, hidden_dim)
    source_embed = RNNEmbeddings(src_emb_size, src_vocab, src_unk_idx, pad_token_idxs=pad_src_idxs, dropout=dropout)
    encoder_net = RNNEncoder(src_emb_size, hidden_dim, num_layers, cell=cell, bidirectional=bidirectional, dropout=dropout)
    target_embed = RNNEmbeddings(tgt_emb_size, tgt_vocab, tgt_unk_idx, pad_token_idxs=pad_tgt_idxs, dropout=dropout)
    decoder_net = RNNDecoderPointer(tgt_emb_size, hidden_dim, num_layers, attn=attention, cell=cell, dropout=dropout)
    generator_net = GeneratorPointer(tgt_emb_size, tgt_vocab, dropout=dropout)
    model = AttnPtrModel(source_embed, encoder_net, target_embed, decoder_net, transition, generator_net)

    if init:
        for param in model.parameters():
            param.data.uniform_(-init, init)
        # padding rows must start at zero regardless of the uniform init above
        for idx in pad_src_idxs:
            model.src_embed.embed.weight.data[idx].zero_()
        for idx in pad_tgt_idxs:
            model.tgt_embed.embed.weight.data[idx].zero_()
    return model
8 | """ 9 | def __init__(self, tgt_emb_size, hidden_dim, num_layers, attn, cell="lstm", dropout=0.5): 10 | super(RNNDecoder, self).__init__() 11 | self.tgt_emb_size = tgt_emb_size 12 | self.hidden_dim = hidden_dim 13 | self.num_layers = num_layers 14 | self.dropout = dropout if self.num_layers > 1 else 0 15 | self.cell = cell.upper() 16 | self.rnn_decoder = getattr(nn, self.cell)(self.tgt_emb_size, self.hidden_dim, 17 | num_layers=self.num_layers, bidirectional=False, batch_first=True, dropout=self.dropout) 18 | self.attn = attn 19 | self.affine = nn.Linear(self.hidden_dim + self.attn.enc_dim, self.tgt_emb_size) 20 | self.dropout_layer = nn.Dropout(p=dropout) 21 | 22 | def forward(self, x, hidden_states, memory, src_mask, copy_tokens=None): 23 | """ 24 | x: decoder input embeddings, bsize x tgt_len x emb_size 25 | hidden_states: previous decoder state 26 | memory: encoder output, bsize x src_len x hidden_dim*2 27 | src_mask: bsize x src_lens 28 | copy_tokens: to be compatible with pointer network 29 | """ 30 | out, hidden_states = self.rnn_decoder(x, hidden_states) 31 | context = [] 32 | for i in range(out.size(1)): 33 | tmp_context, _ = self.attn(memory, out[:, i, :], src_mask) 34 | context.append(tmp_context) 35 | context = torch.cat(context, dim=1) 36 | feats = torch.cat([out, context], dim=-1) 37 | feats = self.affine(self.dropout_layer(feats)) 38 | return feats, hidden_states 39 | -------------------------------------------------------------------------------- /models/decoder/decoder_rnn_pointer.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import torch 3 | import torch.nn as nn 4 | 5 | class RNNDecoderPointer(nn.Module): 6 | """ 7 | Generic unidirectional RNN layers containing StateTransition and Attention modules. 
8 | """ 9 | def __init__(self, tgt_emb_size, hidden_dim, num_layers, attn, cell="lstm", dropout=0.5): 10 | super(RNNDecoderPointer, self).__init__() 11 | self.tgt_emb_size = tgt_emb_size 12 | self.hidden_dim = hidden_dim 13 | self.num_layers = num_layers 14 | self.dropout = dropout if self.num_layers > 1 else 0 15 | self.cell = cell.upper() 16 | self.rnn_decoder = getattr(nn, self.cell)(self.tgt_emb_size, self.hidden_dim, 17 | num_layers=self.num_layers, bidirectional=False, batch_first=True, dropout=self.dropout) 18 | self.attn = attn 19 | self.gate = nn.Linear(self.hidden_dim + self.attn.enc_dim + self.tgt_emb_size, 1) 20 | self.affine = nn.Linear(self.hidden_dim + self.attn.enc_dim, self.tgt_emb_size) 21 | self.dropout_layer = nn.Dropout(p=dropout) 22 | 23 | def forward(self, x, hidden_states, memory, src_mask, copy_tokens=None): 24 | """ 25 | x: decoder input embeddings, bsize x tgt_len x emb_size 26 | hidden_states: previous decoder state 27 | memory: memory and hidden_states 28 | src_mask: mask on src input, bsize x src_lens 29 | copy_tokens: bsize x src_lens x vocab_size 30 | @return: 31 | feats: bsize x tgt_lens x (dec_dim + enc_dim) 32 | copy_distribution: bsize x tgt_lens x (vocab_size + MAX_OOV_NUM) 33 | gate_scores: bsize x tgt_lens x 1 34 | """ 35 | out, hidden_states = self.rnn_decoder(x, hidden_states) 36 | context, pointer = [], [] 37 | for i in range(out.size(1)): 38 | tmp_context, tmp_ptr = self.attn(memory, out[:, i, :], src_mask) 39 | context.append(tmp_context) 40 | pointer.append(tmp_ptr.unsqueeze(dim=1)) 41 | context, pointer = torch.cat(context, dim=1), torch.cat(pointer, dim=1) 42 | feats = self.dropout_layer(torch.cat([out, context], dim=-1)) 43 | gate_scores = torch.sigmoid(self.gate(torch.cat([feats, x], dim=-1))) 44 | feats = self.affine(feats) 45 | copy_distribution = torch.bmm(pointer, copy_tokens) 46 | return feats, hidden_states, copy_distribution, gate_scores 47 | 
class DualLearning(nn.Module):
    """
    Dual-learning wrapper around a semantic-parsing model (sp: question ->
    logical form) and a question-generation model (qg: logical form ->
    question). Each training step runs one closed cycle (sp -> qg or
    qg -> sp) and produces policy-gradient-style losses built from a validity
    reward on the intermediate output and a reconstruction reward on the
    round trip.
    """

    def __init__(self, sp_model, qg_model, reward_model, sp_vocab, qg_vocab,
            alpha=0.5, beta=0.5, sample=5, reduction='sum', sp_device=None, qg_device=None, **kargs):
        """
        @args:
            1. alpha: reward for cycle starting from sp_reward = val_reward * alpha + rec_reward * (1 - alpha)
            2. beta: reward for cycle starting from qg_reward = val_reward * beta + rec_reward * (1 - beta)
            3. sample: beam search and sample size for training in dual learning cycles
        """
        super(DualLearning, self).__init__()
        # the two directions may live on different devices
        self.sp_device = sp_device
        self.qg_device = qg_device
        self.sp_model = sp_model.to(self.sp_device)
        self.qg_model = qg_model.to(self.qg_device)
        self.reward_model = reward_model
        self.alpha, self.beta, self.sample = alpha, beta, sample
        self.reduction = reduction  # 'sum' sums losses over the batch, anything else averages
        self.sp_vocab = sp_vocab
        self.qg_vocab = qg_vocab

    def forward(self, *args, start_from='semantic_parsing', **kargs):
        """
        @args:
            *args(tensors): positional arguments for semantic parsing or question generation
            start_from(enum): semantic_parsing or question_generation
        """
        if start_from == 'semantic_parsing':
            return self.cycle_start_from_sp(*args, **kargs)
        elif start_from == 'question_generation':
            return self.cycle_start_from_qg(*args, **kargs)
        else:
            raise ValueError('[Error]: dual learning cycle with unknown starting point !')

    def cycle_start_from_sp(self, inputs, lens, copy_tokens, oov_list, raw_in):
        """
        One dual cycle question -> logical form -> question.
        Returns (sp_loss, qg_loss) scalars.
        """
        domain = Example.domain
        # primal model: decode `sample` logical-form candidates per input
        results = self.sp_model.decode_batch(inputs, lens, self.sp_vocab.lf2id, copy_tokens, self.sample, self.sample)
        predictions, sp_scores = results['predictions'], results['scores']
        predictions = [idx for each in predictions for idx in each]  # flatten to bsize * sample candidates
        predictions = domain.reverse(predictions, self.sp_vocab.id2lf, oov_list=oov_list)
        raw_in = [each for each in raw_in for _ in range(self.sample)] # repeat sample times

        # validity reward for each candidate (reward model choice 'sp_val'),
        # with the per-input mean subtracted as a baseline to reduce variance
        sp_val_reward = self.reward_model(predictions, choice='sp_val').contiguous().view(-1, self.sample)
        baseline = sp_val_reward.mean(dim=-1, keepdim=True)
        sp_val_reward -= baseline

        # dual model: score reconstruction of the original question from each candidate
        qg_inputs, qg_lens, qg_dec_inputs, qg_dec_outputs, qg_out_lens, qg_copy_tokens = \
            self.sp2qg(predictions, raw_in, vocab=self.qg_vocab, device=self.qg_device)
        logscore = self.qg_model(qg_inputs, qg_lens, qg_dec_inputs[:, :-1], qg_copy_tokens)

        # reconstruction reward: the detached CPU copy is the reward signal for the
        # sp policy, while the differentiable rec_reward trains qg directly below
        rec_reward = self.reward_model(logscore, qg_dec_outputs[:, 1:], qg_out_lens - 1, choice='sp_rec').contiguous().view(-1, self.sample)
        sp_rec_reward = rec_reward.detach().cpu()
        baseline = sp_rec_reward.mean(dim=-1, keepdim=True)
        sp_rec_reward = sp_rec_reward - baseline

        # reward-weighted candidate scores (policy-gradient style objective)
        total_reward = self.alpha * sp_val_reward + (1 - self.alpha) * sp_rec_reward
        sp_loss = - torch.mean(total_reward.to(self.sp_device) * sp_scores, dim=1)
        sp_loss = torch.sum(sp_loss) if self.reduction == 'sum' else torch.mean(sp_loss)
        qg_loss = - torch.mean((1 - self.alpha) * rec_reward, dim=1)
        qg_loss = torch.sum(qg_loss) if self.reduction == 'sum' else torch.mean(qg_loss)
        return sp_loss, qg_loss

    def sp2qg(self, lf_list, utterances, vocab, device):
        # re-batch (logical form, question) pairs as qg examples on the qg device
        ex_list = [Example(' '.join(sent), ' '.join(lf)) for sent, lf in zip(utterances, lf_list)]
        inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = \
            get_minibatch_qg(ex_list, vocab, device, copy=self.qg_model.copy)
        return inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens

    def cycle_start_from_qg(self, inputs, lens, copy_tokens, oov_list, raw_in):
        """
        One dual cycle logical form -> question -> logical form; mirror of
        cycle_start_from_sp, weighted by beta instead of alpha.
        """
        domain = Example.domain
        # primal model
        results = self.qg_model.decode_batch(inputs, lens, self.qg_vocab.word2id, copy_tokens, self.sample, self.sample)
        predictions, qg_scores = results['predictions'], results['scores']
        predictions = [idx for each in predictions for idx in each]
        predictions = domain.reverse(predictions, self.qg_vocab.id2word, oov_list=oov_list)
        raw_in = [each for each in raw_in for _ in range(self.sample)] # repeat sample times

        # validity reward for each generated question, baseline-subtracted
        qg_val_reward = self.reward_model(predictions, choice='qg_val').contiguous().view(-1, self.sample)
        baseline = qg_val_reward.mean(dim=-1, keepdim=True)
        qg_val_reward -= baseline

        # dual model
        sp_inputs, sp_lens, sp_dec_inputs, sp_dec_outputs, sp_out_lens, sp_copy_tokens = \
            self.qg2sp(predictions, raw_in, self.sp_vocab, self.sp_device)
        logscore = self.sp_model(sp_inputs, sp_lens, sp_dec_inputs[:, :-1], sp_copy_tokens)

        # reconstruction reward (detached copy rewards qg; differentiable copy trains sp)
        rec_reward = self.reward_model(logscore, sp_dec_outputs[:, 1:], sp_out_lens - 1, choice='qg_rec').contiguous().view(-1, self.sample)
        qg_rec_reward = rec_reward.detach().cpu()
        baseline = qg_rec_reward.mean(dim=-1, keepdim=True)
        qg_rec_reward = qg_rec_reward - baseline

        total_reward = self.beta * qg_val_reward + (1 - self.beta) * qg_rec_reward
        qg_loss = - torch.mean(total_reward.to(self.qg_device) * qg_scores, dim=1)
        qg_loss = torch.sum(qg_loss) if self.reduction == 'sum' else torch.mean(qg_loss)
        sp_loss = - torch.mean((1 - self.beta) * rec_reward, dim=1)
        sp_loss = torch.sum(sp_loss) if self.reduction == 'sum' else torch.mean(sp_loss)
        return sp_loss, qg_loss

    def qg2sp(self, utterances, lf_list, vocab, device):
        # re-batch (question, logical form) pairs as sp examples on the sp device
        ex_list = [Example(' '.join(sent), ' '.join(lf)) for sent, lf in zip(utterances, lf_list)]
        inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = \
            get_minibatch_sp(ex_list, vocab, device, copy=self.sp_model.copy)
        return inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens

    def decode_batch(self, *args, task='semantic_parsing', **kargs):
        # evaluation-time decoding, dispatched to the requested direction
        if task == 'semantic_parsing':
            return self.sp_model.decode_batch(*args, **kargs)
        elif task == 'question_generation':
            return self.qg_model.decode_batch(*args, **kargs)
        else:
            raise ValueError('[Error]: unknown task name !')

    def pad_embedding_grad_zero(self):
        # keep padding embeddings frozen in both sub-models
        self.sp_model.pad_embedding_grad_zero()
        self.qg_model.pad_embedding_grad_zero()

    def load_model(self, sp_load_dir=None, qg_load_dir=None):
        # each sub-model can be (re)loaded independently
        if sp_load_dir is not None:
            self.sp_model.load_model(sp_load_dir)
        if qg_load_dir is not None:
            self.qg_model.load_model(qg_load_dir)

    def save_model(self, sp_save_dir=None, qg_save_dir=None):
        # each sub-model can be saved independently
        if sp_save_dir is not None:
            self.sp_model.save_model(sp_save_dir)
        if qg_save_dir is not None:
            self.qg_model.save_model(qg_save_dir)
class StateTransition(nn.Module):
    """
    Maps the encoder's final hidden states to the decoder's initial hidden
    states, handling both LSTM (h, c) tuples and single-tensor states (GRU/RNN).
    """

    METHODS = ['affine', 'reverse', 'tanh(affine)', 'empty']

    def __init__(self, num_layers, cell='lstm', bidirectional=True, hidden_dim=None, method='empty'):
        """
        Transform encoder final hidden states to decoder initial hidden states

        method:
            'empty'        - start the decoder from all-zero states
            'reverse'      - take the backward-direction states of a bidirectional encoder
            'affine'       - linear map from concatenated directions down to hidden_dim
            'tanh(affine)' - same, followed by tanh
        """
        super(StateTransition, self).__init__()
        self.cell = cell.upper()
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        assert method in StateTransition.METHODS
        self.method = method
        if 'affine' in self.method:
            # hidden_dim is needed to size the projection layers
            assert hidden_dim
            self.h_affine = nn.Linear(hidden_dim * self.num_directions, hidden_dim)
            if self.cell == 'LSTM':
                # LSTM also carries a cell state, projected separately
                self.c_affine = nn.Linear(hidden_dim * self.num_directions, hidden_dim)

    def forward(self, hidden_states):
        """
        hidden_states: encoder final states; an (h, c) tuple for LSTM, a single
        tensor otherwise — presumably shaped
        (num_layers * num_directions, bsize, hidden_dim) as nn.LSTM/nn.GRU return.
        Returns decoder initial states with a leading dim of num_layers.
        """
        if self.method == 'empty':
            # fresh zero states sized to the decoder's layer count
            if 'LSTM' in self.cell:
                enc_h, enc_c = hidden_states
                dec_h = enc_h.new_zeros(self.num_layers, enc_h.size(1), enc_h.size(2))
                dec_c = enc_c.new_zeros(self.num_layers, enc_c.size(1), enc_c.size(2))
                hidden_states = (dec_h, dec_c)
            else:
                enc_h = hidden_states
                dec_h = enc_h.new_zeros(self.num_layers, enc_h.size(1), enc_h.size(2))
                hidden_states = dec_h
        elif self.method == 'reverse':
            if self.num_directions == 2:
                # odd slices hold the backward direction of each layer
                index_slices = [2 * i + 1 for i in range(self.num_layers)] # from reversed path
                index_slices = torch.tensor(index_slices, dtype=torch.long, device=hidden_states[0].device)
                if self.cell == 'LSTM':
                    enc_h, enc_c = hidden_states
                    dec_h = torch.index_select(enc_h, 0, index_slices)
                    dec_c = torch.index_select(enc_c, 0, index_slices)
                    hidden_states = (dec_h.contiguous(), dec_c.contiguous())
                else:
                    enc_h = hidden_states
                    dec_h = torch.index_select(enc_h, 0, index_slices)
                    hidden_states = dec_h.contiguous()
            else:
                pass # do nothing, pass states directly
        else:
            # 'affine' / 'tanh(affine)': project each layer's concatenated
            # direction states down to hidden_dim
            if self.cell == 'LSTM':
                enc_h, enc_c = hidden_states
                batches = enc_h.size(1)
                # (layers*dirs, bsize, hid) -> (bsize*layers, dirs*hid); the view
                # relies on the layout grouping num_directions slices per layer
                dec_h = self.h_affine(enc_h.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                dec_c = self.c_affine(enc_c.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                if "tanh" in self.method:
                    dec_h, dec_c = torch.tanh(dec_h), torch.tanh(dec_c)
                dec_h = dec_h.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                dec_c = dec_c.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                hidden_states = (dec_h, dec_c)
            else:
                enc_h, batches = hidden_states, hidden_states.size(1)
                dec_h = self.h_affine(enc_h.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                if "tanh" in self.method:
                    dec_h = torch.tanh(dec_h)
                dec_h = dec_h.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                hidden_states = dec_h
        return hidden_states
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture.

    Holds the six sub-modules (src_embed, encoder, enc2dec, tgt_embed, decoder,
    generator); subclasses implement forward and the decode_* methods.
    """
    def __init__(self, src_embed, encoder, tgt_embed, decoder, enc2dec, generator):
        """
        All the arguments are of type nn.Module
        """
        super(EncoderDecoder, self).__init__()
        self.src_embed = src_embed
        self.encoder = encoder
        self.enc2dec = enc2dec
        self.tgt_embed = tgt_embed
        self.decoder = decoder
        self.generator = generator

    def forward(self, *args, **kargs):
        raise NotImplementedError

    def decode_batch(self, *args, **kargs):
        raise NotImplementedError

    def decode_greed(self, *args, **kargs):
        raise NotImplementedError

    def decode_beam_search(self, *args, **kargs):
        raise NotImplementedError

    def pad_embedding_grad_zero(self):
        # keep padding embedding rows fixed at zero after backward
        self.src_embed.pad_embedding_grad_zero()
        self.tgt_embed.pad_embedding_grad_zero()

    def load_model(self, load_dir):
        """Load parameters from the file path `load_dir`, mapping onto CPU."""
        # Pass the path straight to torch.load: the previous open(load_dir, 'rb')
        # created a file handle that was never closed (resource leak).
        self.load_state_dict(torch.load(load_dir, map_location=lambda storage, loc: storage))

    def save_model(self, save_dir):
        """Serialize the current state_dict to the file path `save_dir`."""
        # torch.save opens and closes the file itself; the previous
        # open(save_dir, 'wb') leaked the handle.
        torch.save(self.state_dict(), save_dir)
class GeneratorPointer(nn.Module):
    """
    Output layer mixing a vocabulary softmax with a copy (pointer)
    distribution through a per-step soft gate.
    """
    def __init__(self, feats, vocab, dropout=0.5):
        super(GeneratorPointer, self).__init__()
        self.proj = nn.Linear(feats, vocab)
        self.dropout_layer = nn.Dropout(p=dropout)

    def forward(self, x, copy_distribution, gate_scores):
        """
        x: bsize x tgt_lens x (dec_dim + enc_dim)
        copy_distribution: bsize x tgt_lens x (vocab_size + MAX_OOV_NUM)
        gate_scores: bsize x tgt_lens x 1
        @return: log of the gated mixture, same shape as copy_distribution
        """
        vocab_dist = F.softmax(self.proj(self.dropout_layer(x)), dim=-1)
        # generation can never produce OOV ids, so pad with zero probability
        oov_pad = torch.zeros(x.size(0), x.size(1), MAX_OOV_NUM, dtype=torch.float, device=x.device)
        gen_dist = torch.cat([vocab_dist, oov_pad], dim=-1)
        mixture = gate_scores * gen_dist + (1 - gate_scores) * copy_distribution
        # 1e-20 keeps the log finite where both distributions assign zero mass
        return torch.log(mixture + 1e-20)
Container module with an encoder, a recurrent module, and a decoder. 13 | """ 14 | def __init__(self, vocab_size=950, emb_size=1024, hidden_dim=256, 15 | num_layers=1, cell='lstm', pad_token_idxs=[], dropout=0.5, 16 | decoder_tied=False, init=0.2, **kargs): 17 | super(LanguageModel, self).__init__() 18 | self.dropout_layer = nn.Dropout(dropout) 19 | self.encoder = nn.Embedding(vocab_size, emb_size) 20 | self.cell = cell.upper() # RNN/LSTM/GRU 21 | self.rnn = getattr(nn, self.cell)( 22 | emb_size, hidden_dim, num_layers, 23 | batch_first=True, dropout=(dropout if num_layers > 1 else 0) 24 | ) 25 | self.affine = nn.Linear(hidden_dim, emb_size) 26 | self.decoder = nn.Linear(emb_size, vocab_size) 27 | 28 | if decoder_tied: 29 | self.decoder.weight = self.encoder.weight # shape: vocab_size, emb_size 30 | 31 | self.hidden_dim = hidden_dim 32 | self.num_layers = num_layers 33 | self.pad_token_idxs = list(pad_token_idxs) 34 | 35 | if init: 36 | for p in self.parameters(): 37 | p.data.uniform_(-init, init) 38 | for pad_token_idx in pad_token_idxs: 39 | self.encoder.weight.data[pad_token_idx].zero_() 40 | 41 | def pad_embedding_grad_zero(self): 42 | for pad_token_idx in self.pad_token_idxs: 43 | self.encoder.weight.grad[pad_token_idx].zero_() 44 | 45 | def forward(self, input_feats, lens): 46 | input_feats, lens = input_feats[:, :-1], lens - 1 47 | emb = self.dropout_layer(self.encoder(input_feats)) # bsize, seq_length, emb_size 48 | output, _ = rnn_wrapper(self.rnn, emb, lens, self.cell) 49 | decoded = self.decoder(self.affine(self.dropout_layer(output))) 50 | scores = F.log_softmax(decoded, dim=-1) 51 | return scores 52 | 53 | def sent_logprobability(self, input_feats, lens): 54 | ''' 55 | Given sentences, calculate its length-normalized log-probability 56 | Sequence must contain and symbol 57 | lens: length tensor 58 | ''' 59 | lens = lens - 1 60 | input_feats, output_feats = input_feats[:, :-1], input_feats[:, 1:] 61 | emb = self.dropout_layer(self.encoder(input_feats)) 
# bsize, seq_len, emb_size 62 | output, _ = rnn_wrapper(self.rnn, emb, lens, self.cell) 63 | decoded = self.decoder(self.affine(self.dropout_layer(output))) 64 | scores = F.log_softmax(decoded, dim=-1) 65 | log_prob = torch.gather(scores, 2, output_feats.unsqueeze(-1)).contiguous().view(output.size(0), output.size(1)) 66 | sent_log_prob = torch.sum(log_prob * lens2mask(lens).float(), dim=-1) 67 | return sent_log_prob / lens.float() 68 | 69 | def load_model(self, load_dir): 70 | self.load_state_dict(torch.load(open(load_dir, 'rb'), map_location=lambda storage, loc: storage)) 71 | 72 | def save_model(self, save_dir): 73 | torch.save(self.state_dict(), open(save_dir, 'wb')) 74 | -------------------------------------------------------------------------------- /models/model_attn.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import torch 3 | import torch.nn as nn 4 | from utils.constants import BOS, EOS, MAX_DECODE_LENGTH 5 | from models.model_utils import tile, lens2mask 6 | from models.Beam import Beam, GNMTGlobalScorer 7 | from models.encoder_decoder import EncoderDecoder 8 | 9 | class AttnModel(EncoderDecoder): 10 | 11 | def __init__(self, *args, **kargs): 12 | super(AttnModel, self).__init__(*args, **kargs) 13 | self.copy = False 14 | 15 | """ 16 | We use copy_tokens, just to be compatible with Attention Pointer Model 17 | """ 18 | def forward(self, src_inputs, src_lens, tgt_inputs, copy_tokens=None): 19 | """ 20 | Used during training time. 
21 | """ 22 | enc_out, hidden_states = self.encoder(self.src_embed(src_inputs), src_lens) 23 | hidden_states = self.enc2dec(hidden_states) 24 | src_mask = lens2mask(src_lens) 25 | dec_out, _ = self.decoder(self.tgt_embed(tgt_inputs), hidden_states, enc_out, src_mask, copy_tokens) 26 | out = self.generator(dec_out) 27 | return out 28 | 29 | def decode_batch(self, src_inputs, src_lens, vocab, copy_tokens=None, 30 | beam_size=5, n_best=1, alpha=0.6, length_pen='avg'): 31 | enc_out, hidden_states = self.encoder(self.src_embed(src_inputs), src_lens) 32 | hidden_states = self.enc2dec(hidden_states) 33 | src_mask = lens2mask(src_lens) 34 | if beam_size == 1: 35 | return self.decode_greed(hidden_states, enc_out, src_mask, vocab, copy_tokens) 36 | else: 37 | return self.decode_beam_search(hidden_states, enc_out, src_mask, vocab, copy_tokens, 38 | beam_size=beam_size, n_best=n_best, alpha=alpha, length_pen=length_pen) 39 | 40 | def decode_greed(self, hidden_states, memory, src_mask, vocab, copy_tokens=None): 41 | """ 42 | hidden_states: hidden_states from encoder 43 | memory: encoder output, bsize x src_len x enc_dim 44 | src_mask: ByteTensor, bsize x max_src_len 45 | vocab: tgt word2idx dict containing BOS, EOS 46 | """ 47 | results = {"scores":[], "predictions":[]} 48 | 49 | # first target token is BOS 50 | batches = memory.size(0) 51 | ys = torch.ones(batches, 1, dtype=torch.long).fill_(vocab[BOS]).to(memory.device) 52 | # record whether each sample is finished 53 | all_done = torch.tensor([False] * batches, dtype=torch.uint8, device=memory.device) 54 | scores = torch.zeros(batches, 1, dtype=torch.float, device=memory.device) 55 | predictions = [[] for i in range(batches)] 56 | 57 | for i in range(MAX_DECODE_LENGTH): 58 | logprob, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens) 59 | maxprob, ys = torch.max(logprob, dim=1, keepdim=True) 60 | for i in range(batches): 61 | if not all_done[i]: 62 | scores[i] += maxprob[i] 63 | 
predictions[i].append(ys[i]) 64 | done = ys.squeeze(dim=1) == vocab[EOS] 65 | all_done |= done 66 | if all_done.all(): 67 | break 68 | results["predictions"], results["scores"] = [[torch.cat(pred).tolist()] for pred in predictions], scores 69 | return results 70 | 71 | def decode_one_step(self, ys, hidden_states, memory, src_mask, copy_tokens=None): 72 | """ 73 | ys: bsize x 1 74 | """ 75 | dec_out, hidden_states = self.decoder(self.tgt_embed(ys), hidden_states, memory, src_mask, copy_tokens) 76 | out = self.generator(dec_out) 77 | return out.squeeze(dim=1), hidden_states 78 | 79 | def decode_beam_search(self, hidden_states, memory, src_mask, vocab, copy_tokens=None, 80 | beam_size=5, n_best=1, alpha=0.6, length_pen='avg'): 81 | """ 82 | Beam search decoding 83 | """ 84 | results = {"scores":[], "predictions":[]} 85 | 86 | # Construct beams, we donot use stepwise coverage penalty nor ngrams block 87 | remaining_sents = memory.size(0) 88 | global_scorer = GNMTGlobalScorer(alpha, length_pen) 89 | beam = [ Beam(beam_size, vocab, global_scorer=global_scorer, device=memory.device) 90 | for _ in range(remaining_sents) ] 91 | 92 | # repeat beam_size times 93 | memory, src_mask, copy_tokens = tile([memory, src_mask, copy_tokens], beam_size, dim=0) 94 | hidden_states = tile(hidden_states, beam_size, dim=1) 95 | h_c = type(hidden_states) in [list, tuple] 96 | batch_idx = list(range(remaining_sents)) 97 | 98 | for i in range(MAX_DECODE_LENGTH): 99 | # (a) construct beamsize * remaining_sents next words 100 | ys = torch.stack([b.get_current_state() for b in beam if not b.done()]).contiguous().view(-1,1) 101 | 102 | # (b) pass through the decoder network 103 | out, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens) 104 | out = out.contiguous().view(remaining_sents, beam_size, -1) 105 | 106 | # (c) advance each beam 107 | active, select_indices_array = [], [] 108 | # Loop over the remaining_batch number of beam 109 | for b in 
    def decode_beam_search(self, hidden_states, memory, src_mask, vocab, copy_tokens=None,
            beam_size=5, n_best=1, alpha=0.6, length_pen='avg'):
        """
        Beam search decoding.
        hidden_states: initial decoder hidden states; memory: encoder output
        (bsize x src_len x enc_dim); src_mask: bsize x max_src_len; vocab: tgt
        word2idx dict containing BOS/EOS; alpha/length_pen configure the GNMT
        length penalty. Returns {"predictions": per-sample list of n_best
        hypotheses, "scores": bsize x n_best FloatTensor}.
        """
        results = {"scores":[], "predictions":[]}

        # Construct beams, we donot use stepwise coverage penalty nor ngrams block
        remaining_sents = memory.size(0)
        global_scorer = GNMTGlobalScorer(alpha, length_pen)
        beam = [ Beam(beam_size, vocab, global_scorer=global_scorer, device=memory.device)
            for _ in range(remaining_sents) ]

        # repeat beam_size times: batch dim for memory/mask, dim 1 for RNN states
        memory, src_mask, copy_tokens = tile([memory, src_mask, copy_tokens], beam_size, dim=0)
        hidden_states = tile(hidden_states, beam_size, dim=1)
        h_c = type(hidden_states) in [list, tuple]  # LSTM returns an (h, c) pair
        batch_idx = list(range(remaining_sents))

        for i in range(MAX_DECODE_LENGTH):
            # (a) construct beamsize * remaining_sents next words
            ys = torch.stack([b.get_current_state() for b in beam if not b.done()]).contiguous().view(-1,1)

            # (b) pass through the decoder network
            out, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens)
            out = out.contiguous().view(remaining_sents, beam_size, -1)

            # (c) advance each beam
            active, select_indices_array = [], []
            # Loop over the remaining_batch number of beam
            for b in range(remaining_sents):
                idx = batch_idx[b] # idx represent the original order in minibatch_size
                beam[idx].advance(out[b])
                if not beam[idx].done():
                    active.append((idx, b))
                # origins are collected for every beam (finished ones too) so the
                # reorder below keeps hidden_states at remaining_sents * beam_size rows
                select_indices_array.append(beam[idx].get_current_origin() + b * beam_size)

            # (d) update hidden_states history: reorder along the beam axis to
            # follow each surviving hypothesis' backpointer
            select_indices_array = torch.cat(select_indices_array, dim=0)
            if h_c:
                hidden_states = (hidden_states[0].index_select(1, select_indices_array), hidden_states[1].index_select(1, select_indices_array))
            else:
                hidden_states = hidden_states.index_select(1, select_indices_array)

            if not active:
                break

            # (e) reserve un-finished batches
            active_idx = torch.tensor([item[1] for item in active], dtype=torch.long, device=memory.device) # original order in remaining batch
            batch_idx = { idx: item[0] for idx, item in enumerate(active) } # order for next remaining batch

            def update_active(t):
                # drop finished sentences from a (remaining_sents*beam_size, ...) tensor
                if t is None: return t
                t_reshape = t.contiguous().view(remaining_sents, beam_size, -1)
                new_size = list(t.size())
                new_size[0] = -1
                return t_reshape.index_select(0, active_idx).view(*new_size)

            if h_c:
                hidden_states = (
                    update_active(hidden_states[0].transpose(0, 1)).transpose(0, 1).contiguous(),
                    update_active(hidden_states[1].transpose(0, 1)).transpose(0, 1).contiguous()
                )
            else:
                hidden_states = update_active(hidden_states.transpose(0, 1)).transpose(0, 1).contiguous()
            memory = update_active(memory)
            src_mask = update_active(src_mask)
            copy_tokens = update_active(copy_tokens)
            remaining_sents = len(active)

        # collect the n_best finished hypotheses from every beam
        for b in beam:
            scores, ks = b.sort_finished(minimum=n_best)
            hyps = []
            for i, (times, k) in enumerate(ks[:n_best]):
                hyp = b.get_hyp(times, k)
                hyps.append(hyp.tolist()) # NOTE(review): original comment was truncated; hyp presumably includes EOS but not BOS — confirm against Beam.get_hyp
            results["predictions"].append(hyps) # batch list of variable_tgt_len
            results["scores"].append(torch.stack(scores)[:n_best]) # list of [n_best], torch.FloatTensor
        results["scores"] = torch.stack(results["scores"])
        return results
class AttnPtrModel(AttnModel):
    """Attention-based seq2seq model extended with a pointer (copy) mechanism.

    Wiring is inherited from AttnModel; the decoder and generator here
    additionally exchange a copy distribution and mixing gates.
    """

    def __init__(self, *args, **kargs):
        super(AttnPtrModel, self).__init__(*args, **kargs)
        # flag this model as copy-enabled for the rest of the pipeline
        self.copy = True

    def forward(self, src_inputs, src_lens, tgt_inputs, copy_tokens):
        """Teacher-forcing forward pass used during training time."""
        src_mask = lens2mask(src_lens)
        memory, enc_states = self.encoder(self.src_embed(src_inputs), src_lens)
        dec_init = self.enc2dec(enc_states)
        dec_out, _, copy_dist, gates = self.decoder(self.tgt_embed(tgt_inputs), dec_init, memory, src_mask, copy_tokens)
        return self.generator(dec_out, copy_dist, gates)

    def decode_one_step(self, ys, hidden_states, memory, src_mask, copy_tokens):
        """Advance decoding by one target token ys (bsize x 1)."""
        dec_out, hidden_states, copy_dist, gates = self.decoder(self.tgt_embed(ys), hidden_states, memory, src_mask, copy_tokens)
        step_scores = self.generator(dec_out, copy_dist, gates).squeeze(dim=1)
        return step_scores, hidden_states
def lens2mask(lens):
    """Build a boolean padding mask from a 1-D length tensor.

    @args:
        lens(torch.LongTensor): sequence length per sample, shape (bsize,)
    @return:
        masks: shape (bsize, max_len); True marks real (non-padding) positions
    """
    num_seqs, longest = lens.numel(), lens.max()
    positions = torch.arange(0, longest).type_as(lens).to(lens.device)
    # one row of positions per sequence, compared against that sequence's length
    masks = positions.repeat(num_seqs, 1).lt(lens.unsqueeze(1))
    masks.requires_grad = False
    return masks
class PenaltyBuilder(object):
    """Factory mapping a length-penalty option name to its scoring function.

    Args:
        length_pen (str): option name of length pen ('wu', 'avg';
            anything else selects the identity penalty)
    """

    def __init__(self, length_pen):
        self.length_pen = length_pen

    def length_penalty(self):
        """Return the penalty callable selected by ``self.length_pen``."""
        dispatch = {"wu": self.length_wu, "avg": self.length_average}
        return dispatch.get(self.length_pen, self.length_none)

    # ------------------------------------------------------------------
    # The individual penalty terms implemented so far
    # ------------------------------------------------------------------

    def length_wu(self, beam, logprobs, alpha=0.):
        """NMT length re-ranking score from
        "Google's Neural Machine Translation System" :cite:`wu2016google`.
        """
        modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)
        return logprobs / modifier

    def length_average(self, beam, logprobs, alpha=0.):
        """Average log-probability per token in the sequence."""
        return logprobs / len(beam.next_ys)

    def length_none(self, beam, logprobs, alpha=0.):
        """Identity: return the scores unmodified."""
        return logprobs
class RewardModel():
    """Reward provider for dual learning.

    Combines language-model fluency scores for questions / logical forms, a
    grammar (executability) check for logical forms, and reconstruction
    rewards computed from the dual cycle's log-probabilities.
    """

    def __init__(self, dataset, qlm, lflm, lm_vocab, sp_device='cpu', qg_device='cpu'):
        super(RewardModel, self).__init__()
        self.dataset = dataset
        # the question LM is queried during the sp cycle (sp_device), the
        # logical form LM during the qg cycle (qg_device)
        self.qlm = qlm.to(sp_device)
        self.lflm = lflm.to(qg_device)
        self.vocab = lm_vocab
        self.sp_device = sp_device
        self.qg_device = qg_device

    def forward(self, *args, choice='sp_val'):
        """Dispatch to a reward: 'sp_val', 'qg_val', or any choice containing 'rec'."""
        if choice == 'sp_val':
            return self.sp_validity_reward(*args)
        elif choice == 'qg_val':
            return self.qg_validity_reward(*args)
        elif 'rec' in choice:
            return self.reconstruction_reward(*args)
        else:
            raise ValueError('[Error]: unknown reward choice !')

    def sp_validity_reward(self, lf_list):
        # calculate logical form language model length normalized log probability
        # bugfix: EOS must be looked up in the logical form vocab (lf2id); the
        # original used word2id[EOS], mixing the question vocab into lf indices
        input_idxs = [[self.vocab.lf2id[BOS]] + [self.vocab.lf2id[word] if word in self.vocab.lf2id else self.vocab.lf2id[UNK] for word in sent] + [self.vocab.lf2id[EOS]] for sent in lf_list]
        lens = [len(each) for each in input_idxs]
        max_len = max(lens)
        input_idxs = [sent + [self.vocab.lf2id[PAD]] * (max_len - len(sent)) for sent in input_idxs]
        input_tensor = torch.tensor(input_idxs, dtype=torch.long, device=self.qg_device)
        lens = torch.tensor(lens, dtype=torch.long, device=self.qg_device)
        self.lflm.eval()
        with torch.no_grad():
            logprob = self.lflm.sent_logprobability(input_tensor, lens).cpu()
        # grammar check: does the normalized logical form yield a valid denotation
        domain = Example.domain
        ans = domain.is_valid(domain.obtain_denotations(domain.normalize(lf_list)))
        grammar = torch.tensor(ans, dtype=torch.float, requires_grad=False)
        # equally weight LM fluency and executability
        val_reward = 0.5 * logprob + 0.5 * grammar
        return val_reward

    def qg_validity_reward(self, utterances):
        # calculate language model length normalized log probability
        input_idxs = [[self.vocab.word2id[BOS]] + [self.vocab.word2id[word] if word in self.vocab.word2id else self.vocab.word2id[UNK] for word in sent] + [self.vocab.word2id[EOS]] for sent in utterances]
        lens = [len(each) for each in input_idxs]
        max_len = max(lens)
        input_idxs = [sent + [self.vocab.word2id[PAD]] * (max_len - len(sent)) for sent in input_idxs]
        input_tensor = torch.tensor(input_idxs, dtype=torch.long, device=self.sp_device)
        lens = torch.tensor(lens, dtype=torch.long, device=self.sp_device)
        self.qlm.eval()
        with torch.no_grad():
            logprob = self.qlm.sent_logprobability(input_tensor, lens).cpu()
        return logprob

    def reconstruction_reward(self, logscores, references, lens):
        """
        logscores: bsize x max_out_len x vocab_size[ + MAX_OOV_NUM]
        references: bsize x max_out_len
        lens: len for each sample
        Returns the masked sum of reference token log-scores per sample (bsize,).
        """
        mask = lens2mask(lens)
        pick_score = torch.gather(logscores, dim=-1, index=references.unsqueeze(dim=-1)).squeeze(dim=-1)
        masked_score = mask.float() * pick_score
        reward = masked_score.sum(dim=1)
        return reward

    def __call__(self, *args, **kargs):
        return self.forward(*args, **kargs)
# Fetch the third-party evaluator, runtime libraries and GloVe embeddings.
# Safe to re-run: downloads resume (wget -c) and directory creation is idempotent.

evaluator=evaluator.tar.gz
lib=lib.tar.gz

if [ ! -e "$evaluator" ]; then
    echo "Start downloading evaluator for overnight and geo datasets ..."
    wget -c https://worksheets.codalab.org/rest/bundles/0xbfbf0d1d8ab94874a68646a7d66c478e/contents/blob/ -O "$evaluator"
fi

if [ ! -e "$lib" ] ; then
    echo "Start downloading libraries for evaluation..."
    wget -c https://worksheets.codalab.org/rest/bundles/0xc6821b4f13f445d1b54e9da63019da1d/contents/blob/ -O "$lib"
fi

# -p keeps re-runs from aborting when the directories already exist
# (bare `mkdir` failed on a second invocation)
mkdir -p evaluator
mkdir -p lib
tar -zxf "$evaluator" -C evaluator
tar -zxf "$lib" -C lib
rm -rf "$evaluator"
rm -rf "$lib"
cp evaluator/sempre/module-classes.txt .

wget -c http://nlp.stanford.edu/data/glove.6B.zip
mkdir -p data/.cache
# -o overwrites quietly on re-extraction instead of prompting
unzip -o glove.6B.zip -d data/.cache/
rm glove.6B.zip
#!/bin/bash
# Train/evaluate an RNN language model on one side of the parallel data.
# Usage: run/run_language_model.sh <dataset> <side>
#   <dataset>: atis / geo / overnight subdomain
#   <side>: question or logical_form
task='language_model'
dataset=$1
side=$2 # question, logical_form
# read_model_path=''

# model hyper-parameters
num_layers=1
hidden_dim=200
emb_size=100
cell=lstm # lstm, gru
decoder_tied='' # '--decoder_tied', ''

# training hyper-parameters
batchSize=16
test_batchSize=128
lr=0.001
dropout=0.5
max_norm=5
l2=1e-5
max_epoch=100
labeled=1.0
deviceId=0

python scripts/language_model.py --task $task --dataset $dataset --side $side \
    --num_layers $num_layers --hidden_dim $hidden_dim --emb_size $emb_size --cell $cell \
    --batchSize $batchSize --test_batchSize $test_batchSize --lr $lr --dropout $dropout --max_norm $max_norm --l2 $l2 \
    --labeled $labeled --max_epoch $max_epoch --deviceId $deviceId $decoder_tied
#!/bin/bash
# Pseudo-labeling baseline: bootstrap from pretrained semantic parsing (sp)
# and question generation (qg) models.
# Usage: run/run_pseudo_method.sh <dataset> <attn|attnptr> <labeled_ratio>

task='pseudo_method'
dataset=$1
# model paths are keyed by whether the pretrained models used the copy mechanism
if [ "$2" = 'attnptr' ] ; then
    copy='copy__'
else
    copy=''
fi
read_sp_model_path=exp/task_semantic_parsing/dataset_${1}/labeled_${3}/${copy}cell_lstm__emb_100__hidden_200_x_1__dropout_0.5__reduce_sum__lr_0.001__mn_5.0__l2_1e-05__bsize_16__me_100__beam_5__nbest_1/
read_qg_model_path=exp/task_question_generation/dataset_${1}/labeled_${3}/${copy}cell_lstm__emb_100__hidden_200_x_1__dropout_0.5__reduce_sum__lr_0.001__mn_5.0__l2_1e-05__bsize_16__me_100__beam_5__nbest_1/

# training paras
reduction=sum # sum, mean
lr=0.001
l2=1e-5
batchSize=16
test_batchSize=128
max_norm=5
max_epoch=100
beam=5
n_best=1

# special paras
discount=0.5
method=constant # constant, linear
labeled=$3
unlabeled=1.0
deviceId="0 1"
seed=999
extra='--extra'

python3 scripts/pseudo_method.py --task $task --dataset $dataset \
    --read_sp_model_path $read_sp_model_path --read_qg_model_path $read_qg_model_path \
    --reduction $reduction --lr $lr --l2 $l2 --batchSize $batchSize --test_batchSize $test_batchSize \
    --discount $discount --method $method --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \
    --labeled $labeled --unlabeled $unlabeled --seed $seed --deviceId $deviceId $extra
reduction=sum # sum, mean 20 | lr=0.001 21 | l2=1e-5 22 | dropout=0.5 23 | batchSize=16 24 | test_batchSize=128 25 | init_weight=0.2 26 | max_norm=5 27 | max_epoch=100 28 | beam=5 29 | n_best=1 30 | 31 | # special paras 32 | labeled=$3 33 | deviceId=0 34 | seed=999 35 | 36 | python3 scripts/question_generation.py --task $task $copy --emb_size $emb_size --hidden_dim $hidden_dim --num_layers $num_layers \ 37 | --dataset $dataset --cell $cell --reduction $reduction --lr $lr --l2 $l2 --dropout $dropout --batchSize $batchSize --test_batchSize $test_batchSize \ 38 | --init_weight $init_weight --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \ 39 | --labeled $labeled --deviceId $deviceId --seed $seed 40 | -------------------------------------------------------------------------------- /run/run_semantic_parsing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | task='semantic_parsing' 4 | dataset=$1 5 | # read_model_path='' 6 | 7 | # model paras 8 | if [ "$2" = "attnptr" ] ; then 9 | copy='--copy' 10 | else 11 | copy='' 12 | fi 13 | emb_size=100 14 | hidden_dim=200 15 | num_layers=1 16 | cell=lstm # lstm, gru 17 | 18 | # training paras 19 | reduction=sum # sum, mean 20 | lr=0.001 21 | l2=1e-5 22 | dropout=0.5 23 | batchSize=16 24 | test_batchSize=128 25 | init_weight=0.2 26 | max_norm=5 27 | max_epoch=100 28 | beam=5 29 | n_best=1 30 | 31 | # special paras 32 | labeled=$3 33 | deviceId=0 34 | seed=999 35 | 36 | python3 scripts/semantic_parsing.py --task $task $copy --emb_size $emb_size --hidden_dim $hidden_dim --num_layers $num_layers \ 37 | --dataset $dataset --cell $cell --reduction $reduction --lr $lr --l2 $l2 --dropout $dropout --batchSize $batchSize --test_batchSize $test_batchSize \ 38 | --init_weight $init_weight --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \ 39 | --labeled $labeled --deviceId $deviceId --seed $seed 40 | 
def main(args=sys.argv[1:]):
    """Parse and validate command line arguments for the dual learning script.

    @args:
        args: list of command line tokens (defaults to sys.argv[1:])
    @return:
        argparse.Namespace holding all hyper-parameters
    """
    parser = argparse.ArgumentParser()
    # fixed help text: this script runs dual learning, not the pseudo method
    parser.add_argument('--task', required=True, help='dual learning for semantic parsing')
    parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)')
    parser.add_argument('--dataset', required=True, help='which dataset to experiment on')
    parser.add_argument('--read_model_path', help='Testing mode, load sp and qg model path')
    # model params
    parser.add_argument('--read_sp_model_path', required=True, help='pretrained sp model')
    parser.add_argument('--read_qg_model_path', required=True, help='pretrained qg model path')
    parser.add_argument('--read_qlm_path', required=True, help='language model for natural language questions')
    parser.add_argument('--read_lflm_path', required=True, help='language model for logical form')
    # training paras
    parser.add_argument('--reduction', choices=['sum', 'mean'], default='sum')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)')
    parser.add_argument('--batchSize', type=int, default=16, help='input batch size')
    parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding')
    parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)")
    parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for')
    # special paras
    parser.add_argument('--sample', type=int, default=5, help='size of sampling during training in dual learning')
    parser.add_argument('--beam', default=5, type=int, help='used during decoding time')
    parser.add_argument('--n_best', default=1, type=int, help='used during decoding time')
    parser.add_argument('--alpha', type=float, default=0.5, help='coefficient which combines sp valid and reconstruction reward')
    parser.add_argument('--beta', type=float, default=0.5, help='coefficient which combines qg valid and reconstruction reward')
    parser.add_argument('--cycle', choices=['sp', 'qg', 'sp+qg'], default='sp+qg', help='whether use cycle starts from sp/qg')
    parser.add_argument('--labeled', type=float, default=1.0, help='ratio of labeled samples')
    parser.add_argument('--unlabeled', type=float, default=1.0, help='ratio of unlabeled samples')
    parser.add_argument('--deviceId', type=int, nargs=2, default=[-1, -1], help='device for semantic parsing and question generation model respectively')
    parser.add_argument('--seed', type=int, default=999, help='set initial random seed')
    parser.add_argument('--extra', action='store_true', help='whether use synthesized logical forms')
    opt = parser.parse_args(args)

    # Some Arguments Check: report bad values through the parser instead of
    # bare `assert`, which is silently stripped under `python -O`
    if not opt.labeled > 0.:
        parser.error('--labeled must be a positive ratio')
    if not (0. <= opt.unlabeled <= 1.0):
        parser.error('--unlabeled must lie in [0, 1]')
    return opt
# Data reading: bind the target domain, then build labeled/unlabeled splits.
Example.set_domain(opt.dataset)
if not opt.testing:
    train_dataset, dev_dataset = Example.load_dataset(choice='train')
    # keep `labeled` ratio as supervised data; the remainder acts as unlabeled
    labeled_train_dataset, unlabeled_train_dataset = split_dataset(train_dataset, opt.labeled)
    unlabeled_train_dataset, _ = split_dataset(unlabeled_train_dataset, opt.unlabeled)
    # labeled samples also participate in the unlabeled (cycle) objective
    unlabeled_train_dataset += labeled_train_dataset
    if opt.extra:
        q_unlabeled_train_dataset = unlabeled_train_dataset
        # synthesized logical forms enlarge only the logical-form side
        lf_unlabeled_train_dataset = unlabeled_train_dataset + Example.load_dataset(choice='extra')
    else:
        q_unlabeled_train_dataset, lf_unlabeled_train_dataset = unlabeled_train_dataset, unlabeled_train_dataset
    logger.info("Labeled/Unlabeled train dataset size is: %s and %s" % (len(labeled_train_dataset), len(lf_unlabeled_train_dataset)))
    logger.info("Dev dataset size is: %s" % (len(dev_dataset)))
test_dataset = Example.load_dataset(choice='test')
logger.info("Test dataset size is: %s" % (len(test_dataset)))

###################################### Model Construction ########################################

# Training mode persists the construction params; testing mode reloads them.
if not opt.testing:
    params = {
        "read_sp_model_path": opt.read_sp_model_path, "read_qg_model_path": opt.read_qg_model_path,
        "read_qlm_path": opt.read_qlm_path, "read_lflm_path": opt.read_lflm_path,
        "sample": opt.sample, "alpha": opt.alpha, "beta": opt.beta, "reduction": opt.reduction
    }
    json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4)
else:
    params = json.load(open(os.path.join(exp_path, "params.json"), 'r'))
# rebuild sp/qg models from their saved hyper-parameters
sp_params = json.load(open(os.path.join(params['read_sp_model_path'], 'params.json'), 'r'))
sp_model = model(**sp_params)
qg_params = json.load(open(os.path.join(params['read_qg_model_path'], 'params.json'), 'r'))
qg_model = model(**qg_params)
if not opt.testing:
    # warm-start from the pretrained supervised checkpoints
    sp_model.load_model(os.path.join(params['read_sp_model_path'], 'model.pkl'))
    logger.info("Load Semantic Parsing model from path %s" % (params['read_sp_model_path']))
    qg_model.load_model(os.path.join(params['read_qg_model_path'], 'model.pkl'))
    logger.info("Load Question Generation model from path %s" % (params['read_qg_model_path']))
    qlm_params = json.load(open(os.path.join(params['read_qlm_path'], 'params.json'), 'r'))
    qlm_model = LanguageModel(**qlm_params)
    qlm_model.load_model(os.path.join(params['read_qlm_path'], 'model.pkl'))
    logger.info("Load Question Language Model from path %s" % (params['read_qlm_path']))
    lflm_params = json.load(open(os.path.join(params['read_lflm_path'], 'params.json'), 'r'))
    lflm_model = LanguageModel(**lflm_params)
    lflm_model.load_model(os.path.join(params['read_lflm_path'], 'model.pkl'))
    logger.info("Load Logical Form Language Model from path %s" % (params['read_lflm_path']))
    reward_model = RewardModel(opt.dataset, qlm_model, lflm_model, lm_vocab, sp_device=sp_device, qg_device=qg_device)
else:
    # testing restores the dual-learning fine-tuned checkpoints instead
    sp_model.load_model(os.path.join(exp_path, 'sp_model.pkl'))
    logger.info("Load Semantic Parsing model from path %s" % (exp_path))
    qg_model.load_model(os.path.join(exp_path, 'qg_model.pkl'))
    logger.info("Load Question Generation model from path %s" % (exp_path))
    reward_model = None  # rewards are only needed during training
train_model = DualLearning(sp_model, qg_model, reward_model, sp_vocab, qg_vocab,
    alpha=params['alpha'], beta=params['beta'], sample=params['sample'],
    reduction=params["reduction"], sp_device=sp_device, qg_device=qg_device)

# separate loss functions per direction, ignoring each side's PAD index
loss_function = {'sp': {}, 'qg': {}}
loss_function['sp'] = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction)
loss_function['qg'] = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction)
optimizer = set_optimizer(sp_model, qg_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm)

###################################### Training and Decoding #######################################

vocab = {'sp': sp_vocab, 'qg': qg_vocab}
device = {'sp': sp_device, 'qg': qg_device}
solver = DualLearningSolver(train_model, vocab, loss_function, optimizer, exp_path, logger, device=device)
if not opt.testing:
    logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time()))))
    solver.train_and_decode(labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset,
        batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, cycle=opt.cycle,
        max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best)
else:
    logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time()))))
    start_time = time.time()
    acc, bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize, beam=opt.beam, n_best=opt.n_best)
    logger.info('Evaluation cost: %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu: %.4f)'
        % (time.time() - start_time, acc, bleu))
############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='language_model', help='language model') 22 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 23 | parser.add_argument('--dataset', required=True, help='which dataset to experiemnt on') 24 | parser.add_argument('--side', choices=['question', 'logical_form'], help='which side to build language model') 25 | # pretrained models 26 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 27 | # model paras 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping 
(2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | # special paras 43 | parser.add_argument('--decoder_tied', action='store_true', help='whether use the same embedding weights and output matrix') 44 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 45 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1:cpu') 46 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 47 | opt = parser.parse_args(args) 48 | if opt.testing: 49 | assert opt.read_model_path 50 | return opt 51 | 52 | opt = main() 53 | 54 | ####################### Output path, logger, device and random seed configuration ################# 55 | 56 | exp_path = opt.read_model_path if opt.testing else hyperparam_lm(opt) 57 | if not os.path.exists(exp_path): 58 | os.makedirs(exp_path) 59 | 60 | logger = set_logger(exp_path, testing=opt.testing) 61 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 62 | logger.info("Experiment path: %s" % (exp_path)) 63 | opt.device = set_torch_device(opt.deviceId) 64 | set_random_seed(opt.seed, device=opt.device.type) 65 | 66 | ################################ Vocab and Data Reader ########################### 67 | 68 | lm_vocab = Vocab(opt.dataset, task='language_model') 69 | if opt.side == 'question': 70 | word2id = lm_vocab.word2id 71 | logger.info("Vocab size for natural language sentence is: %s" % (len(word2id))) 72 | else: 73 | word2id = lm_vocab.lf2id 74 | logger.info("Vocab size for logical form is: %s" % (len(word2id))) 75 | 76 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 77 | Example.set_domain(opt.dataset) 78 | if not opt.testing: 79 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 80 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 81 | logger.info("Train 
and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 82 | test_dataset = Example.load_dataset(choice='test') 83 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 84 | 85 | ###################################### Model Construction ######################################## 86 | 87 | if not opt.testing: 88 | params = { 89 | 'emb_size': opt.emb_size, 'vocab_size': len(word2id), 'pad_token_idxs': [word2id[PAD]], 90 | 'hidden_dim': opt.hidden_dim, 'decoder_tied': opt.decoder_tied, 'num_layers': opt.num_layers, 'cell': opt.cell, 91 | 'dropout': opt.dropout, 'init': opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio = load_embeddings(train_model.encoder, word2id, opt.device) 103 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio * 100)) 104 | if opt.testing: 105 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 106 | train_model.load_model(model_path) 107 | logger.info("Load model from path %s" % (model_path)) 108 | 109 | # set loss function and optimizer 110 | loss_function = set_loss_function(ignore_index=word2id[PAD], reduction=opt.reduction) 111 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 112 | 113 | ###################################### Training and Decoding ####################################### 114 | 115 | solver = LMSolver(train_model, lm_vocab, loss_function, optimizer, exp_path, logger, device=opt.device, side=opt.side) 116 | if not opt.testing: 117 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 118 | solver.train_and_decode(train_dataset, 
dev_dataset, test_dataset, 119 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, max_epoch=opt.max_epoch) 120 | else: 121 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 122 | start_time = time.time() 123 | ppl = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize) 124 | logger.info('Evaluation cost: %.4fs\tppl : %.4f' % (time.time() - start_time, ppl)) 125 | -------------------------------------------------------------------------------- /scripts/pseudo_method.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.vocab import Vocab 5 | from utils.example import Example, split_dataset 6 | from utils.optimizer import set_optimizer 7 | from utils.loss import set_loss_function 8 | from utils.seed import set_random_seed 9 | from utils.logger import set_logger 10 | from utils.gpu import set_torch_device 11 | from utils.constants import * 12 | from utils.solver.solver_pseduo_method import PseudoSolver 13 | from utils.hyperparam import hyperparam_pseudo_method 14 | from models.construct_models import construct_model as model 15 | 16 | ############################### Arguments parsing and Preparations ############################## 17 | 18 | def main(args=sys.argv[1:]): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--task', required=True, help='pseudo method for semantic parsing') 21 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 22 | parser.add_argument('--dataset', required=True, help='which dataset to experiment on') 23 | parser.add_argument('--read_model_path', help='Testing mode, load sp and qg model path') 24 | # model params 25 | parser.add_argument('--read_sp_model_path', required=True, help='pretrained sp model') 
26 | parser.add_argument('--read_qg_model_path', required=True, help='pretrained qg model path') 27 | # pseudo training paras 28 | parser.add_argument('--reduction', choices=['sum', 'mean'], default='sum') 29 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 30 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 31 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 32 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 33 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 34 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 35 | # special paras 36 | parser.add_argument('--beam', default=5, type=int, help='used during decoding time') 37 | parser.add_argument('--n_best', default=1, type=int, help='used during decoding time') 38 | parser.add_argument('--labeled', type=float, default=1.0, help='ratio of labeled samples') 39 | parser.add_argument('--unlabeled', type=float, default=1.0, help='ratio of unlabeled samples') 40 | parser.add_argument('--method', choices=['constant', 'linear'], help='how to change confidence during training') 41 | parser.add_argument('--discount', type=float, default=1.0, help="final confidence for pseudo examples") 42 | parser.add_argument('--deviceId', type=int, nargs=2, default=[-1, -1], help='gpu indexes for slu and nlg models respectively, -1:cpu') 43 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 44 | parser.add_argument('--extra', action='store_true', help='whether use synthesized logical forms') 45 | opt = parser.parse_args(args) 46 | 47 | # Some Arguments Check 48 | assert opt.labeled > 0. and opt.labeled < 1.0 49 | assert opt.unlabeled > 0. 
and opt.unlabeled <= 1.0 50 | return opt 51 | 52 | opt = main() 53 | 54 | ####################### Output path, logger, device and random seed configuration ################# 55 | 56 | exp_path = opt.read_model_path if opt.testing else hyperparam_pseudo_method(opt) 57 | if not os.path.exists(exp_path): 58 | os.makedirs(exp_path) 59 | 60 | logger = set_logger(exp_path, testing=opt.testing) 61 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 62 | logger.info("Experiment path: %s" % (exp_path)) 63 | sp_device, qg_device = set_torch_device(opt.deviceId[0]), set_torch_device(opt.deviceId[1]) 64 | set_random_seed(opt.seed, device='cuda') 65 | 66 | ################################ Vocab and Data Reader ########################### 67 | 68 | sp_copy, qg_copy = 'copy__' in opt.read_sp_model_path, 'copy__' in opt.read_qg_model_path 69 | sp_vocab, qg_vocab = Vocab(opt.dataset, task='semantic_parsing', copy=sp_copy), Vocab(opt.dataset, task='question_generation', copy=qg_copy) 70 | logger.info("Semantic Parsing model vocabulary ...") 71 | logger.info("Vocab size for input natural language sentence is: %s" % (len(sp_vocab.word2id))) 72 | logger.info("Vocab size for output logical form is: %s" % (len(sp_vocab.lf2id))) 73 | 74 | logger.info("Question Generation model vocabulary ...") 75 | logger.info("Vocab size for input logical form is: %s" % (len(qg_vocab.lf2id))) 76 | logger.info("Vocab size for output natural language sentence is: %s" % (len(qg_vocab.word2id))) 77 | 78 | logger.info("Read dataset starts at %s" % (time.asctime(time.localtime(time.time())))) 79 | Example.set_domain(opt.dataset) 80 | if not opt.testing: 81 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 82 | labeled_train_dataset, unlabeled_train_dataset = split_dataset(train_dataset, opt.labeled) 83 | unlabeled_train_dataset, _ = split_dataset(unlabeled_train_dataset, opt.unlabeled) 84 | if opt.extra: 85 | q_unlabeled_train_dataset = unlabeled_train_dataset 86 | 
lf_unlabeled_train_dataset = unlabeled_train_dataset + Example.load_dataset(choice='extra') 87 | else: 88 | q_unlabeled_train_dataset, lf_unlabeled_train_dataset = unlabeled_train_dataset, unlabeled_train_dataset 89 | logger.info("Labeled/Unlabeled train dataset size is: %s and %s" % (len(labeled_train_dataset), len(lf_unlabeled_train_dataset))) 90 | logger.info("Dev dataset size is: %s" % (len(dev_dataset))) 91 | test_dataset = Example.load_dataset(choice='test') 92 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 93 | 94 | ###################################### Model Construction ######################################## 95 | 96 | if not opt.testing: 97 | params = { "read_sp_model_path": opt.read_sp_model_path, "read_qg_model_path": opt.read_qg_model_path } 98 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 99 | else: 100 | params = json.load(open(os.path.join(exp_path, "params.json"), 'r')) 101 | sp_params = json.load(open(os.path.join(params['read_sp_model_path'], 'params.json'), 'r')) 102 | sp_model = model(**sp_params) 103 | qg_params = json.load(open(os.path.join(params['read_qg_model_path'], 'params.json'), 'r')) 104 | qg_model = model(**qg_params) 105 | if not opt.testing: 106 | sp_model.load_model(os.path.join(params['read_sp_model_path'], 'model.pkl')) 107 | logger.info("Load Semantic Parsing model from path %s" % (params['read_sp_model_path'])) 108 | qg_model.load_model(os.path.join(params['read_qg_model_path'], 'model.pkl')) 109 | logger.info("Load Question Generation model from path %s" % (params['read_qg_model_path'])) 110 | else: 111 | sp_model.load_model(os.path.join(exp_path, 'sp_model.pkl')) 112 | logger.info("Load Semantic Parsing model from path %s" % (exp_path)) 113 | qg_model.load_model(os.path.join(exp_path, 'qg_model.pkl')) 114 | logger.info("Load Question Generation model from path %s" % (exp_path)) 115 | sp_model, qg_model = sp_model.to(sp_device), qg_model.to(qg_device) 116 | 117 | 
loss_function = {'sp': {}, 'qg': {}} 118 | loss_function['sp'] = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction) 119 | loss_function['qg'] = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction) 120 | optimizer = set_optimizer(sp_model, qg_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 121 | 122 | ###################################### Training and Decoding ####################################### 123 | 124 | train_model = {'sp': sp_model, 'qg': qg_model} 125 | vocab = {'sp': sp_vocab, 'qg': qg_vocab} 126 | device = {'sp': sp_device, 'qg': qg_device} 127 | solver = PseudoSolver(train_model, vocab, loss_function, optimizer, exp_path, logger, device=device, 128 | discount=opt.discount, method=opt.method) 129 | if not opt.testing: 130 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 131 | solver.train_and_decode(labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset, 132 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 133 | max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best) 134 | else: 135 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 136 | start_time = time.time() 137 | acc, bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize, beam=opt.beam, n_best=opt.n_best) 138 | logger.info('Evaluation cost: %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu: %.4f)' 139 | % (time.time() - start_time, acc, bleu)) 140 | -------------------------------------------------------------------------------- /scripts/question_generation.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.hyperparam import hyperparam_seq2seq 5 | from utils.logger import 
set_logger 6 | from utils.vocab import Vocab 7 | from utils.seed import set_random_seed 8 | from utils.example import split_dataset, Example 9 | from utils.constants import PAD, UNK 10 | from utils.loss import set_loss_function 11 | from utils.optimizer import set_optimizer 12 | from utils.gpu import set_torch_device 13 | from models.construct_models import construct_model as model 14 | from utils.word2vec import load_embeddings 15 | from utils.solver.solver_question_generation import QGSolver 16 | 17 | ############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='question_generation', help='question generation') 22 | parser.add_argument('--dataset', type=str, required=True, help='which dataset to experiment on') 23 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 24 | # pretrained models 25 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 26 | # model paras 27 | parser.add_argument('--copy', action='store_true', help='attn model or attnptr model') 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, 
default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | parser.add_argument('--beam', default=5, type=int, help='beam search size') 43 | parser.add_argument('--n_best', default=1, type=int, help='return n best results') 44 | # special paras 45 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 46 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1: cpu, o.w. 
gpu index') 47 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 48 | opt = parser.parse_args(args) 49 | if opt.testing: 50 | assert opt.read_model_path 51 | return opt 52 | 53 | opt = main() 54 | 55 | ####################### Output path, logger, device and random seed configuration ################# 56 | 57 | exp_path = opt.read_model_path if opt.testing else hyperparam_seq2seq(opt) 58 | if not os.path.exists(exp_path): 59 | os.makedirs(exp_path) 60 | 61 | logger = set_logger(exp_path, testing=opt.testing) 62 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 63 | logger.info("Experiment path: %s" % (exp_path)) 64 | opt.device = set_torch_device(opt.deviceId) 65 | set_random_seed(opt.seed, device=opt.device.type) 66 | 67 | ################################ Vocab and Data Reader ########################### 68 | 69 | qg_vocab = Vocab(opt.dataset, task='question_generation', copy=opt.copy) 70 | logger.info("Vocab size for input logical form is: %s" % (len(qg_vocab.lf2id))) 71 | logger.info("Vocab size for output natural language sentence is: %s" % (len(qg_vocab.word2id))) 72 | 73 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 74 | Example.set_domain(opt.dataset) 75 | if not opt.testing: 76 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 77 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 78 | logger.info("Train and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 79 | test_dataset = Example.load_dataset(choice='test') 80 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 81 | 82 | ###################################### Model Construction ######################################## 83 | 84 | if not opt.testing: 85 | params = { 86 | "copy": opt.copy, # model attn or model attnptr 87 | "src_vocab": len(qg_vocab.lf2id), "tgt_vocab": len(qg_vocab.word2id), 88 | "src_unk_idx": 
qg_vocab.lf2id[UNK], "tgt_unk_idx": qg_vocab.word2id[UNK], 89 | "pad_src_idxs": [qg_vocab.lf2id[PAD]], "pad_tgt_idxs": [qg_vocab.word2id[PAD]], 90 | "src_emb_size": opt.emb_size, "tgt_emb_size": opt.emb_size, "hidden_dim": opt.hidden_dim, 91 | "num_layers": opt.num_layers, "cell": opt.cell, "dropout": opt.dropout, "init": opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio1 = load_embeddings(train_model.src_embed.embed, qg_vocab.lf2id, opt.device) 103 | ratio2 = load_embeddings(train_model.tgt_embed.embed, qg_vocab.word2id, opt.device) 104 | logger.info("%.2f%% token embeddings from pretrained vectors" % (ratio1 * 100)) 105 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio2 * 100)) 106 | else: 107 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 108 | train_model.load_model(model_path) 109 | logger.info("Load model from path %s" % (model_path)) 110 | 111 | # set loss function and optimizer 112 | loss_function = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction) 113 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 114 | 115 | ###################################### Training and Decoding ####################################### 116 | 117 | solver = QGSolver(train_model, qg_vocab, loss_function, optimizer, exp_path, logger, device=opt.device) 118 | if not opt.testing: 119 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 120 | solver.train_and_decode(train_dataset, dev_dataset, test_dataset, 121 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 122 | 
max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best) 123 | else: 124 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 125 | start_time = time.time() 126 | bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), 127 | opt.test_batchSize, beam=opt.beam, n_best=opt.n_best) 128 | logger.info('Evaluation cost: %.4fs\tAcc : %.4f' % (time.time() - start_time, bleu)) -------------------------------------------------------------------------------- /scripts/semantic_parsing.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.hyperparam import hyperparam_seq2seq 5 | from utils.logger import set_logger 6 | from utils.vocab import Vocab 7 | from utils.seed import set_random_seed 8 | from utils.example import split_dataset, Example 9 | from utils.constants import PAD, UNK 10 | from utils.loss import set_loss_function 11 | from utils.optimizer import set_optimizer 12 | from utils.gpu import set_torch_device 13 | from models.construct_models import construct_model as model 14 | from utils.word2vec import load_embeddings 15 | from utils.solver.solver_semantic_parsing import SPSolver 16 | 17 | ############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='semantic_parsing', help='semantic parsing') 22 | parser.add_argument('--dataset', type=str, required=True, help='which dataset to experiment on') 23 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 24 | # pretrained models 25 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 26 | # model paras 27 | 
parser.add_argument('--copy', action='store_true', help='attn model or attnptr model') 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | parser.add_argument('--beam', default=5, type=int, help='beam search size') 43 | parser.add_argument('--n_best', default=1, type=int, help='return n best results') 44 | # special paras 45 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 46 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1: cpu, o.w. 
gpu index') 47 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 48 | opt = parser.parse_args(args) 49 | if opt.testing: 50 | assert opt.read_model_path 51 | return opt 52 | 53 | opt = main() 54 | 55 | ####################### Output path, logger, device and random seed configuration ################# 56 | 57 | exp_path = opt.read_model_path if opt.testing else hyperparam_seq2seq(opt) 58 | if not os.path.exists(exp_path): 59 | os.makedirs(exp_path) 60 | 61 | logger = set_logger(exp_path, testing=opt.testing) 62 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 63 | logger.info("Experiment path: %s" % (exp_path)) 64 | opt.device = set_torch_device(opt.deviceId) 65 | set_random_seed(opt.seed, device=opt.device.type) 66 | 67 | ################################ Vocab and Data Reader ########################### 68 | 69 | sp_vocab = Vocab(opt.dataset, task='semantic_parsing', copy=opt.copy) 70 | logger.info("Vocab size for input natural language sentence is: %s" % (len(sp_vocab.word2id))) 71 | logger.info("Vocab size for output logical form is: %s" % (len(sp_vocab.lf2id))) 72 | 73 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 74 | Example.set_domain(opt.dataset) 75 | if not opt.testing: 76 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 77 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 78 | logger.info("Train and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 79 | test_dataset = Example.load_dataset(choice='test') 80 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 81 | 82 | ###################################### Model Construction ######################################## 83 | 84 | if not opt.testing: 85 | params = { 86 | "copy": opt.copy, # model attn or model attnptr 87 | "src_vocab": len(sp_vocab.word2id), "tgt_vocab": len(sp_vocab.lf2id), 88 | "src_unk_idx": 
sp_vocab.word2id[UNK], "tgt_unk_idx": sp_vocab.lf2id[UNK], 89 | "pad_src_idxs": [sp_vocab.word2id[PAD]], "pad_tgt_idxs": [sp_vocab.lf2id[PAD]], 90 | "src_emb_size": opt.emb_size, "tgt_emb_size": opt.emb_size, "hidden_dim": opt.hidden_dim, 91 | "num_layers": opt.num_layers, "cell": opt.cell, "dropout": opt.dropout, "init": opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio1 = load_embeddings(train_model.src_embed.embed, sp_vocab.word2id, opt.device) 103 | ratio2 = load_embeddings(train_model.tgt_embed.embed, sp_vocab.lf2id, opt.device) 104 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio1 * 100)) 105 | logger.info("%.2f%% token embeddings from pretrained vectors" % (ratio2 * 100)) 106 | else: 107 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 108 | train_model.load_model(model_path) 109 | logger.info("Load model from path %s" % (model_path)) 110 | 111 | # set loss function and optimizer 112 | loss_function = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction) 113 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 114 | 115 | ###################################### Training and Decoding ####################################### 116 | 117 | solver = SPSolver(train_model, sp_vocab, loss_function, optimizer, exp_path, logger, device=opt.device) 118 | if not opt.testing: 119 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 120 | solver.train_and_decode(train_dataset, dev_dataset, test_dataset, 121 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 122 | 
        max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best)
else:
    logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time()))))
    start_time = time.time()
    # decode returns exact-match accuracy on the test set
    accuracy = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'),
        opt.test_batchSize, beam=opt.beam, n_best=opt.n_best)
    logger.info('Evaluation cost: %.4fs\tAcc : %.4f' % (time.time() - start_time, accuracy))
--------------------------------------------------------------------------------
/utils/batch.py:
--------------------------------------------------------------------------------
#coding=utf8
import sys, os, random
import torch
from utils.constants import *

def get_minibatch(data_list, vocab, task='semantic_parsing', data_index=None, index=0, batch_size=16, device=None, **kargs):
    # Dispatch to the task-specific batching function registered in BATCH_FUNC.
    # `data_index` is a permutation of example indices (presumably shuffled by the
    # caller -- confirm against the solver) and `index` is the batch start offset.
    index = index % len(data_list)
    batch_data_list = [data_list[idx] for idx in data_index[index: index + batch_size]]
    return BATCH_FUNC[task](batch_data_list, vocab, device, **kargs)

def get_minibatch_sp(ex_list, vocab, device, copy=False, **kargs):
    # Build padded index tensors for a semantic-parsing batch (question -> logical form).
    inputs = [ex.question for ex in ex_list]
    lens = [len(ex) for ex in inputs]
    lens_tensor = torch.tensor(lens, dtype=torch.long, device=device)

    # pad questions to the max length in the batch; OOV words map to UNK
    max_len = max(lens)
    padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs]
    inputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_inputs]
    inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device)

    # wrap logical forms with BOS/EOS before padding so the decoder sees start/stop markers
    outputs = [ex.logical_form for ex in ex_list]
    bos_eos_outputs = [[BOS] + sent + [EOS] for sent in outputs]
    out_lens = [len(each) for each in bos_eos_outputs]
    max_out_len = max(out_lens)
    padded_outputs = [sent + [PAD] * (max_out_len - len(sent)) for sent in bos_eos_outputs]
    outputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_outputs]
27
| outputs_tensor = torch.tensor(outputs_idx, dtype=torch.long, device=device) 28 | out_lens_tensor = torch.tensor(out_lens, dtype=torch.long, device=device) 29 | 30 | if copy: # pointer network need additional information 31 | mapped_inputs = [ex.mapped_question for ex in ex_list] 32 | oov_list, copy_inputs = [], [] 33 | for sent in mapped_inputs: 34 | tmp_oov_list, tmp_copy_inputs = [], [] 35 | for idx, word in enumerate(sent): 36 | if word not in vocab.lf2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 37 | tmp_oov_list.append(word) 38 | tmp_copy_inputs.append( 39 | ( 40 | vocab.lf2id.get(word, vocab.lf2id[UNK]) if word in vocab.lf2id or word not in tmp_oov_list \ 41 | else len(vocab.lf2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 42 | ) 43 | ) 44 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 45 | oov_list.append(tmp_oov_list) 46 | copy_inputs.append(tmp_copy_inputs) 47 | 48 | copy_tokens = [ 49 | torch.cat([ 50 | torch.zeros(len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float)\ 51 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 52 | torch.zeros(max_len - len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float) 53 | ], dim=0) 54 | for each in copy_inputs 55 | ] 56 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 57 | 58 | dec_outputs = [ 59 | [ 60 | len(vocab.lf2id) + oov_list[idx].index(tok) 61 | if tok not in vocab.lf2id and tok in oov_list[idx] \ 62 | else vocab.lf2id.get(tok, vocab.lf2id[UNK]) 63 | for tok in sent 64 | ] + [vocab.lf2id[PAD]] * (max_out_len - len(sent)) 65 | for idx, sent in enumerate(bos_eos_outputs) 66 | ] 67 | dec_outputs_tensor = torch.tensor(dec_outputs, dtype=torch.long, device=device) 68 | else: 69 | dec_outputs_tensor, copy_tokens, oov_list = outputs_tensor, None, [] 70 | 71 | return inputs_tensor, lens_tensor, outputs_tensor, dec_outputs_tensor, out_lens_tensor, copy_tokens, oov_list, (inputs, 
outputs) 72 | 73 | def get_minibatch_qg(ex_list, vocab, device, copy=False, **kargs): 74 | raw_inputs = [ex.logical_form for ex in ex_list] 75 | inputs = [ex.mapped_logical_form for ex in ex_list] if copy else raw_inputs 76 | lens = [len(ex) for ex in inputs] 77 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 78 | 79 | max_len = max(lens) 80 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 81 | inputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_inputs] 82 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 83 | 84 | outputs = [ex.question for ex in ex_list] 85 | bos_eos_outputs = [[BOS] + sent + [EOS] for sent in outputs] 86 | out_lens = [len(each) for each in bos_eos_outputs] 87 | max_out_len = max(out_lens) 88 | padded_outputs = [sent + [PAD] * (max_out_len - len(sent)) for sent in bos_eos_outputs] 89 | outputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_outputs] 90 | outputs_tensor = torch.tensor(outputs_idx, dtype=torch.long, device=device) 91 | out_lens_tensor = torch.tensor(out_lens, dtype=torch.long, device=device) 92 | 93 | if copy: # pointer network need additional information 94 | oov_list, copy_inputs = [], [] 95 | for sent in inputs: 96 | tmp_oov_list, tmp_copy_inputs = [], [] 97 | for idx, word in enumerate(sent): 98 | if word not in vocab.word2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 99 | tmp_oov_list.append(word) 100 | tmp_copy_inputs.append( 101 | ( 102 | vocab.word2id.get(word, vocab.word2id[UNK]) if word in vocab.word2id or word not in tmp_oov_list \ 103 | else len(vocab.word2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 104 | ) 105 | ) 106 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 107 | oov_list.append(tmp_oov_list) 108 | copy_inputs.append(tmp_copy_inputs) 109 | 110 | copy_tokens = [ 111 | torch.cat([ 112 
| torch.zeros(len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float)\ 113 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 114 | torch.zeros(max_len - len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float) 115 | ], dim=0) 116 | for each in copy_inputs 117 | ] 118 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 119 | 120 | dec_outputs = [ 121 | [ 122 | len(vocab.word2id) + oov_list[idx].index(tok) 123 | if tok not in vocab.word2id and tok in oov_list[idx] \ 124 | else vocab.word2id.get(tok, vocab.word2id[UNK]) 125 | for tok in sent 126 | ] + [vocab.word2id[PAD]] * (max_out_len - len(sent)) 127 | for idx, sent in enumerate(bos_eos_outputs) 128 | ] 129 | dec_outputs_tensor = torch.tensor(dec_outputs, dtype=torch.long, device=device) 130 | else: 131 | dec_outputs_tensor, copy_tokens, oov_list = outputs_tensor, None, [] 132 | 133 | return inputs_tensor, lens_tensor, outputs_tensor, dec_outputs_tensor, out_lens_tensor, copy_tokens, oov_list, (raw_inputs, outputs) 134 | 135 | def get_minibatch_unlabeled_sp(ex_list, vocab, device, copy=False, **kargs): 136 | inputs = [ex.question for ex in ex_list] 137 | lens = [len(ex) for ex in inputs] 138 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 139 | 140 | max_len = max(lens) 141 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 142 | inputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_inputs] 143 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 144 | 145 | if copy: # pointer network need additional information 146 | mapped_inputs = [ex.mapped_question for ex in ex_list] 147 | oov_list, copy_inputs = [], [] 148 | for sent in mapped_inputs: 149 | tmp_oov_list, tmp_copy_inputs = [], [] 150 | for idx, word in enumerate(sent): 151 | if word not in vocab.lf2id and word not in tmp_oov_list and 
len(tmp_oov_list) < MAX_OOV_NUM: 152 | tmp_oov_list.append(word) 153 | tmp_copy_inputs.append( 154 | ( 155 | vocab.lf2id.get(word, vocab.lf2id[UNK]) if word in vocab.lf2id or word not in tmp_oov_list \ 156 | else len(vocab.lf2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 157 | ) 158 | ) 159 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 160 | oov_list.append(tmp_oov_list) 161 | copy_inputs.append(tmp_copy_inputs) 162 | 163 | copy_tokens = [ 164 | torch.cat([ 165 | torch.zeros(len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float)\ 166 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 167 | torch.zeros(max_len - len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float) 168 | ], dim=0) 169 | for each in copy_inputs 170 | ] 171 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 172 | else: 173 | copy_tokens, oov_list = None, [] 174 | 175 | return inputs_tensor, lens_tensor, copy_tokens, oov_list, inputs 176 | 177 | def get_minibatch_unlabeled_qg(ex_list, vocab, device, copy=False, **kargs): 178 | raw_inputs = [ex.logical_form for ex in ex_list] 179 | inputs = [ex.mapped_logical_form for ex in ex_list] if copy else raw_inputs 180 | lens = [len(ex) for ex in inputs] 181 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 182 | 183 | max_len = max(lens) 184 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 185 | inputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_inputs] 186 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 187 | 188 | if copy: # pointer network need additional information 189 | oov_list, copy_inputs = [], [] 190 | for sent in inputs: 191 | tmp_oov_list, tmp_copy_inputs = [], [] 192 | for idx, word in enumerate(sent): 193 | if word not in vocab.word2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 194 | 
tmp_oov_list.append(word) 195 | tmp_copy_inputs.append( 196 | ( 197 | vocab.word2id.get(word, vocab.word2id[UNK]) if word in vocab.word2id or word not in tmp_oov_list \ 198 | else len(vocab.word2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 199 | ) 200 | ) 201 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 202 | oov_list.append(tmp_oov_list) 203 | copy_inputs.append(tmp_copy_inputs) 204 | 205 | copy_tokens = [ 206 | torch.cat([ 207 | torch.zeros(len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float)\ 208 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 209 | torch.zeros(max_len - len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float) 210 | ], dim=0) 211 | for each in copy_inputs 212 | ] 213 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 214 | else: 215 | copy_tokens, oov_list = None, [] 216 | 217 | return inputs_tensor, lens_tensor, copy_tokens, oov_list, raw_inputs 218 | 219 | def get_minibatch_pseudo_sp(ex_list, vocab, device, copy=False, **kargs): 220 | inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch_sp(ex_list, vocab, device, copy=copy, **kargs) 221 | conf = torch.tensor([ex.conf for ex in ex_list], dtype=torch.float, device=device) 222 | return inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, conf 223 | 224 | def get_minibatch_pseudo_qg(ex_list, vocab, device, copy=False, **kargs): 225 | inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch_qg(ex_list, vocab, device, copy=copy, **kargs) 226 | conf = torch.tensor([ex.conf for ex in ex_list], dtype=torch.float, device=device) 227 | return inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, conf 228 | 229 | def get_minibatch_lm(ex_list, vocab, device, side='question', **kargs): 230 | if side == 'question': 231 | word2id = vocab.word2id 232 | inputs = [ex.question for ex in ex_list] 233 | else: 234 | word2id = 
vocab.lf2id 235 | inputs = [ex.logical_form for ex in ex_list] 236 | bos_eos_inputs = [[BOS] + sent + [EOS] for sent in inputs] 237 | lens = [len(each) for each in bos_eos_inputs] 238 | max_len = max(lens) 239 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in bos_eos_inputs] 240 | inputs_idx = [[word2id[w] if w in word2id else word2id[UNK] for w in sent] for sent in padded_inputs] 241 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 242 | lens = torch.tensor(lens, dtype=torch.long, device=device) 243 | return inputs_tensor, lens, inputs 244 | 245 | BATCH_FUNC = { 246 | "semantic_parsing": get_minibatch_sp, 247 | "question_generation": get_minibatch_qg, 248 | "unlabeled_semantic_parsing": get_minibatch_unlabeled_sp, 249 | "unlabeled_question_generation": get_minibatch_unlabeled_qg, 250 | "pseudo_semantic_parsing": get_minibatch_pseudo_sp, 251 | "pseudo_question_generation": get_minibatch_pseudo_qg, 252 | "language_model": get_minibatch_lm 253 | } 254 | -------------------------------------------------------------------------------- /utils/bleu.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import os, sys, nltk 3 | from nltk.translate.bleu_score import sentence_bleu, corpus_bleu 4 | from nltk.translate.bleu_score import SmoothingFunction 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | def get_bleu_score(candidate_list, references_list, method=0, weights=(0.25, 0.25, 0.25, 0.25)): 8 | ''' 9 | @args: 10 | if candidate_list is words list, e.g. ['which','flight'] 11 | references_list is list of words list, e.g. [ ['which','flight'] , ['what','flight'] ] 12 | calculate bleu score of one sentence 13 | if candidate_list is list of words list, e.g. [ ['which','flight'] , ['when','to','flight'] ] 14 | references_list is list of list of words list, e.g. 
15 | [ [ ['which','flight'] , ['what','flight'] ] , [ ['when','to','flight'] , ['when','to','go'] ] ] 16 | calculate bleu score of multiple sentences, a whole corpus 17 | method(int): chencherry smoothing methods choice 18 | ''' 19 | chencherry = SmoothingFunction() 20 | if len(candidate_list) == 0: 21 | raise ValueError('[Error]: there is no candidate sentence!') 22 | if type(candidate_list[0]) == str: 23 | return sentence_bleu( 24 | references_list, 25 | candidate_list, 26 | weights, 27 | eval('chencherry.method' + str(method)) 28 | ) 29 | else: 30 | return corpus_bleu( 31 | references_list, 32 | candidate_list, 33 | weights, 34 | eval('chencherry.method' + str(method)) 35 | ) -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | BOS = '' 3 | EOS = '' 4 | PAD = '' 5 | UNK = '' 6 | MAX_DECODE_LENGTH = 100 7 | MAX_OOV_NUM = 50 8 | VECTORCACHE = lambda emb_dim: 'data/.cache/glove.6B.' 
#coding=utf8
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from utils.domain.domain_base import Domain
from utils.domain.atis_evaluator import ATISEvaluator

class ATISDomain(Domain):
    """Domain for ATIS lambda-calculus logical forms.

    Validity is judged structurally (bracket matching + lisp-tree parsing) plus a
    type-consistency check from ATISEvaluator; denotations are never executed
    (self.denotation is False).
    """

    def __init__(self):

        self.dataset = 'atis'
        self.denotation = False    # compare logical forms directly, not denotations
        self.evaluator = ATISEvaluator()

    def to_lisp_tree(self, toks):
        '''
        Parse a flat token list into a recursive lisp tree; returns None on failure.
        input(list): ['lambda', '$0', 'e', '(', 'flight', '$0', ')']
        return(recursive list): ['lambda', '$0', 'e', ['flight', '$0']]
        '''
        def recurse(i):
            # Parse the (sub)expression starting at token i.
            # Returns (subtree, index of the first token after it).
            if toks[i] == '(':
                subtrees = []
                j = i + 1
                while True:
                    subtree, j = recurse(j)
                    subtrees.append(subtree)
                    if toks[j] == ')':
                        return subtrees, j + 1
            else:
                return toks[i], i+1

        try:
            lisp_tree, final_ind = recurse(0)
            return lisp_tree
        except Exception as e:
            # Unbalanced brackets / index overrun on malformed input: signal with None
            return None

    def sort_args(self, lf):
        # Canonicalize a logical form (token list) by sorting the operands of
        # and/or subtrees, so logically equivalent forms compare equal as strings.
        lisp_tree = self.to_lisp_tree(lf)
        if lisp_tree is None: # failed to convert to logical tree
            return ' '.join(lf)

        def recurse(node): # Post-order traversal, sort and/or subtrees
            if isinstance(node, str):
                return
            for child in node:
                recurse(child)
            if node[0] in ('_and', '_or', 'and', 'or'):
                node[1:] = sorted(node[1:], key=lambda x: str(x))

        recurse(lisp_tree)

        def tree_to_str(node):
            # Flatten the (sorted) tree back into a bracketed token string
            if isinstance(node, str):
                return node
            else:
                return '( %s )' % ' '.join(tree_to_str(child) for child in node)

        return tree_to_str(lisp_tree)

    def normalize(self, lf_list):
        # lf_list: list of token lists; returns canonicalized strings
        sorted_lf_list = [self.sort_args(lf) for lf in lf_list]
        return sorted_lf_list

    def is_valid(self, ans_list):
        # Score each logical form string in [0, 1]:
        # 0.5 for being parseable (balanced brackets + lisp tree),
        # plus 0.5 weighted by the evaluator's type-consistency result.

        def bracket_matching(lf):
            left = 0
            for each in lf:
                if each == '(':
                    left += 1
                elif each == ')':
                    left -= 1
                if left < 0:
                    return 0.0
            return 1.0 if left == 0 else 0.0

        ans_list = [[i.strip() for i in lf.split(' ') if i.strip() != ''] for lf in ans_list]
        bracket_signal = list(map(bracket_matching, ans_list))
        lisp_trees = [self.to_lisp_tree(each) if bracket_signal[idx] == 1.0 else None for idx, each in enumerate(ans_list)]
        type_consistency = [self.evaluator.eval(each) if each is not None else 0.0 for each in lisp_trees]
        return list(map(lambda x, y: (0.5 if x is not None else 0.0) + 0.5 * y, lisp_trees, type_consistency))
#coding=utf8
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from utils.constants import BOS, EOS, PAD, UNK
from utils.bleu import get_bleu_score

class Domain():
    """Base class for dataset-specific evaluation domains (ATIS, GEO, Overnight).

    Subclasses set self.dataset and self.denotation, and override normalize /
    obtain_denotations / is_valid as needed.
    """

    def __init__(self):
        super(Domain, self).__init__()
        self.dataset = None        # dataset name, filled in by subclasses
        self.denotation = False    # True if accuracy is computed on executed denotations

    @classmethod
    def from_dataset(cls, dataset):
        # Factory: pick the concrete Domain subclass for a dataset name.
        # Imports are local to avoid circular imports with the subclasses.
        if dataset == 'atis':
            from utils.domain.domain_atis import ATISDomain
            return ATISDomain()
        elif dataset == 'geo':
            from utils.domain.domain_geo import GEODomain
            return GEODomain()
        else:
            from utils.domain.domain_overnight import OvernightDomain
            return OvernightDomain(dataset)

    def reverse(self, idx_list, vocab, end_mask=EOS, oov_list=[],
            special_list=[BOS, EOS, PAD]):
        '''
        Change idx list to token list without special tokens.
        @args:
            1. idx_list: list of idx list, not tensor
            2. vocab: idx to token list
            3. end_mask: stop parsing when meets this token
            4. oov_list: out of tgt vocab words, but in src inputs; when given,
               idx_list is assumed to hold n_best hypotheses per source example
            5. special_list: remove these tokens in sequence, list of symbols
        @return:
            token list
        '''
        # ids >= len(vocab) index into the per-example OOV list (copy mechanism);
        # int(idx / n_best) maps a hypothesis back to its source example
        n_best = len(idx_list) / len(oov_list) if oov_list else None
        seq = [
            [
                oov_list[int(idx / n_best)][tok - len(vocab)] if tok >= len(vocab) else vocab[tok] for tok in tokens
            ]
            for idx, tokens in enumerate(idx_list)
        ] if oov_list else [[vocab[tok] if tok < len(vocab) else UNK for tok in tokens] for tokens in idx_list]

        def trim(s, t):
            # Keep tokens up to (excluding) the first occurrence of t
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w)
            return sentence

        result = [trim(ex, end_mask) for ex in seq]

        def filter_special(tok):
            return tok not in special_list

        result = [list(filter(filter_special, ex)) for ex in result]
        return result

    def compare_question(self, predictions, references):
        """
        predictions and references should be list of token list;
        returns a sentence-level BLEU score per prediction (references are
        repeated to cover n_best predictions per source).
        """
        n_best = int(len(predictions) / len(references))
        references = [[ref] for ref in references for _ in range(n_best)]
        bleu_list = list(map(get_bleu_score, predictions, references)) # sentence-level bleu score
        return bleu_list

    def compare_logical_form(self, predictions, references, pick=True):
        """
        predictions and references should be list of token list
        pick(bool): pick the first prediction without syntax or execution error if n_best > 1
        Returns a 1.0/0.0 exact-match (or denotation-match) list.
        """
        predictions = self.normalize(predictions)
        references = self.normalize(references)
        n_best = int(len(predictions) / len(references))
        if self.denotation:
            # Execute everything in one batch, then split back
            all_lf = predictions + references
            denotations = self.obtain_denotations(all_lf)
            predictions, references = denotations[:len(predictions)], denotations[len(predictions):]
        if pick:
            predictions, _ = self.pick_predictions(predictions, n_best)
        else:
            references = [each for each in references for _ in range(n_best)]
        return list(map(lambda x, y: 1.0 if x == y else 0.0, predictions, references))

    def normalize(self, lf_list):
        """
        Normalize each logical form, at least changes token list into string list
        """
        return [' '.join(lf) for lf in lf_list]

    def obtain_denotations(self, lf_list):
        """
        Obtain denotations for each logical form (identity by default;
        subclasses run the real executor)
        """
        return lf_list

    def pick_predictions(self, pred_ans, n_best=1):
        # From each group of n_best hypotheses, keep the first valid one
        # (falling back to the group's first hypothesis if none is valid).
        # Returns (picked answers, their indices in pred_ans).
        if n_best == 1:
            return pred_ans, [i for i in range(len(pred_ans))]
        flags = self.is_valid(pred_ans)
        batches = int(len(pred_ans) / n_best)
        return_ans, return_idx = [], []
        for idx in range(batches):
            for j in range(n_best):
                if int(flags[idx * n_best + j]) == 1:
                    return_ans.append(pred_ans[idx * n_best + j])
                    return_idx.append(idx * n_best + j)
                    break
            else:
                return_ans.append(pred_ans[idx * n_best])
                return_idx.append(idx * n_best)
        return return_ans, return_idx

    def is_valid(self, ans_list):
        """
        Check whether ans is syntax or semantic invalid
        ans_list(str list): denotation list or logical form list
        """
        # BUG FIX: the original did `raise [1.0 ...]`, which is a TypeError in
        # Python 3 (only BaseException subclasses can be raised). The default
        # behaviour is clearly to treat every answer as valid, i.e. return.
        return [1.0 for _ in range(len(ans_list))]
remove unnecessary space, except spaces in entities 22 | """ 23 | toks, quoted_toks, in_quotes = [], [], False 24 | for t in lf: 25 | if in_quotes: 26 | if t == "'": # entity ending 27 | toks.append('"%s"' % ' '.join(quoted_toks)) 28 | in_quotes, quoted_toks = False, [] 29 | else: 30 | quoted_toks.append(t) 31 | else: 32 | if t == "'": # entity start 33 | in_quotes = True 34 | else: 35 | if len(t) > 1 and t.startswith('_'): # predicate remove prefix _ 36 | toks.append(t[1:]) 37 | else: 38 | toks.append(t) 39 | return ''.join(toks) 40 | return [format_geo(lf) for lf in lf_list] 41 | 42 | def obtain_denotations(self, lf_list): 43 | tf = tempfile.NamedTemporaryFile('w+t', encoding='utf8', suffix='.dlog') 44 | tf_lines = ['_parse([query], %s).' % lf for lf in lf_list] 45 | for line in tf_lines: 46 | tf.write(line + '\n') 47 | tf.flush() 48 | msg = subprocess.check_output(['evaluator/geoquery', tf.name]) 49 | msg = msg.decode('utf8') 50 | tf.close() 51 | 52 | def get_denotation(line): 53 | m = re.search('\{[^}]*\}', line) 54 | if m: 55 | return m.group(0) 56 | else: 57 | return line.strip() 58 | 59 | denotations = [ 60 | get_denotation(line) 61 | for line in msg.split('\n') 62 | if line.startswith(' Example') 63 | ] 64 | return denotations 65 | 66 | def is_valid(self, ans_list): 67 | return list(map(lambda ans: 0.0 if 'FAILED' in ans or 'Join failed syntactically' in ans else 1.0, ans_list)) 68 | -------------------------------------------------------------------------------- /utils/domain/domain_overnight.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import sys, os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 4 | from utils.domain.domain_base import Domain 5 | import tempfile 6 | import subprocess 7 | import re 8 | 9 | class OvernightDomain(Domain): 10 | 11 | def __init__(self, dataset): 12 | self.dataset = dataset 13 | self.denotation = True 14 | 15 | def 
normalize(self, lf_list): 16 | lf_list = [' '.join(lf) for lf in lf_list] 17 | 18 | def format_overnight(lf): 19 | replacements = [ 20 | ('(', ' ( '), # make sure ( and ) must have blank space around 21 | (')', ' ) '), 22 | ('! ', '!'), 23 | ('SW', 'edu.stanford.nlp.sempre.overnight.SimpleWorld'), 24 | ] 25 | for a, b in replacements: 26 | lf = lf.replace(a, b) 27 | # remove redundant blank spaces 28 | lf = re.sub(' +', ' ', lf) 29 | return lf.strip() 30 | 31 | return [format_overnight(lf) for lf in lf_list] 32 | 33 | def obtain_denotations(self, lf_list): 34 | tf = tempfile.NamedTemporaryFile('w+t', encoding='utf8', suffix='.examples') 35 | for line in lf_list: 36 | tf.write(line + '\n') 37 | tf.flush() 38 | msg = subprocess.check_output(['evaluator/overnight', self.dataset, tf.name]) 39 | msg = msg.decode('utf8') 40 | tf.close() 41 | denotations = [ 42 | line.split('\t')[1] for line in msg.split('\n') 43 | if line.startswith('targetValue\t') 44 | ] 45 | return denotations 46 | 47 | def is_valid(self, ans_list): 48 | return list(map(lambda ans: 0.0 if 'BADJAVA' in ans or 'ERROR' in ans or ans == 'null' else 1.0, ans_list)) 49 | -------------------------------------------------------------------------------- /utils/example.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import numpy as np 3 | import sys, os 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 5 | from utils.lexicon import Lexicon 6 | from utils.domain.domain_base import Domain 7 | 8 | def split_dataset(dataset, split_ratio=1.0): 9 | assert split_ratio >= 0. 
def split_dataset(dataset, split_ratio=1.0):
    # Randomly split dataset into two disjoint parts: the first holding
    # int(len * split_ratio) examples, the second the remainder.
    assert split_ratio >= 0. and split_ratio <= 1.0
    index = np.arange(len(dataset))
    np.random.shuffle(index)
    splt = int(len(dataset) * split_ratio)
    first = [dataset[idx] for idx in index[:splt]]
    second = [dataset[idx] for idx in index[splt:]]
    return first, second

class Example():
    """One (question, logical form) pair plus entity-mapped variants for pointer networks.

    Example.set_domain(dataset) must be called once before constructing instances:
    it binds the lexicon, the Domain evaluator and the data file paths to the class.
    """

    __slots__ = ('question', 'logical_form', "mapped_question", "mapped_logical_form", "conf")

    @classmethod
    def set_domain(cls, dataset):
        # Bind dataset-specific resources to the class (shared by all instances)
        cls.dataset = dataset # dataset name
        cls.db = Lexicon(dataset)
        cls.domain = Domain.from_dataset(dataset) # class Domain object
        if dataset in ['geo', 'atis']:
            # geo/atis ship explicit train/dev/test splits
            cls.file_paths = [
                os.path.join('data', dataset, dataset + '_train.tsv'),
                os.path.join('data', dataset, dataset + '_dev.tsv'),
                os.path.join('data', dataset, dataset + '_test.tsv')
            ]
            cls.extra_path = os.path.join('data', dataset, dataset + '_extra.tsv')
        else: #Overnight
            # Overnight subdomains have only train/test; dev is split off later
            cls.file_paths = [
                os.path.join('data', 'overnight', dataset + '_train.tsv'),
                os.path.join('data', 'overnight', dataset + '_test.tsv')
            ]
            cls.extra_path = os.path.join('data', 'overnight', dataset + '_extra.tsv')

    def __init__(self, question='', logical_form='', conf=1.0):
        """
        @args:
            question: space-separated natural language question
            logical_form: space-separated logical form
            conf: confidence weight (1.0 for gold data; pseudo-labeled data may be lower)
        """
        super(Example, self).__init__()
        # Tokenize on single spaces, dropping empty tokens
        self.question = [each for each in question.split(' ') if each != '']
        self.logical_form = [each for each in logical_form.split(' ') if each != '']
        # Entity-mapped views consumed by the copy/pointer mechanism
        self.mapped_question = Example.db.entity_mapping(self.question)
        self.mapped_logical_form = Example.db.reverse_entity_mapping(self.logical_form, self.question)
        self.conf = conf

    @classmethod
    def load_dataset(cls, choice='train'):
        """
        return example list of train, test or extra
        """
        if choice == 'train':
            if len(cls.file_paths) == 2:
                # no dev dataset, split train dataset
                train_dataset = cls.load_dataset_from_file(cls.file_paths[0])
                train_dataset, dev_dataset = split_dataset(train_dataset, split_ratio=0.8)
            else:
                assert len(cls.file_paths) == 3
                train_dataset = cls.load_dataset_from_file(cls.file_paths[0])
                dev_dataset = cls.load_dataset_from_file(cls.file_paths[1])
            return train_dataset, dev_dataset
        elif choice == 'test':
            test_dataset = cls.load_dataset_from_file(cls.file_paths[-1])
            return test_dataset
        else:
            extra_dataset = cls.load_dataset_from_file(cls.extra_path)
            return extra_dataset

    @classmethod
    def load_dataset_from_file(cls, path):
        # Read a tab-separated "question<TAB>logical form" file, skipping blank lines
        ex_list = []
        with open(path, 'r') as infile:
            for line in infile:
                line = line.strip()
                if line == '': continue
                q, lf = line.split('\t')
                ex_list.append(cls(q.strip(), lf.strip()))
        return ex_list
-------------------------------------------------------------------------------- /utils/gpu.py:
#coding=utf8
import os, sys, math
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# special case in our remote server, just ignore
if '/cm/local/apps/cuda/libs/current/pynvml' in sys.path:
    sys.path.remove('/cm/local/apps/cuda/libs/current/pynvml')
import gpustat
import torch

def set_torch_device(deviceId):
    # Simplified version of gpu selection:
    # deviceId < 0 selects CPU, otherwise CUDA device with that index
    if deviceId < 0:
        device = torch.device("cpu")
        print('Use CPU ...')
    else:
        assert torch.cuda.device_count() >= deviceId + 1
        device = torch.device("cuda:%d" % (deviceId))
        print('Use GPU with index %d' % (deviceId))
        # os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # used when debug
        ## These two sentences are used to ensure reproducibility with cudnnbacken
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    return device

if __name__ == '__main__':

    set_torch_device(0)
#coding=utf8
'''
Construct exp directory according to hyper parameters
'''
import os

EXP_PATH = 'exp'


def _rnn_tag(options):
    """Shared encoder/decoder architecture part of an exp name (cell/emb/hidden/dropout)."""
    return ('cell_%s__' % (options.cell)
            + 'emb_%s__' % (options.emb_size)
            + 'hidden_%s_x_%s__' % (options.hidden_dim, options.num_layers)
            + 'dropout_%s__' % (options.dropout))


def _optim_tag(options):
    """Shared optimization part of an exp name (reduction/lr/max-norm/l2/batch size)."""
    return ('reduce_%s__' % (options.reduction)
            + 'lr_%s__' % (options.lr)
            + 'mn_%s__' % (options.max_norm)
            + 'l2_%s__' % (options.l2)
            + 'bsize_%s__' % (options.batchSize))


def _pretrained_tag(options):
    """Architecture tags of the pretrained sp/qg models, inferred from their checkpoint paths."""
    # A pointer (copy) model's exp dir contains the 'copy__' marker (see hyperparam_seq2seq).
    tag = 'sp_attnptr__' if 'copy__' in options.read_sp_model_path else 'sp_attn__'
    tag += 'qg_attnptr__' if 'copy__' in options.read_qg_model_path else 'qg_attn__'
    return tag


def hyperparam_seq2seq(options):
    """Hyperparam string for semantic parsing and question generation."""
    task_path = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s' % (options.labeled)

    exp_name = 'copy__' if options.copy else ''
    exp_name += _rnn_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s' % (options.n_best)
    return os.path.join(EXP_PATH, task_path, dataset_path, ratio, exp_name)


def hyperparam_lm(options):
    """Hyperparam string for the question/logical-form language models."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = '%s__labeled_%s' % (options.side, options.labeled)

    exp_name = _rnn_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s' % (options.max_epoch)
    exp_name += '__decTied' if options.decoder_tied else ''
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)


def hyperparam_pseudo_method(options):
    """Hyperparam string for the pseudo-labeling baseline."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s__unlabeled_%s' % (options.labeled, options.unlabeled)
    ratio += '__extra' if options.extra else ''

    exp_name = _pretrained_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s__' % (options.n_best)
    exp_name += 'discount_%s__method_%s' % (options.discount, options.method)
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)


def hyperparam_dual_learning(options):
    """Hyperparam string for dual learning experiments."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s__unlabeled_%s' % (options.labeled, options.unlabeled)
    ratio += '__extra' if options.extra else ''

    exp_name = _pretrained_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s__' % (options.n_best)
    exp_name += 'cycle_%s__' % (options.cycle)
    exp_name += 'sample_%s__alpha_%s__beta_%s' % (options.sample, options.alpha, options.beta)
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)
#coding=utf8
import random, os
import collections
import itertools

class Lexicon():
    """
    A Lexicon class used for entity mapping and reverse entity mapping (in pointer network)

    1. Entity mapping: mapping word phrase into entity, replaced after copy
        ['in', 'which', 'seasons', 'kob', 'bryant', 'made', '3', 'blocks'] => (kob bryant, en.player.kobe_bryant)
        ==> ['in', 'which', 'seasons', 'en.player.kobe_bryant', 'en.player.kobe_bryant', 'made', '3', 'blocks']
        For word phrases with multiple choices, entity that matches longer spans takes precedence (Longest Match First)

    2. Reverse entity mapping: transform input logical form entities into natural phrases, replaced before copy (actually before feeding into network)
        ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'flight', '$0', ')', '(', 'during_day', '$0', 'late:pd', ')'] => (late:pd, late flight|late|night)
        ==> ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'flight', '$0', ')', '(', 'during_day', '$0', 'late', 'flight', ')']
        Randomly select one natural phrase from multiple choices if question word is not available
        Otherwise, use exactly the longest word phrase in the question.
        Attention: Remember to add late, flight to logical form vocabulary
    """
    def __init__(self, dataset):
        super(Lexicon, self).__init__()
        # phrase2entity: noun phrase -> entity id; entity2phrase: entity id -> phrases (longest first)
        self.phrase2entity = collections.OrderedDict()
        self.entity2phrase = collections.OrderedDict()
        self.seen_words = set()
        self._load_lexicon(dataset)

    def _load_lexicon(self, dataset):
        """Read '<phrase> :- NP : <entity>' entries from the dataset's lexicon file."""
        entries = []
        if dataset in ['atis', 'geo']:
            lexicon_path = os.path.join('data', dataset, dataset + '_lexicon.txt')
        else:
            lexicon_path = os.path.join('data', 'overnight', dataset + '_lexicon.txt')
        print('Start load lexicon from file %s ...' % (lexicon_path))
        with open(lexicon_path, 'r') as f:
            for line in f:
                line = line.strip()
                if line == '': continue
                x, y = line.split(' :- NP : ')
                entries.append((x.strip(), y.strip()))
        self._add_entries(entries)

    def _add_entries(self, entries):
        """Index (phrase, entity) pairs into both mapping tables."""
        for name, entity in entries:
            if entity not in self.entity2phrase:
                self.entity2phrase[entity] = [name]
            elif name not in self.entity2phrase[entity]:
                self.entity2phrase[entity].append(name)
            if name in self.phrase2entity:
                if self.phrase2entity[name] != entity: # we do not handle entity disambiguation
                    # FIX: was self.entries[name] — that attribute never exists, so any
                    # lexicon collision crashed with AttributeError instead of warning.
                    print('Collision detected: %s -> %s, %s' % (name, self.phrase2entity[name], entity))
                continue
            # Update self.seen_words
            for w in name.split(' '):
                self.seen_words.add(w)
            self.phrase2entity[name] = entity
        for entity in self.entity2phrase: # sorted according to length of noun phrases
            self.entity2phrase[entity] = sorted(self.entity2phrase[entity], key=lambda x: len(x), reverse=True)

    def entity_mapping(self, words):
        """
        @args:
            words: a list of words
        @return:
            mapped_words: a list of words, where words[i] is replaced with entity if available
        """
        entities = ['' for i in range(len(words))]
        # All (i, j) spans, longest spans first (x[0] - x[1] is most negative for long spans)
        index_pairs = sorted(list(itertools.combinations(range(len(words) + 1), 2)),
            key=lambda x: x[0] - x[1])
        ret_entries = []

        for i, j in index_pairs:
            # Longest match first
            if any(x for x in entities[i: j]): continue
            span = ' '.join(words[i: j])
            if span in self.phrase2entity:
                entity = self.phrase2entity[span]
                for k in range(i, j):
                    entities[k] = entity
                ret_entries.append(((i, j), entity))
        mapped_words = [words[idx] if not item else item for idx, item in enumerate(entities)]
        return mapped_words

    def reverse_entity_mapping(self, tokens, words=None):
        """
        @args:
            tokens: a list of logical form tokens
            words: a list of words if available
        @return:
            mapped_tokens: a list of tokens, where tokens[i] is replaced with noun phrases if available,
                prefer to use raw noun phrase in words if available
        """
        entities = ['' for each in tokens]
        words = ' '.join(words) if words and words != ['none'] else None
        for idx, tok in enumerate(tokens):
            if tok in self.entity2phrase:
                if words:
                    choices = self.entity2phrase[tok]
                    # choices are sorted longest-first, so the first phrase found in the
                    # question is the longest matching one
                    among_words = list(filter(lambda item: item in words, choices))
                    if len(among_words) > 0:
                        entities[idx] = among_words[0]
                    else:
                        entities[idx] = random.choice(self.entity2phrase[tok])
                else:
                    entities[idx] = random.choice(self.entity2phrase[tok])
        mapped_words = [tokens[idx] if not item else item for idx, item in enumerate(entities)]
        # re-split because a phrase may contain several words
        return ' '.join(mapped_words).split(' ')
#coding=utf8
import sys, logging

def set_logger(exp_path, testing=False):
    """Build the shared 'mylogger' writing to exp_path/log_{test,train}.txt and stdout."""
    formatter = logging.Formatter('%(asctime)s - %(message)s') #('%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('mylogger')
    logger.setLevel(logging.DEBUG)
    log_name = 'log_test.txt' if testing else 'log_train.txt'
    file_handler = logging.FileHandler('%s/%s' % (exp_path, log_name), mode='w')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    return logger

#coding=utf8
'''
Set loss function, allow different confidence for different training samples
'''
import torch
import torch.nn as nn

def set_loss_function(reduction='sum', ignore_index=-100):
    """Return a MyNLLLoss configured with the given reduction and ignore_index."""
    return MyNLLLoss(reduction=reduction, ignore_index=ignore_index)

class MyNLLLoss(nn.Module):
    """NLLLoss over (bsize, seq_len, voc_size) log-probs with optional per-sample confidence weights."""

    def __init__(self, *args, **kargs):
        super(MyNLLLoss, self).__init__()
        # Reduce per-element losses ourselves so each sample can be confidence-weighted.
        self.real_reduction = kargs.pop('reduction', 'sum')
        kargs['reduction'] = 'none'
        self.loss_function = nn.NLLLoss(*args, **kargs)

    def forward(self, inputs, targets, lens=None, conf=None):
        """inputs: (bsize, seq_len, voc_size) log-probs; targets: (bsize, seq_len) ids.

        lens is required only for 'mean' reduction (per-token normalization);
        conf defaults to uniform weight 1 for every sample.
        """
        if conf is None:
            conf = torch.ones(inputs.size(0), dtype=torch.float).to(inputs.device)
        bsize, seq_len, voc_size = list(inputs.size())
        flat_loss = self.loss_function(inputs.contiguous().view(-1, voc_size), targets.contiguous().view(-1))
        per_sample = flat_loss.contiguous().view(bsize, seq_len).sum(dim=1)
        if self.real_reduction == 'sum':
            return (per_sample * conf).sum()
        per_sample = per_sample / lens.float()
        return (per_sample * conf).sum() / conf.sum()
#coding=utf8
'''
Set optimizer for train_model
'''
import torch
import torch.nn as nn
from torch.optim import Adam

def set_optimizer(*args, lr=1e-3, l2=1e-5, max_norm=5):
    """Build a MyAdam optimizer over the trainable parameters of the given model(s).

    Bias parameters are exempt from weight decay; every other trainable
    parameter gets weight_decay=l2.
    @args:
        *args: one or more nn.Module instances
    @return:
        MyAdam optimizer with two parameter groups (decayed, non-decayed)
    """
    params = []
    for train_model in args:
        params += list(train_model.named_parameters())
    # FIX: dict.fromkeys deduplicates parameters shared between models while
    # preserving first-seen order; the previous list(set(...)) yielded a
    # nondeterministic parameter order across runs, undermining reproducibility.
    decay_params = list(dict.fromkeys(p for n, p in params if p.requires_grad and 'bias' not in n))
    no_decay_params = list(dict.fromkeys(p for n, p in params if p.requires_grad and 'bias' in n))
    grouped_params = [
        {'params': decay_params, 'weight_decay': l2},
        {'params': no_decay_params, 'weight_decay': 0.0}
    ]
    optimizer = MyAdam(grouped_params, lr=lr, max_norm=max_norm)
    return optimizer

class MyAdam(Adam):
    """
    Add clip_grad_norm_ for Optimizer Adam
    """
    def __init__(self, *args, **kargs):
        # max_norm <= 0 disables gradient clipping
        self.max_norm = kargs.pop('max_norm', -1)
        super(MyAdam, self).__init__(*args, **kargs)

    def step(self, *args, **kargs):
        """Clip each parameter group's gradient norm, then take a normal Adam step."""
        if self.max_norm > 0:
            for group in self.param_groups:
                torch.nn.utils.clip_grad_norm_(group['params'], self.max_norm)
        super(MyAdam, self).step(*args, **kargs)

#coding=utf8
import random, torch
import numpy as np

def set_random_seed(random_seed=999, device='cuda'):
    """Seed python, torch (CPU/CUDA) and numpy RNGs for reproducibility."""
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    if torch.cuda.is_available():
        if device != 'cuda':
            print("WARNING: You have a CUDA device, so you should probably run with --deviceId [1|2|3]")
        else:
            torch.cuda.manual_seed(random_seed)
    np.random.seed(random_seed)

#coding=utf8

class Solver():
    """Base class for all task solvers; stores the shared training components."""

    def __init__(self, model, vocab, loss_function, optimizer, exp_path, logger, device=None, **kargs):
        super(Solver, self).__init__()
        self.model = model
        self.vocab = vocab
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.exp_path = exp_path
        self.logger = logger
        self.device = device

    def decode(self, data_inputs, output_path, test_batchSize, beam_size=5, n_best=1):
        # NOTE(review): subclasses override this with keyword `beam=` instead of
        # `beam_size=` — a signature inconsistency worth unifying eventually.
        raise NotImplementedError

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam_size=5, n_best=1):
        raise NotImplementedError
#coding=utf8
import os, sys, time, gc
import numpy as np
import torch
from utils.constants import *
from utils.batch import get_minibatch
from utils.example import Example
from utils.bleu import get_bleu_score
from utils.solver.solver_base import Solver

class DualLearningSolver(Solver):
    '''
    For Dual Learning Solver

    Trains the coupled semantic parsing (sp) and question generation (qg) models
    with dual-learning cycles on unlabeled data plus standard supervised batches.
    self.vocab / self.device / self.loss_function are dicts keyed by 'sp' / 'qg'.
    '''
    def __init__(self, *args, **kargs):
        super(DualLearningSolver, self).__init__(*args, **kargs)
        # Best metrics seen so far for each direction (tracked independently).
        self.best_result = {
            "iter_sp": 0, "dev_acc": 0., "test_acc": 0.,
            "iter_qg": 0, "dev_bleu": 0., "test_bleu": 0.
        }

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Evaluate both directions on data_inputs; write predictions to output_path.

        Returns (semantic-parsing accuracy, question-generation corpus BLEU).
        """
        data_index = np.arange(len(data_inputs))
        nsentences = len(data_index)
        domain = Example.domain
        total, candidate_list, references_list = [], [], []
        ########################### Evaluation Phase ############################
        with open(output_path, 'w') as of:
            self.model.eval()
            # Pass 1: semantic parsing (question -> logical form), exact-match accuracy.
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab['sp'], task='semantic_parsing', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                ############################ Forward Model ############################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab['sp'].lf2id, copy_tokens, task='semantic_parsing', beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Flatten the n_best hypotheses of every example into one list.
                predictions = [pred for each in predictions for pred in each]
                predictions = domain.reverse(predictions, self.vocab['sp'].id2lf, oov_list=oov_list)
                accuracy = domain.compare_logical_form(predictions, raw_outputs, pick=True)
                total.extend(accuracy)
                ############################ Write result to file ############################
                for idx in range(len(raw_inputs)):
                    of.write("Utterance: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    for i in range(n_best):
                        of.write("Pred" + str(i) + ": " + ' '.join(predictions[n_best * idx + i]) + '\n')
                    of.write("Correct: " + ("True" if accuracy[idx] == 1 else "False") + '\n\n')

            of.write('=' * 50 + '\n' + '=' * 50 + '\n\n')

            # Pass 2: question generation (logical form -> question), BLEU.
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab['qg'], task='question_generation', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                ########################## Beam Search/Greed Decode #######################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab['qg'].word2id, copy_tokens, task='question_generation', beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Keep only the top hypothesis per example for BLEU.
                predictions = [each[0] for each in predictions]
                predictions = domain.reverse(predictions, self.vocab['qg'].id2word, oov_list=oov_list)
                bleu_scores = domain.compare_question(predictions, raw_outputs)
                candidate_list.extend(predictions)
                references_list.extend([[ref] for ref in raw_outputs])
                ############################# Writing Result to File ###########################
                for idx in range(len(raw_inputs)):
                    of.write("LogicalForm: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    of.write("Pred0: " + ' '.join(predictions[idx]) + '\n')
                    of.write("Bleu: " + str(bleu_scores[idx]) + '\n\n')
            ########################### Calculate accuracy ###########################
            acc = sum(total) / float(len(total))
            avg_bleu = get_bleu_score(candidate_list, references_list)
            of.write('Overall accuracy: %.4f | Overall bleu score: %.4f' % (acc, avg_bleu))
        return acc, avg_bleu

    def train_and_decode(self, labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset,
            batchSize, test_batchSize, cycle='sp+qg', max_epoch=100, beam=5, n_best=1):
        """Dual-learning training loop.

        Per batch: (1) cycle losses starting from sp and/or qg on unlabeled data
        (controlled by `cycle`), (2) supervised losses on labeled data for both
        models; gradients from all parts accumulate before a single optimizer step.
        """
        sp_unlabeled_train_index = np.arange(len(q_unlabeled_train_dataset))
        qg_unlabeled_train_index = np.arange(len(lf_unlabeled_train_dataset))
        labeled_train_index = np.arange(len(labeled_train_dataset))
        # Iterate as long as the largest of the three data sources per epoch.
        nsentences = max([len(q_unlabeled_train_dataset), len(lf_unlabeled_train_dataset), len(labeled_train_dataset)])
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(sp_unlabeled_train_index)
            np.random.shuffle(qg_unlabeled_train_index)
            np.random.shuffle(labeled_train_index)
            losses = { 'sp': [], 'qg': [] }
            self.model.train()
            for j in range(0, nsentences, batchSize):
                self.model.zero_grad()

                ''' Cycle start from Semantic Parsing '''
                if 'sp' in cycle:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, copy_tokens, oov_list, raw_in = get_minibatch(q_unlabeled_train_dataset, self.vocab['sp'], task='unlabeled_semantic_parsing',
                        data_index=sp_unlabeled_train_index, index=j, batch_size=batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                    ######################## Forward Model ##########################
                    sp_loss, qg_loss = self.model(inputs, lens, copy_tokens, oov_list, raw_in, start_from='semantic_parsing')
                    losses['sp'].append(sp_loss.item())
                    losses['qg'].append(qg_loss.item())
                    sp_loss.backward()
                    qg_loss.backward()

                ''' Cycle start from Question Generation '''
                if 'qg' in cycle:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, copy_tokens, oov_list, raw_in = get_minibatch(lf_unlabeled_train_dataset, self.vocab['qg'], task='unlabeled_question_generation',
                        data_index=qg_unlabeled_train_index, index=j, batch_size=batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                    ########################### Forward Model ########################
                    sp_loss, qg_loss = self.model(inputs, lens, copy_tokens, oov_list, raw_in, start_from='question_generation')
                    losses['sp'].append(sp_loss.item())
                    losses['qg'].append(qg_loss.item())
                    sp_loss.backward()
                    qg_loss.backward()

                ''' Supervised Training '''
                if True:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                        labeled_train_dataset, self.vocab['sp'], task='semantic_parsing',
                        data_index=labeled_train_index, index=j, batch_size=batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                    ############################ Forward Model ############################
                    # Teacher forcing: feed gold prefix, score next-token targets.
                    batch_scores = self.model.sp_model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                    batch_loss = self.loss_function['sp'](batch_scores, dec_outputs[:, 1:], out_lens - 1)
                    losses['sp'].append(batch_loss.item())
                    batch_loss.backward()

                    ###################### Obtain minibatch data ######################
                    inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                        labeled_train_dataset, self.vocab['qg'], task='question_generation',
                        data_index=labeled_train_index, index=j, batch_size=batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                    ############################ Forward Model ############################
                    batch_scores = self.model.qg_model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                    batch_loss = self.loss_function['qg'](batch_scores, dec_outputs[:, 1:], out_lens - 1)
                    losses['qg'].append(batch_loss.item())
                    batch_loss.backward()

                # Single optimizer step over all accumulated gradients of this batch.
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()
                gc.collect()
                torch.cuda.empty_cache()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            sp_loss, qg_loss = np.sum(losses['sp'], axis=0), np.sum(losses['qg'], axis=0)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\tLoss(sp loss : %.4f ; qg loss : %.4f)' \
                % (i, time.time() - start_time, sp_loss, qg_loss))

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_acc, dev_bleu = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu : %.4f)' \
                % (i, time.time() - start_time, dev_acc, dev_bleu))
            start_time = time.time()
            test_acc, test_bleu = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu : %.4f)' \
                % (i, time.time() - start_time, test_acc, test_bleu))

            ######################## Pick best result and save #####################
            # The two models are checkpointed independently on their own dev metrics.
            if dev_acc > self.best_result['dev_acc']:
                self.model.save_model(sp_save_dir=os.path.join(self.exp_path, 'sp_model.pkl'))
                self.best_result['iter_sp'] = i
                self.best_result['dev_acc'], self.best_result['test_acc'] = dev_acc, test_acc
                self.logger.info('NEW BEST Semantic Parsing:\tEpoch : %d\tBest Valid (acc : %.4f)\tBest Test (acc : %.4f)' \
                    % (i, dev_acc, test_acc))
            if dev_bleu >= self.best_result['dev_bleu']:
                self.model.save_model(qg_save_dir=os.path.join(self.exp_path, 'qg_model.pkl'))
                self.best_result['iter_qg'] = i
                self.best_result['dev_bleu'], self.best_result['test_bleu'] = dev_bleu, test_bleu
                self.logger.info('NEW BEST Question Generation:\tEpoch : %d\tBest Valid (bleu : %.4f)\tBest Test (bleu : %.4f)' \
                    % (i, dev_bleu, test_bleu))
            gc.collect()
            torch.cuda.empty_cache()

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST Semantic Parsing RESULT: \tEpoch : %d\tBest Valid (acc : %.4f)\tBest Test (acc : %.4f)'
            % (self.best_result['iter_sp'], self.best_result['dev_acc'], self.best_result['test_acc']))
        self.logger.info('FINAL BEST Question Generation RESULT: \tEpoch : %d\tBest Valid (bleu : %.4f)\tBest Test (bleu : %.4f)'
            % (self.best_result['iter_qg'], self.best_result['dev_bleu'], self.best_result['test_bleu']))
        self.model.load_model(os.path.join(self.exp_path, 'sp_model.pkl'), os.path.join(self.exp_path, 'qg_model.pkl'))
# coding=utf8
import os, sys, time, gc
import numpy as np
import torch
from utils.solver.solver_base import Solver
from utils.batch import get_minibatch

class LMSolver(Solver):
    '''
    For traditional RNN-based Language Model

    `side` picks which half of each example is modeled ('question' or the
    logical form side — exact values handled in utils.batch; confirm there).
    '''
    def __init__(self, *args, **kargs):
        self.side = kargs.pop('side', 'question')
        super(LMSolver, self).__init__(*args, **kargs)
        # Lower perplexity is better, hence inf initialization.
        self.best_result = {"losses": [], "iter": 0, "dev_ppl": float('inf'), "test_ppl": float('inf')}

    def decode(self, data_inputs, output_path, test_batchSize):
        """Compute corpus perplexity on data_inputs, writing per-sentence PPL to output_path."""
        data_index = np.arange(len(data_inputs))
        count, eval_loss, length_list = 0, [], []
        ########################### Evaluation Phase ############################
        self.model.eval()
        with open(output_path, 'w') as f:
            for j in range(0, len(data_index), test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, raw_inputs = get_minibatch(data_inputs, self.vocab, task='language_model',
                    data_index=data_index, index=j, batch_size=test_batchSize, device=self.device, side=self.side)
                # lens - 1: number of predicted tokens per sentence (first token is not predicted).
                length_list.extend((lens - 1).tolist())
                ########################## Calculate Sentence PPL #######################
                with torch.no_grad():
                    scores = self.model(inputs, lens) # bsize, seq_len, voc_size
                    # Next-token prediction: targets are the inputs shifted left by one.
                    batch_loss = self.loss_function(scores, inputs[:, 1:]).item()
                    eval_loss.append(batch_loss)
                    norm_log_prob = self.model.sent_logprobability(inputs, lens).cpu().tolist()

                ############################# Writing Result to File ###########################
                for idx in range(len(inputs)):
                    f.write('Utterance: ' + ' '.join(raw_inputs[idx]) + '\n')
                    f.write('NormLogProb: ' + str(norm_log_prob[idx]) + '\n')
                    current_ppl = np.exp(- norm_log_prob[idx])
                    f.write('PPL: ' + str(current_ppl) + '\n\n')

            ########################### Calculate Corpus PPL ###########################
            word_count = np.sum(length_list, axis=0)
            eval_loss = np.sum(eval_loss, axis=0)
            final_ppl = np.exp(eval_loss / word_count)
            f.write('Overall ppl: %.4f' % (final_ppl))
        return final_ppl

    def train_and_decode(self, train_inputs, dev_inputs, test_inputs, batchSize=16, test_batchSize=128, max_epoch=100):
        """Standard LM training loop; evaluates each epoch (after a 10-epoch warmup) and keeps the best-dev checkpoint."""
        train_data_index = np.arange(len(train_inputs))
        nsentences = len(train_data_index)
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(train_data_index)
            losses = []
            self.model.train()
            for j in range(0, nsentences, batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _ = get_minibatch(train_inputs, self.vocab, task='language_model',
                    data_index=train_data_index, index=j, batch_size=batchSize, device=self.device, side=self.side)
                ############################ Forward Model ############################
                self.optimizer.zero_grad()
                batch_scores = self.model(inputs, lens)
                ############################ Loss Calculation #########################
                batch_loss = self.loss_function(batch_scores, inputs[:, 1:], lens - 1)
                losses.append(batch_loss.item())
                ########################### Backward and Optimize ######################
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss of tgt : %.5f' \
                % (i, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # whether evaluate later after training for some epochs
            if i < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_ppl = self.decode(dev_inputs, os.path.join(self.exp_path, 'valid.iter' + str(i)), test_batchSize)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tppl : %.4f' % (i, time.time() - start_time, dev_ppl))
            start_time = time.time()
            test_ppl = self.decode(test_inputs, os.path.join(self.exp_path, 'test.iter' + str(i)), test_batchSize)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tppl : %.4f' % (i, time.time() - start_time, test_ppl))

            ######################## Pick best result and save #####################
            if dev_ppl < self.best_result['dev_ppl']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = i
                self.best_result['dev_ppl'], self.best_result['test_ppl'] = dev_ppl, test_ppl
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid ppl : %.4f;\tBest Test ppl : %.4f' % (i, dev_ppl, test_ppl))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (ppl : %.4f)\tBest Test (ppl : %.4f) '
            % (self.best_result['iter'], self.best_result['dev_ppl'], self.best_result['test_ppl']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
#coding=utf8
import time, os, gc
from utils.solver.solver_base import Solver
from utils.example import Example
from utils.batch import get_minibatch
from utils.bleu import get_bleu_score
import numpy as np
import torch

class QGSolver(Solver):
    """Solver for question generation (logical form -> natural language question)."""

    def __init__(self, *args, **kargs):
        super(QGSolver, self).__init__(*args, **kargs)
        self.best_result = { "losses": [], "iter": 0, "dev_bleu": 0., "test_bleu": 0. }

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Generate questions for data_inputs, write them to output_path, return corpus BLEU."""
        data_index= np.arange(len(data_inputs))
        nsentences, candidate_list, references_list = len(data_index), [], []
        domain = Example.domain
        self.model.eval()
        with open(output_path, 'w') as of:
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, dec_inputs, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab, task='question_generation', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device, copy=self.model.copy)
                ############################ Forward Model ############################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab.word2id, copy_tokens, beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Only the top hypothesis of each example is scored.
                predictions = [each[0] for each in predictions]
                predictions = domain.reverse(predictions, self.vocab.id2word, oov_list=oov_list)
                bleu_scores = domain.compare_question(predictions, raw_outputs)
                candidate_list.extend(predictions)
                references_list.extend([[ref] for ref in raw_outputs])
                ############################ Write result to file ############################
                for idx in range(len(raw_inputs)):
                    of.write("LogicalForm: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    of.write("Pred0: " + ' '.join(predictions[idx]) + '\n')
                    of.write("Bleu: " + str(bleu_scores[idx]) + '\n\n')
            avg_bleu = get_bleu_score(candidate_list, references_list)
            of.write('Overall bleu is %.4f' % (avg_bleu))
        return avg_bleu

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam=5, n_best=1):
        """Supervised training loop; evaluates each epoch (after a 10-epoch warmup) and keeps the best-dev-BLEU checkpoint."""
        train_data_index = np.arange(len(train_dataset))
        nsentences = len(train_data_index)
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(train_data_index)
            losses = []
            self.model.train()
            for j in range(0, nsentences, batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                    train_dataset, self.vocab, task='question_generation', data_index=train_data_index,
                    index=j, batch_size=batchSize, device=self.device, copy=self.model.copy)
                ############################ Forward Model ############################
                self.model.zero_grad()
                # Teacher forcing: gold prefix in, next-token targets out.
                batch_scores = self.model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                ############################ Loss Calculation #########################
                batch_loss = self.loss_function(batch_scores, dec_outputs[:, 1:], out_lens - 1)
                losses.append(batch_loss.item())
                ########################### Backward and Optimize ######################
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss: %.5f' \
                % (i, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # Skip (expensive) decoding during the first epochs.
            if i < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_bleu = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Dev Evaluation:\tEpoch : %d\tTime : %.4fs\tBleu : %.4f' \
                % (i, time.time() - start_time, dev_bleu))
            start_time = time.time()
            test_bleu = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Test Evaluation:\tEpoch : %d\tTime : %.4fs\tBleu : %.4f' \
                % (i, time.time() - start_time, test_bleu))

            ######################## Pick best result on dev and save #####################
            if dev_bleu >= self.best_result['dev_bleu']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = i
                self.best_result['dev_bleu'], self.best_result['test_bleu'] = dev_bleu, test_bleu
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid Bleu : %.4f;\tBest Test Bleu : %.4f' % (i, dev_bleu, test_bleu))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (Bleu : %.4f)\tBest Test (Bleu : %.4f)'
            % (self.best_result['iter'], self.best_result['dev_bleu'], self.best_result['test_bleu']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
class SPSolver(Solver):
    """Solver for semantic parsing: trains an utterance-to-logical-form model
    and evaluates it with logical-form accuracy."""

    def __init__(self, *args, **kargs):
        super(SPSolver, self).__init__(*args, **kargs)
        # Track per-epoch training losses and the best dev/test accuracy seen so far
        self.best_result = {"losses": [], "iter": 0, "dev_acc": 0., "test_acc": 0.}

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Beam-search decode ``data_inputs``, write per-example results to
        ``output_path`` and return the overall accuracy."""
        order = np.arange(len(data_inputs))
        total_count = len(order)
        correct_flags = []
        domain = Example.domain
        self.model.eval()
        with open(output_path, 'w') as fout:
            for start in range(0, total_count, test_batchSize):
                # Build one evaluation minibatch
                inputs, lens, dec_inputs, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab, task='semantic_parsing', data_index=order,
                    index=start, batch_size=test_batchSize, device=self.device, copy=self.model.copy)
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab.lf2id, copy_tokens, beam_size=beam, n_best=n_best)
                    # Flatten the n_best hypotheses of every example into one list
                    predictions = [pred for each in results["predictions"] for pred in each]
                    predictions = domain.reverse(predictions, self.vocab.id2lf, oov_list=oov_list)
                accuracy = domain.compare_logical_form(predictions, raw_outputs, pick=True)
                correct_flags.extend(accuracy)
                # Log each example: utterance, gold logical form, n_best predictions, verdict
                for k in range(len(raw_inputs)):
                    fout.write("Utterance: " + ' '.join(raw_inputs[k]) + '\n')
                    fout.write("Target: " + ' '.join(raw_outputs[k]) + '\n')
                    # Each example owns a contiguous slice of n_best predictions
                    for rank, pred in enumerate(predictions[n_best * k: n_best * (k + 1)]):
                        fout.write("Pred" + str(rank) + ": " + ' '.join(pred) + '\n')
                    fout.write("Correct: " + ("True" if accuracy[k] == 1 else "False") + '\n\n')
            acc = sum(correct_flags) / float(len(correct_flags))
            fout.write('Overall accuracy is %.4f' % (acc))
        return acc

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam=5, n_best=1):
        """Train for ``max_epoch`` epochs; from epoch 10 onward, evaluate on
        dev/test each epoch, checkpoint on the best dev accuracy, and finally
        reload the best checkpoint."""
        order = np.arange(len(train_dataset))
        total_count = len(order)
        for epoch in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(order)
            epoch_losses = []
            self.model.train()
            for start in range(0, total_count, batchSize):
                inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                    train_dataset, self.vocab, task='semantic_parsing', data_index=order,
                    index=start, batch_size=batchSize, device=self.device, copy=self.model.copy)
                self.model.zero_grad()
                # Teacher forcing: feed the gold prefix, score the shifted targets
                batch_scores = self.model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                batch_loss = self.loss_function(batch_scores, dec_outputs[:, 1:], out_lens - 1)
                epoch_losses.append(batch_loss.item())
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (epoch, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(epoch_losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss: %.5f' \
                % (epoch, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # Skip the (slow) evaluation during the first ten warm-up epochs
            if epoch < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_acc = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(epoch)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Dev Evaluation:\tEpoch : %d\tTime : %.4fs\tAcc : %.4f' \
                % (epoch, time.time() - start_time, dev_acc))
            start_time = time.time()
            test_acc = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(epoch)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Test Evaluation:\tEpoch : %d\tTime : %.4fs\tAcc : %.4f' \
                % (epoch, time.time() - start_time, test_acc))

            ######################## Pick best result on dev and save #####################
            if dev_acc >= self.best_result['dev_acc']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = epoch
                self.best_result['dev_acc'], self.best_result['test_acc'] = dev_acc, test_acc
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid Acc : %.4f;\tBest Test Acc : %.4f' % (epoch, dev_acc, test_acc))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (Acc : %.4f)\tBest Test (Acc : %.4f)'
            % (self.best_result['iter'], self.best_result['dev_acc'], self.best_result['test_acc']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
4 | """ 5 | import os, sys, argparse 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | from utils.lexicon import Lexicon 8 | from utils.constants import BOS, EOS, PAD, UNK 9 | import operator 10 | 11 | def read_data(path): 12 | ex_list = [] 13 | with open(path, 'r') as infile: 14 | for line in infile: 15 | line = line.strip() 16 | if line == '': 17 | continue 18 | q, lf = line.split('\t') 19 | q = [each.strip() for each in q.strip().split(' ') if each.strip() != ''] 20 | lf = [each.strip() for each in lf.strip().split(' ') if each.strip() != ''] 21 | ex_list.append((q, lf)) 22 | return ex_list 23 | 24 | def save_vocab(idx2word, vocab_path): 25 | with open(vocab_path, 'w') as f: 26 | for idx in range(len(idx2word)): 27 | f.write(idx2word[idx] + '\n') 28 | 29 | def construct_vocab(input_seqs, mwf=1): 30 | ''' 31 | Construct vocabulary given input_seqs 32 | @params: 33 | 1. input_seqs: a list of seqs, e.g. 34 | [ ['what', 'flight'] , ['which', 'flight'] ] 35 | 2. mwf: minimum word frequency 36 | @return: 37 | 1. word2idx(dict) 38 | 2. 
idx2word(dict) 39 | ''' 40 | vocab, word2idx, idx2word = {}, {}, [] 41 | for seq in input_seqs: 42 | if type(seq) in [tuple, list]: 43 | for word in seq: 44 | if word not in vocab: 45 | vocab[word] = 1 46 | else: 47 | vocab[word] += 1 48 | else: 49 | if seq not in vocab: 50 | vocab[seq] = 1 51 | else: 52 | vocab[seq] += 1 53 | 54 | # Discard those special tokens if already exist 55 | if PAD in vocab: del vocab[PAD] 56 | if UNK in vocab: del vocab[UNK] 57 | if BOS in vocab: del vocab[BOS] 58 | if EOS in vocab: del vocab[EOS] 59 | 60 | sorted_words = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True) 61 | sorted_words = [x[0] for x in sorted_words if x[1] >= mwf] 62 | for word in sorted_words: 63 | idx = len(word2idx) 64 | word2idx[word] = idx 65 | idx2word.append(word) 66 | return word2idx, idx2word 67 | 68 | def main(args=sys.argv[1:]): 69 | """ 70 | Construct vocabulary for each dataset 71 | """ 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--dataset', default='all', help='dataset name') 74 | parser.add_argument('--mwf', type=int, default=1, help='minimum word frequency, if less than this int, not included') 75 | opt = parser.parse_args(args) 76 | all_dataset = [opt.dataset] if opt.dataset != 'all' else ['atis', 'geo', 'basketball', 'blocks', 'calendar', 'housing', 'publications', 'recipes', 'restaurants', 'socialnetwork'] 77 | 78 | for dataset in all_dataset: 79 | dirname = os.path.join('data', 'overnight') if dataset != 'atis' and dataset != 'geo' else os.path.join('data', dataset) 80 | file_path = os.path.join(dirname, dataset + '_train.tsv') 81 | word_vocab_path, lf_vocab_path, copy_vocab_path = os.path.join(dirname, dataset + '_vocab.word'), \ 82 | os.path.join(dirname, dataset + '_vocab.lf'), os.path.join(dirname, dataset + '_vocab.copy') 83 | lexicon_words = sorted(list(Lexicon(dataset).seen_words)) 84 | 85 | ex_list = read_data(file_path) 86 | questions, logical_forms = list(zip(*ex_list)) 87 | _, id2word = 
class Vocab():
    """Task-specific vocabulary holder.

    Loads the word-side and logical-form-side vocabularies of a dataset; which
    vocab files are read and whether BOS/EOS are inserted depends on the task.
    """

    def __init__(self, dataset, task='semantic_parsing', copy=False):
        super(Vocab, self).__init__()
        self.dataset = dataset
        # atis/geo live in their own folders; every other domain is under overnight
        dirname = os.path.join('data', 'overnight') if dataset != 'atis' and dataset != 'geo' else os.path.join('data', dataset)
        word_path = os.path.join(dirname, dataset + '_vocab.word')
        lf_path = os.path.join(dirname, dataset + '_vocab.lf')
        copy_path = os.path.join(dirname, dataset + '_vocab.copy')
        if task == 'semantic_parsing':
            # input side (words): no BOS/EOS; output side (lf): decoder needs BOS/EOS
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=False)
            self.lf2id, self.id2lf = self.read_vocab(lf_path, bos_eos=True)
        elif task == 'question_generation':
            # roles are reversed: words are decoded, logical forms are encoded
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=True)
            # with a copy mechanism the lf vocab also includes copyable lexicon entries
            lf_sources = (lf_path, copy_path) if copy else (lf_path,)
            self.lf2id, self.id2lf = self.read_vocab(*lf_sources, bos_eos=False)
        elif task == 'language_model':
            # both sides are generated, so both need BOS/EOS
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=True)
            self.lf2id, self.id2lf = self.read_vocab(lf_path, bos_eos=True)
        else:
            raise ValueError('[Error]: unknown task !')

    def read_vocab(self, *args, bos_eos=True, pad=True, unk=True, separator=' : '):
        """Load one or more vocab files into a (word2idx, idx2word) pair.

        Special tokens are inserted first (in PAD, UNK, BOS, EOS order, as
        enabled); lines containing ``separator`` keep only the left-hand side;
        duplicates are skipped.
        """
        word2idx, idx2word = {}, []
        specials = []
        if pad:
            specials.append(PAD)
        if unk:
            specials.append(UNK)
        if bos_eos:
            specials.extend([BOS, EOS])
        for token in specials:
            word2idx[token] = len(word2idx)
            idx2word.append(token)
        for vocab_path in args:
            with open(vocab_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    if separator in line:
                        word, _ = line.split(separator)
                    else:
                        word = line
                    if word not in word2idx:
                        word2idx[word] = len(word2idx)
                        idx2word.append(word)
        return word2idx, idx2word
read_pretrained_vectors(word2vec_file, word2id, device) 32 | for word in pretrained_vectors: 33 | module.weight.data[word2id[word]] = pretrained_vectors[word] 34 | return len(pretrained_vectors)/float(len(word2id)) 35 | --------------------------------------------------------------------------------