├── .gitignore ├── README.md ├── data ├── atis │ ├── atis_dev.tsv │ ├── atis_extra.tsv │ ├── atis_lexicon.txt │ ├── atis_ontology.txt │ ├── atis_test.tsv │ └── atis_train.tsv ├── geo │ ├── geo_dev.tsv │ ├── geo_extra.tsv │ ├── geo_lexicon.txt │ ├── geo_test.tsv │ ├── geo_train.tsv │ └── train600.tsv └── overnight │ ├── basketball_extra.tsv │ ├── basketball_lexicon.txt │ ├── basketball_test.tsv │ ├── basketball_train.tsv │ ├── blocks_extra.tsv │ ├── blocks_lexicon.txt │ ├── blocks_test.tsv │ ├── blocks_train.tsv │ ├── calendar_extra.tsv │ ├── calendar_lexicon.txt │ ├── calendar_test.tsv │ ├── calendar_train.tsv │ ├── calendarplus_test.tsv │ ├── calendarplus_train.tsv │ ├── housing_extra.tsv │ ├── housing_lexicon.txt │ ├── housing_test.tsv │ ├── housing_train.tsv │ ├── publications_extra.tsv │ ├── publications_lexicon.txt │ ├── publications_test.tsv │ ├── publications_train.tsv │ ├── recipes_extra.tsv │ ├── recipes_lexicon.txt │ ├── recipes_test.tsv │ ├── recipes_train.tsv │ ├── restaurants_extra.tsv │ ├── restaurants_lexicon.txt │ ├── restaurants_test.tsv │ ├── restaurants_train.tsv │ ├── socialnetwork_extra.tsv │ ├── socialnetwork_lexicon.txt │ ├── socialnetwork_test.tsv │ └── socialnetwork_train.tsv ├── models ├── Beam.py ├── attention │ └── attention_rnn.py ├── construct_models.py ├── decoder │ ├── decoder_rnn.py │ └── decoder_rnn_pointer.py ├── dual_learning.py ├── embedding │ └── embedding_rnn.py ├── enc2dec │ └── state_transition.py ├── encoder │ └── encoder_rnn.py ├── encoder_decoder.py ├── generator │ ├── generator_naive.py │ └── generator_pointer.py ├── language_model.py ├── model_attn.py ├── model_attnptr.py ├── model_utils.py ├── penalties.py └── reward.py ├── pull_dependency.sh ├── requirements.txt ├── run ├── run_dual_learning.sh ├── run_language_model.sh ├── run_pseudo_method.sh ├── run_question_generation.sh └── run_semantic_parsing.sh ├── scripts ├── dual_learning.py ├── language_model.py ├── pseudo_method.py ├── question_generation.py └── 
semantic_parsing.py └── utils ├── batch.py ├── bleu.py ├── constants.py ├── domain ├── atis_evaluator.py ├── domain_atis.py ├── domain_base.py ├── domain_geo.py └── domain_overnight.py ├── example.py ├── gpu.py ├── hyperparam.py ├── lexicon.py ├── logger.py ├── loss.py ├── optimizer.py ├── seed.py ├── solver ├── solver_base.py ├── solver_dual_learning.py ├── solver_language_model.py ├── solver_pseduo_method.py ├── solver_question_generation.py └── solver_semantic_parsing.py ├── statistics.py ├── vocab.py └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | exp 2 | tmp 3 | log 4 | lib 5 | state 6 | evaluator 7 | data/.cache 8 | data/*/*_vocab.* 9 | */*/*/__pycache__ 10 | */*/__pycache__ 11 | */__pycache__ 12 | */*/*/*.pyc 13 | */*/*.pyc 14 | */*.pyc 15 | module-classes.txt 16 | .ipynb_checkpoints 17 | *.ipynb 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semantic Parsing with Dual Learning 2 | 3 | This repository contains source code and data for the ACL 2019 Long Paper ["Semantic Parsing with Dual Learning"](https://www.aclweb.org/anthology/P19-1007.pdf). 
4 | 5 | If you use our framework in your work, please cite it as follows: 6 | 7 | @inproceedings{cao-etal-2019-semantic, 8 | title = "Semantic Parsing with Dual Learning", 9 | author = "Cao, Ruisheng and 10 | Zhu, Su and 11 | Liu, Chen and 12 | Li, Jieyu and 13 | Yu, Kai", 14 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", 15 | month = jul, 16 | year = "2019", 17 | address = "Florence, Italy", 18 | publisher = "Association for Computational Linguistics", 19 | url = "https://www.aclweb.org/anthology/P19-1007", 20 | doi = "10.18653/v1/P19-1007", 21 | pages = "51--64" 22 | } 23 | 24 | ---- 25 | 26 | ## Setup 27 | 28 | * First, create the environment 29 | 30 | conda create -n sp python=3.6 31 | source activate sp 32 | pip3 install -r requirements.txt 33 | 34 | * Second, pull all the dependencies from remote repository, including `evaluator`, `lib` and `glove6B word embeddings`. 35 | 36 | ./pull_dependency.sh 37 | 38 | * Construct vocabulary for all datasets in advance under corresponding directory `data`, run 39 | 40 | python3 utils/statistics.py 41 | 42 | ---- 43 | 44 | ## **Dataset** 45 | 46 | ---- 47 | 48 | Experiments are conducted on two semantic parsing datasets **ATIS** and **OVERNIGHT**, including traditional __train__, __dev__ and __test__ files, plus elaborated __lexicon__ files for *entity mapping* and *reverse entity mapping* techniques, and __extra__ files for synthesized unlabeled logical forms. An additional ontology file is created for dataset **ATIS** since there is no evaluator available. 49 | 50 | ---- 51 | 52 | ### **ATIS** 53 | 54 | Files: 55 | 56 | - *atis_train.tsv*: training dataset, 4433 samples. 57 | - *atis_dev.tsv*: validation dataset, 491 samples. 58 | - *atis_test.tsv*: test dataset, 448 samples. 59 | - *atis_extra.tsv*: synthesized logical forms (Lambda Calculus), 3797 samples. 
60 | - *atis_lexicon.txt*: each line specifies a one-to-one mapping between a natural language noun phrase and its corresponding entity representation in knowledge base, such as pair `(first class, first:cl)`. 61 | - *atis_ontology.txt*: specify all the entity types, unary, and binary predicates used in the logical form. 62 | 63 | **Attention**: Since there is no evaluator for this domain, we provide a simple type consistency checker for the target logical form (`utils/domain/atis_evaluator.py`). *atis_train.tsv*, *atis_dev.tsv* and *atis_test.tsv* are the preprocessed versions provided by [Dong and Lapata (2016)](https://www.aclweb.org/anthology/P16-1004.pdf), where natural language queries are lowercased and stemmed with NLTK, and entity mentions are replaced by numbered markers. For example: 64 | 65 | flight from ci0 to ci1 ( lambda $0 e ( and ( flight $0 ) ( from $0 ci0 ) ( to $0 ci1 ) ) ) 66 | 67 | ---- 68 | 69 | ### **OVERNIGHT** 70 | 71 | It contains eight sub-domains in total, namely *basketball*, *blocks*, *calendar*, *housing*, *publications*, *recipes*, *restaurants* and *socialnetwork*. 72 | 73 | - *[domain]_train.tsv*: training and dev dataset. There is no isolated validation dataset in **OVERNIGHT**. We follow the traditional 80%/20% (train/dev) split in experiments. 74 | - *[domain]_test.tsv*: test dataset. 75 | - *[domain]_extra.tsv*: synthesized logical forms (Lambda DCS). We revise the template rules in [SEMPRE](https://github.com/percyliang/sempre) to generate new instances. 76 | - *[domain]_lexicon.txt*: each line specifies a one-to-one mapping between a natural language noun phrase and its corresponding entity representation in knowledge base, such as pair `(kobe bryant, en.player.kobe_bryant 77 | )`. 78 | 79 | **Attention**: There is also an evaluator program provided by [Jia and Liang (2016)](https://www.aclweb.org/anthology/P16-1002.pdf) in each domain to obtain denotations (`utils/domain/domain_overnight.py`). 
Each sample in *[domain]_train.tsv* and *[domain]_test.tsv* is of the form: 80 | 81 | what player did not play point guard ( call SW.listValue ( call SW.getProperty ( ( lambda s ( call SW.filter ( var s ) ( string position ) ( string ! = ) en.position.point_guard ) ) ( call SW.domain ( string player ) ) ) ( string player ) ) ) 82 | 83 | ---- 84 | 85 | ## Experiments 86 | 87 | ---- 88 | 89 | ### Semantic Parsing (Supervised|Pretrain) 90 | 91 | Refer to script in `run/run_semantic_parsing.sh`, for example 92 | 93 | ./run/run_semantic_parsing.sh dataset_name [attn|attnptr] labeled 94 | 95 | `dataset_name` must be in choices `[atis, basketball, blocks, calendar, housing, publications, recipes, restaurants, socialnetwork]` and `labeled` denotes the ratio of labeled examples in training set we are going to use. 96 | 97 | ---- 98 | 99 | ### Question Generation (Supervised|Pretrain) 100 | 101 | The procedure is similar to that of Semantic Parsing since we use similar model architecture. 102 | 103 | ./run/run_question_generation.sh dataset_name [attn|attnptr] labeled 104 | 105 | ---- 106 | 107 | ### Language Model (Unsupervised|Pretrain) 108 | 109 | Language model is used to calculate the validity reward during the closed cycle. 110 | 111 | ./run/run_language_model.sh dataset_name [question|logical_form] 112 | 113 | ---- 114 | 115 | ### Pseudo Method (Semi-supervised) 116 | 117 | Use pretrained models of Semantic Parsing and Question Generation to generate pseudo samples. Then shuffle these pseudo samples with labeled samples together to train an improved Semantic Parsing and Question Generation Model. 118 | 119 | ./run/run_pseudo_method.sh dataset_name [attn|attnptr] labeled 120 | 121 | **Attention:** in the script `run/run_pseudo_method.sh`, `read_sp_model_path` and `read_qg_model_path` are paths to the pretrained models(semantic parsing and question generation). `labeled` and `seed` should be kept the same for both the pretraining phases and pseudo method. 
By default, model type (attn/attnptr) is the same for both semantic parsing and question generation models. 122 | 123 | ---- 124 | 125 | ### Dual Learning (Semi-supervised) 126 | 127 | Use pretrained models of semantic parsing, question generation and language models to form two closed cycles with different starting points. Combine dual reinforcement learning algorithm and supervised training together. Running script: 128 | 129 | ./run/run_dual_learning.sh dataset_name [attn|attnptr] labeled 130 | 131 | **Attention:** in the script `run/run_dual_learning.sh`, `read_sp_model_path`, `read_qg_model_path`, `read_qlm_path` and `read_lflm_path` are paths to the pretrained models(semantic parsing, question generation, question language model and logical form language model). `labeled` and `seed` should be kept the same for both the pretraining phases and dual learning framework. By default, model type (attn/attnptr) is the same for both semantic parsing and question generation models. 132 | -------------------------------------------------------------------------------- /data/atis/atis_lexicon.txt: -------------------------------------------------------------------------------- 1 | ci0 :- NP : ci0 2 | ci1 :- NP : ci1 3 | ci2 :- NP : ci2 4 | ap0 :- NP : ap0 5 | ap1 :- NP : ap1 6 | da0 :- NP : da0 7 | da1 :- NP : da1 8 | da2 :- NP : da2 9 | da3 :- NP : da3 10 | da4 :- NP : da4 11 | al0 :- NP : al0 12 | al1 :- NP : al1 13 | al2 :- NP : al2 14 | ti0 :- NP : ti0 15 | ti1 :- NP : ti1 16 | dn0 :- NP : dn0 17 | dn1 :- NP : dn1 18 | mn0 :- NP : mn0 19 | mn1 :- NP : mn1 20 | fn0 :- NP : fn0 21 | fn1 :- NP : fn1 22 | ac0 :- NP : ac0 23 | fb0 :- NP : fb0 24 | fb1 :- NP : fb1 25 | yr0 :- NP : yr0 26 | st0 :- NP : st0 27 | morn :- NP : morning:pd 28 | am :- NP : morning:pd 29 | afternoon :- NP : afternoon:pd 30 | even :- NP : evening:pd 31 | earli :- NP : early:pd 32 | late :- NP : late:pd 33 | night :- NP : late:pd 34 | late night :- NP : late_night:pd 35 | late even :- NP : 
late_evening:pd 36 | late in the even :- NP : late_evening:pd 37 | pm :- NP : pm:pd 38 | day time :- NP : daytime:pd 39 | mealtim :- NP : mealtime:pd 40 | breakfast :- NP : breakfast:me 41 | dinner :- NP : dinner:me 42 | lunch :- NP : lunch:me 43 | snack :- NP : snack:me 44 | dure breakfast :- NP : breakfast:pd 45 | ground transport :- NP : ground_transport 46 | rental car :- NP : rental_car 47 | rental a car :- NP : rental_car 48 | car rental :- NP : rental_car 49 | car :- NP : rental_car 50 | limousin :- NP : limousine 51 | limo :- NP : limousine 52 | air taxi oper :- NP : air_taxi_operation 53 | first class :- NP : first:cl 54 | coach :- NP : coach:cl 55 | busi :- NP : business:cl 56 | busi class :- NP : business:cl 57 | thrift :- NP : thrift:cl 58 | thrift economi :- NP : thrift:cl 59 | economi thrift :- NP : thrift:cl 60 | daili :- NP : daily 61 | economi :- NP : economy 62 | cheap :- NP : economy 63 | one way :- NP : oneway 64 | oneway :- NP : oneway 65 | round trip :- NP : round_trip 66 | no stopov :- NP : nonstop 67 | direct :- NP : nonstop 68 | connect :- NP : connecting 69 | boe :- NP : boeing:mf 70 | one :- NP : 1:i 71 | two :- NP : 2:i 72 | 3 :- NP : 3:i 73 | 9 hour :- NP : 9:hr 74 | minimum connect time :- NP : minimum_connection_time 75 | day after tomorrow :- NP : day_after_tomorrow 76 | distanc :- NP : miles_distant 77 | discount :- NP : discounted 78 | time zone :- NP : time_zone_code -------------------------------------------------------------------------------- /data/atis/atis_ontology.txt: -------------------------------------------------------------------------------- 1 | entity: dn no 2 | entity: fb yes 3 | entity: me no 4 | entity: pd no 5 | entity: mf no 6 | entity: ti yes 7 | entity: st no 8 | entity: rc yes 9 | entity: al no 10 | entity: ci no 11 | entity: do yes 12 | entity: dc no 13 | entity: fn no 14 | entity: cl no 15 | entity: ac no 16 | entity: yr no 17 | entity: i yes 18 | entity: hr yes 19 | entity: mn no 20 | entity: ap no 21 | 
entity: da no 22 | unary: has_stops 23 | unary: air_taxi_operation 24 | cat: aircraft_code:t ac 25 | unary: connecting 26 | unary: jet 27 | unary: nonstop 28 | unary: day_after_tomorrow 29 | unary: rapid_transit 30 | unary: tomorrow 31 | unary: class_of_service 32 | unary: rental_car 33 | unary: overnight 34 | unary: airport 35 | cat: airport ap 36 | unary: fare_basis_code 37 | unary: fare 38 | unary: tomorrow_arrival 39 | unary: airline 40 | cat: airline al 41 | unary: time_zone_code 42 | cat: time_zone_code fb 43 | cat: time_zone_code rc 44 | unary: miles_distant 45 | unary: has_meal 46 | unary: economy 47 | unary: taxi 48 | unary: city 49 | cat: city ci 50 | unary: discounted 51 | unary: airline_name 52 | unary: meal:t 53 | unary: today 54 | unary: limousine 55 | unary: restriction_code 56 | cat: restriction_code fb 57 | cat: restriction_code rc 58 | unary: meal_code 59 | unary: ground_transport 60 | unary: aircraft 61 | unary: turboprop 62 | unary: tonight 63 | unary: daily 64 | unary: round_trip 65 | unary: weekday 66 | unary: flight 67 | unary: booking_class:t 68 | unary: oneway 69 | binary: type:al services type:ci 70 | binary: type:al services type:ap 71 | binary: type:flight fare type:do 72 | binary: type:flight fare type:fb 73 | binary: type:fb fare type:do 74 | binary: type:flight cost type:do 75 | binary: type:flight day_return type:da 76 | binary: type:flight approx_return_time type:ti 77 | binary: type:flight day_number_return type:dn 78 | binary: type:flight class_type type:cl 79 | binary: type:flight month_arrival type:mn 80 | binary: type:flight stop type:ci 81 | binary: type:flight stop type:ap 82 | binary: type:flight flight_number type:fn 83 | binary: type:flight month_return type:mn 84 | binary: type:flight approx_arrival_time type:ti 85 | binary: type:flight stop_arrival_time type:ti 86 | binary: type:flight day_arrival type:da 87 | binary: type:flight aircraft_code type:ac 88 | binary: type:flight after_day type:da 89 | binary: type:flight 
meal type:me 90 | binary: type:flight arrival_month type:mn 91 | binary: type:flight day_number_arrival type:dn 92 | binary: type:flight arrival_time type:ti 93 | binary: type:flight next_days type:i 94 | binary: type:flight manufacturer type:mf 95 | binary: type:flight before_day type:da 96 | binary: type:flight minutes_distant type:i 97 | binary: type:flight capacity type:i 98 | binary: type:ac capacity type:i 99 | binary: type:mf capacity type:i 100 | binary: type:flight stops type:i 101 | binary: type:flight to type:ci 102 | binary: type:flight to type:ap 103 | binary: type:flight to type:st 104 | binary: type:flight time_elapsed type:hr 105 | binary: type:flight year type:yr 106 | binary: type:flight booking_class type:fb 107 | binary: type:flight booking_class type:cl 108 | binary: type:flight from type:ci 109 | binary: type:flight from type:ap 110 | binary: type:flight airport type:ap 111 | binary: type:flight month type:mn 112 | binary: type:flight day_number type:dn 113 | binary: type:flight fare_basis_code type:fb 114 | binary: type:flight ground_fare type:do 115 | binary: type:ap loc:t type:ci 116 | binary: type:ci loc:t type:fb 117 | binary: type:ap loc:t type:st 118 | binary: type:flight approx_departure_time type:ti 119 | binary: type:al named type:al 120 | binary: type:flight to_city type:ci 121 | binary: type:flight minimum_connection_time type:ti 122 | binary: type:ap minimum_connection_time type:ti 123 | binary: type:flight departure_time type:ti 124 | binary: type:flight airline type:al 125 | binary: type:flight airline:e type:al 126 | binary: type:flight from_airport type:ap 127 | binary: type:flight day type:da 128 | binary: type:flight during_day_arrival type:pd 129 | binary: type:flight during_day type:pd 130 | binary: type:flight days_from_today type:i 131 | binary: type:flight aircraft type:ac 132 | binary: type:al abbrev type:al 133 | -------------------------------------------------------------------------------- /data/geo/geo_extra.tsv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhythmcao/semantic-parsing-dual/bd419acf391a9f5fe9eeedbcf7799b350138a6da/data/geo/geo_extra.tsv -------------------------------------------------------------------------------- /data/geo/geo_lexicon.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhythmcao/semantic-parsing-dual/bd419acf391a9f5fe9eeedbcf7799b350138a6da/data/geo/geo_lexicon.txt -------------------------------------------------------------------------------- /data/overnight/basketball_lexicon.txt: -------------------------------------------------------------------------------- 1 | kobe bryant :- NP : en.player.kobe_bryant 2 | kobe :- NP : en.player.kobe_bryant 3 | kobe bryants :- NP : en.player.kobe_bryant 4 | kob bryant :- NP : en.player.kobe_bryant 5 | kobe bryan :- NP : en.player.kobe_bryant 6 | lebron james :- NP : en.player.lebron_james 7 | la laker :- NP : en.team.lakers 8 | la lakers :- NP : en.team.lakers 9 | los angeles lakers :- NP : en.team.lakers 10 | lakers :- NP : en.team.lakers 11 | laker :- NP : en.team.lakers 12 | cavaliers :- NP : en.team.cavaliers 13 | cleveland cavaliers :- NP : en.team.cavaliers 14 | point guards :- NP : en.position.point_guard 15 | point guard :- NP : en.position.point_guard 16 | forward :- NP : en.position.forward 17 | forwards :- NP : en.position.forward 18 | -------------------------------------------------------------------------------- /data/overnight/blocks_lexicon.txt: -------------------------------------------------------------------------------- 1 | block :- NP : en.block 2 | block 1 :- NP : en.block.block1 3 | brick 1 :- NP : en.block.block1 4 | block 1s :- NP : en.block.block1 5 | block one :- NP : en.block.block1 6 | block 2 :- NP : en.block.block2 7 | brick 2 :- NP : en.block.block2 8 | block two :- NP : en.block.block2 9 | pyramid :- NP : en.shape.pyramid 10 | 
pyramidshaped :- NP : en.shape.pyramid 11 | pyramidshape :- NP : en.shape.pyramid 12 | cube :- NP : en.shape.cube 13 | cube shaped :- NP : en.shape.cube 14 | inch :- NP : en.inch 15 | inches :- NP : en.inch 16 | 17 | -------------------------------------------------------------------------------- /data/overnight/calendar_lexicon.txt: -------------------------------------------------------------------------------- 1 | meeting :- NP : en.meeting 2 | meetings :- NP : en.meeting 3 | weekly standup :- NP : en.meeting.weekly_standup 4 | weekly stand up :- NP : en.meeting.weekly_standup 5 | weekly roundup :- NP : en.meeting.weekly_standup 6 | weekly startup :- NP : en.meeting.weekly_standup 7 | week startup :- NP : en.meeting.weekly_standup 8 | annual review :- NP : en.meeting.annual_review 9 | person :- NP : en.person 10 | alice :- NP : en.person.alice 11 | bob :- NP : en.person.bob 12 | greenberg cafe :- NP : en.location.greenberg_cafe 13 | central office :- NP : en.location.central_office 14 | -------------------------------------------------------------------------------- /data/overnight/housing_lexicon.txt: -------------------------------------------------------------------------------- 1 | 123 sesame street :- NP : en.housing_unit.123_sesame_street 2 | 123 sesame st :- NP : en.housing_unit.123_sesame_street 3 | 900 mission avenue :- NP : en.housing_unit.900_mission_ave 4 | 900 mission ave :- NP : en.housing_unit.900_mission_ave 5 | dollar :- NP : en.dollar 6 | dollars :- NP : en.dollar 7 | square :- NP : en.square_feet 8 | square feet :- NP : en.square_feet 9 | square foot :- NP : en.square_feet 10 | midtown west :- NP : en.neighborhood.midtown_west 11 | midtown east :- NP : en.neighborhood.midtown_west 12 | west midtown :- NP : en.neighborhood.midtown_west 13 | chelsea :- NP : en.neighborhood.chelsea 14 | apartment :- NP : en.housing.apartment 15 | aprtment :- NP : en.housing.apartment 16 | apartments :- NP : en.housing.apartment 17 | condo :- NP : en.housing.condo 
18 | condos :- NP : en.housing.condo 19 | -------------------------------------------------------------------------------- /data/overnight/publications_lexicon.txt: -------------------------------------------------------------------------------- 1 | article :- NP : en.article 2 | articles :- NP : en.article 3 | multivariate data analysis :- NP : en.article.multivariate_data_analysis 4 | person :- NP : en.person 5 | persons :- NP : en.person 6 | efron :- NP : en.person.efron 7 | efrons :- NP : en.person.efron 8 | lakoff :- NP : en.person.lakoff 9 | annals of statistics :- NP : en.venue.annals_of_statistics 10 | computational linguistics :- NP : en.venue.computational_linguistics 11 | -------------------------------------------------------------------------------- /data/overnight/recipes_lexicon.txt: -------------------------------------------------------------------------------- 1 | recipe :- NP : en.recipe 2 | recipes :- NP : en.recipe 3 | rice pudding :- NP : en.recipe.rice_pudding 4 | rice puddings :- NP : en.recipe.rice_pudding 5 | quiche :- NP : en.recipe.quiche 6 | quice :- NP : en.recipe.quiche 7 | ingredient :- NP : en.ingredient 8 | ingredients :- NP : en.ingredient 9 | milk :- NP : en.ingredient.milk 10 | spinach :- NP : en.ingredient.spinach 11 | lunch :- NP : en.meal.lunch 12 | dinner :- NP : en.meal.dinner 13 | supper :- NP : en.meal.dinner 14 | -------------------------------------------------------------------------------- /data/overnight/restaurants_lexicon.txt: -------------------------------------------------------------------------------- 1 | restaurant :- NP : en.restaurant 2 | restaurants :- NP : en.restaurant 3 | thai cafe :- NP : en.restaurant.thai_cafe 4 | pizzeria juno :- NP : en.restaurant.pizzeria_juno 5 | pizzeria :- NP : en.restaurant.pizzeria_juno 6 | stars :- NP : en.star 7 | star :- NP : en.star 8 | dollar sign :- NP : en.dollar_sign 9 | dollar signs :- NP : en.dollar_sign 10 | reviews :- NP : en.review 11 | midtown west :- NP : 
en.neighborhood.midtown_west 12 | west midtown :- NP : en.neighborhood.midtown_west 13 | chelsea :- NP : en.neighborhood.chelsea 14 | thai :- NP : en.cuisine.thai 15 | italian :- NP : en.cuisine.italian 16 | lunch :- NP : en.food.lunch 17 | dinner :- NP : en.food.dinner 18 | -------------------------------------------------------------------------------- /data/overnight/socialnetwork_lexicon.txt: -------------------------------------------------------------------------------- 1 | person :- NP : en.person 2 | persons :- NP : en.person 3 | alice :- NP : en.person.alice 4 | bob :- NP : en.person.bob 5 | alices :- NP : en.person.alice 6 | bobs :- NP : en.person.bob 7 | male :- NP : en.gender.male 8 | males :- NP : en.gender.male 9 | female :- NP : en.gender.female 10 | females :- NP : en.gender.female 11 | single :- NP : en.relationship_status.single 12 | singles :- NP : en.relationship_status.single 13 | singlestatus :- NP : en.relationship_status.single 14 | married :- NP : en.relationship_status.married 15 | city :- NP : en.city 16 | cities :- NP : en.city 17 | citys :- NP : en.city 18 | new york :- NP : en.city.new_york 19 | new yorks :- NP : en.city.new_york 20 | newyork :- NP : en.city.new_york 21 | beijing :- NP : en.city.beijing 22 | cm :- NP : en.cm 23 | brown university :- NP : en.university.brown 24 | ucla :- NP : en.university.ucla 25 | ucla university :- NP : en.university.ucla 26 | computer science :- NP : en.field.computer_science 27 | mckinsey :- NP : en.company.mckinsey 28 | mckinseys :- NP : en.company.mckinsey 29 | google :- NP : en.company.google 30 | software engineer :- NP : en.job_title.software_engineer 31 | software engineers :- NP : en.job_title.software_engineer 32 | program manager :- NP : en.job_title.program_manager 33 | program managers :- NP : en.job_title.program_manager 34 | -------------------------------------------------------------------------------- /models/Beam.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | from models import penalties 4 | from utils.constants import * 5 | 6 | class Beam(object): 7 | """ 8 | Class for managing the internals of the beam search process. 9 | Takes care of beams, back pointers, and scores. (Revised from OpenNMT.) 10 | @args: 11 | size (int): beam size 12 | vocab (dict): contains indices of padding, beginning, and ending. 13 | min_length (int): minimum length to generate 14 | global_scorer (:obj:`GlobalScorer`) 15 | device (torch.device) 16 | """ 17 | 18 | def __init__(self, size, vocab, min_length=2, 19 | global_scorer=None, device=None): 20 | 21 | self.size = size 22 | self.device = device 23 | # The score for each translation on the beam. 24 | self.scores = torch.zeros(size, dtype=torch.float, device=self.device) 25 | 26 | # The backpointers at each time-step. 27 | self.prev_ks = [] 28 | 29 | # The outputs at each time-step. 30 | self.next_ys = [torch.zeros(size, dtype=torch.long, device=self.device).fill_(vocab[PAD])] 31 | self.next_ys[0][0] = vocab[BOS] 32 | 33 | # Has EOS topped the beam yet. 34 | self._eos = vocab[EOS] 35 | self.eos_top = False 36 | 37 | # Other special symbols 38 | self._bos = vocab[BOS] 39 | self._pad = vocab[PAD] 40 | 41 | # Time and k pair for finished. 42 | self.finished = [] 43 | 44 | # Information for global scoring. 45 | self.global_scorer = global_scorer 46 | 47 | # Minimum prediction length 48 | self.min_length = min_length 49 | 50 | def get_current_state(self): 51 | "Get the outputs for the current timestep." 52 | return self.next_ys[-1] 53 | 54 | def get_current_origin(self): 55 | "Get the backpointers for the current timestep." 56 | return self.prev_ks[-1] 57 | 58 | def advance(self, word_probs): 59 | """ 60 | Given prob over words for every last beam `K x vocab` and update the beam. 
61 | 62 | Parameters: 63 | 64 | * `word_probs`- probs of advancing from the last step (K x words) 65 | 66 | Returns: True if beam search is complete. 67 | """ 68 | num_words = word_probs.size(1) 69 | # force the output to be longer than self.min_length 70 | cur_len = len(self.next_ys) 71 | masks = torch.zeros(word_probs.size(), requires_grad=False, dtype=torch.float, device=self.device) 72 | masks[:, self._bos] = 1e20 73 | masks[:, self._pad] = 1e20 # prevent generate symbol 74 | if cur_len < self.min_length: 75 | masks[:, self._eos] = 1e20 # prevent terminate too early 76 | word_probs = word_probs - masks 77 | 78 | # Sum the previous scores. 79 | if len(self.prev_ks) > 0: 80 | beam_scores = word_probs + self.scores.unsqueeze(1) 81 | # Don't let EOS have children. 82 | masks = torch.zeros(beam_scores.size(), requires_grad=False, dtype=torch.float, device=self.device) 83 | for i in range(self.next_ys[-1].size(0)): 84 | if self.next_ys[-1][i] == self._eos: 85 | masks[i] = 1e20 86 | beam_scores = beam_scores - masks 87 | else: 88 | beam_scores = word_probs[0] # only start from , not 89 | flat_beam_scores = beam_scores.contiguous().view(-1) 90 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, True, True) 91 | 92 | self.scores = best_scores 93 | 94 | # best_scores_id is flattened beam x word array, so calculate which 95 | # word and beam each score came from 96 | prev_k = best_scores_id / num_words 97 | self.prev_ks.append(prev_k) 98 | self.next_ys.append((best_scores_id - prev_k * num_words)) 99 | 100 | # check whether some sequence has terminated 101 | for i in range(self.next_ys[-1].size(0)): 102 | if self.next_ys[-1][i] == self._eos: 103 | global_scores = self.global_scorer.score(self, self.scores) # normalize score by length penalty 104 | rank_s, s = global_scores[i], self.scores[i] 105 | self.finished.append(([rank_s, s], len(self.next_ys) - 1, i)) 106 | 107 | # End condition is when top-of-beam is EOS and no global score. 
108 | if self.next_ys[-1][0] == self._eos: 109 | self.eos_top = True 110 | return self.done() 111 | 112 | def done(self): 113 | return self.eos_top and len(self.finished) >= self.size 114 | 115 | def sort_best(self): 116 | """ 117 | Sort the current beam. 118 | """ 119 | return torch.sort(self.scores, 0, True) # beam size 120 | 121 | def sort_finished(self, minimum=None): 122 | if minimum is not None: 123 | i = 0 124 | # Add from beam until we have minimum outputs. 125 | while len(self.finished) < minimum: 126 | global_scores = self.global_scorer.score(self, self.scores) 127 | rank_s, s = global_scores[i], self.scores[i] 128 | self.finished.append(([rank_s, s], len(self.next_ys) - 1, i)) 129 | i += 1 130 | 131 | self.finished.sort(key=lambda a: -a[0][0]) 132 | scores = [sc[1] for sc, _, _ in self.finished] 133 | ks = [(t, k) for _, t, k in self.finished] 134 | return scores, ks 135 | 136 | def get_temporary_hyp(self, k): 137 | """ 138 | Get current hypotheses of rank k ( 0 <= rank <= beam_size-1 ). 139 | """ 140 | hyp = [] 141 | for j in range(len(self.prev_ks) - 1, -1, -1): 142 | hyp.append(self.next_ys[j + 1][k]) 143 | k = self.prev_ks[j][k] 144 | return torch.stack(hyp[::-1]) 145 | 146 | def get_hyp(self, timestep, k): 147 | """ 148 | Walk back to construct the full hypothesis. 
class Attention(nn.Module):
    """
    Attention over encoder hidden states, either bilinear ('general',
    score = dec^T W enc) or additive ('feedforward',
    score = v^T tanh(W [dec; enc])).
    """

    METHODS = ['general', 'feedforward']

    def __init__(self, enc_dim, dec_dim, method='feedforward'):
        super(Attention, self).__init__()
        self.enc_dim, self.dec_dim = enc_dim, dec_dim
        assert method in Attention.METHODS
        self.method = method
        if self.method == 'general':
            self.Wa = nn.Linear(self.enc_dim, self.dec_dim, bias=False)
        else:
            self.Wa = nn.Linear(self.enc_dim + self.dec_dim, self.dec_dim, bias=False)
            self.Va = nn.Linear(self.dec_dim, 1, bias=False)

    def forward(self, hiddens, decoder_state, masks):
        '''
        hiddens : bsize x src_lens x enc_dim
        decoder_state : bsize x dec_dim
        masks : bsize x src_lens, ByteTensor (nonzero = real token, 0 = padding)
        @return:
            context : bsize x 1 x enc_dim
            a : normalized coefficient, bsize x src_lens
        '''
        if self.method == 'general':
            keys = self.Wa(hiddens)  # bsize x src_len x dec_dim
            scores = torch.bmm(keys, decoder_state.unsqueeze(-1)).squeeze(dim=-1)
        else:
            # broadcast the decoder state along the source axis (view, no copy)
            query = decoder_state.unsqueeze(dim=1).expand(-1, hiddens.size(1), -1)
            scores = self.Va(torch.tanh(self.Wa(torch.cat([query, hiddens], dim=-1)))).squeeze(dim=-1)
        # padded positions receive -inf so softmax gives them zero weight
        scores = scores.masked_fill(masks == 0, -float('inf'))
        weights = F.softmax(scores, dim=1)
        context = torch.bmm(weights.unsqueeze(dim=1), hiddens)
        return context, weights
def construct_attnptr(
    src_vocab=None, tgt_vocab=None, src_unk_idx=1, tgt_unk_idx=1, pad_src_idxs=[0], pad_tgt_idxs=[0],
    src_emb_size=100, tgt_emb_size=100, hidden_dim=200, bidirectional=True, num_layers=1,
    cell='lstm', dropout=0.5, init=None, **kargs
):
    """
    Construct Seq2Seq model with attention mechanism and pointer network.

    Returns an AttnPtrModel; when `init` is truthy, every parameter is drawn
    from U(-init, init) and the padding embedding rows are reset to zero.
    """
    directions = 2 if bidirectional else 1
    # sub-modules are built in the same fixed order as construct_attn so that
    # seeded random initialization stays reproducible
    transition = StateTransition(num_layers, cell=cell, bidirectional=bidirectional, hidden_dim=hidden_dim)
    attention = Attention(hidden_dim * directions, hidden_dim)
    source_embed = RNNEmbeddings(src_emb_size, src_vocab, src_unk_idx, pad_token_idxs=pad_src_idxs, dropout=dropout)
    encoder_net = RNNEncoder(src_emb_size, hidden_dim, num_layers, cell=cell, bidirectional=bidirectional, dropout=dropout)
    target_embed = RNNEmbeddings(tgt_emb_size, tgt_vocab, tgt_unk_idx, pad_token_idxs=pad_tgt_idxs, dropout=dropout)
    decoder_net = RNNDecoderPointer(tgt_emb_size, hidden_dim, num_layers, attn=attention, cell=cell, dropout=dropout)
    generator_net = GeneratorPointer(tgt_emb_size, tgt_vocab, dropout=dropout)
    model = AttnPtrModel(source_embed, encoder_net, target_embed, decoder_net, transition, generator_net)

    if init:
        for param in model.parameters():
            param.data.uniform_(-init, init)
        # padding rows must start at zero regardless of the uniform init above
        for idx in pad_src_idxs:
            model.src_embed.embed.weight.data[idx].zero_()
        for idx in pad_tgt_idxs:
            model.tgt_embed.embed.weight.data[idx].zero_()
    return model
8 | """ 9 | def __init__(self, tgt_emb_size, hidden_dim, num_layers, attn, cell="lstm", dropout=0.5): 10 | super(RNNDecoder, self).__init__() 11 | self.tgt_emb_size = tgt_emb_size 12 | self.hidden_dim = hidden_dim 13 | self.num_layers = num_layers 14 | self.dropout = dropout if self.num_layers > 1 else 0 15 | self.cell = cell.upper() 16 | self.rnn_decoder = getattr(nn, self.cell)(self.tgt_emb_size, self.hidden_dim, 17 | num_layers=self.num_layers, bidirectional=False, batch_first=True, dropout=self.dropout) 18 | self.attn = attn 19 | self.affine = nn.Linear(self.hidden_dim + self.attn.enc_dim, self.tgt_emb_size) 20 | self.dropout_layer = nn.Dropout(p=dropout) 21 | 22 | def forward(self, x, hidden_states, memory, src_mask, copy_tokens=None): 23 | """ 24 | x: decoder input embeddings, bsize x tgt_len x emb_size 25 | hidden_states: previous decoder state 26 | memory: encoder output, bsize x src_len x hidden_dim*2 27 | src_mask: bsize x src_lens 28 | copy_tokens: to be compatible with pointer network 29 | """ 30 | out, hidden_states = self.rnn_decoder(x, hidden_states) 31 | context = [] 32 | for i in range(out.size(1)): 33 | tmp_context, _ = self.attn(memory, out[:, i, :], src_mask) 34 | context.append(tmp_context) 35 | context = torch.cat(context, dim=1) 36 | feats = torch.cat([out, context], dim=-1) 37 | feats = self.affine(self.dropout_layer(feats)) 38 | return feats, hidden_states 39 | -------------------------------------------------------------------------------- /models/decoder/decoder_rnn_pointer.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import torch 3 | import torch.nn as nn 4 | 5 | class RNNDecoderPointer(nn.Module): 6 | """ 7 | Generic unidirectional RNN layers containing StateTransition and Attention modules. 
8 | """ 9 | def __init__(self, tgt_emb_size, hidden_dim, num_layers, attn, cell="lstm", dropout=0.5): 10 | super(RNNDecoderPointer, self).__init__() 11 | self.tgt_emb_size = tgt_emb_size 12 | self.hidden_dim = hidden_dim 13 | self.num_layers = num_layers 14 | self.dropout = dropout if self.num_layers > 1 else 0 15 | self.cell = cell.upper() 16 | self.rnn_decoder = getattr(nn, self.cell)(self.tgt_emb_size, self.hidden_dim, 17 | num_layers=self.num_layers, bidirectional=False, batch_first=True, dropout=self.dropout) 18 | self.attn = attn 19 | self.gate = nn.Linear(self.hidden_dim + self.attn.enc_dim + self.tgt_emb_size, 1) 20 | self.affine = nn.Linear(self.hidden_dim + self.attn.enc_dim, self.tgt_emb_size) 21 | self.dropout_layer = nn.Dropout(p=dropout) 22 | 23 | def forward(self, x, hidden_states, memory, src_mask, copy_tokens=None): 24 | """ 25 | x: decoder input embeddings, bsize x tgt_len x emb_size 26 | hidden_states: previous decoder state 27 | memory: memory and hidden_states 28 | src_mask: mask on src input, bsize x src_lens 29 | copy_tokens: bsize x src_lens x vocab_size 30 | @return: 31 | feats: bsize x tgt_lens x (dec_dim + enc_dim) 32 | copy_distribution: bsize x tgt_lens x (vocab_size + MAX_OOV_NUM) 33 | gate_scores: bsize x tgt_lens x 1 34 | """ 35 | out, hidden_states = self.rnn_decoder(x, hidden_states) 36 | context, pointer = [], [] 37 | for i in range(out.size(1)): 38 | tmp_context, tmp_ptr = self.attn(memory, out[:, i, :], src_mask) 39 | context.append(tmp_context) 40 | pointer.append(tmp_ptr.unsqueeze(dim=1)) 41 | context, pointer = torch.cat(context, dim=1), torch.cat(pointer, dim=1) 42 | feats = self.dropout_layer(torch.cat([out, context], dim=-1)) 43 | gate_scores = torch.sigmoid(self.gate(torch.cat([feats, x], dim=-1))) 44 | feats = self.affine(feats) 45 | copy_distribution = torch.bmm(pointer, copy_tokens) 46 | return feats, hidden_states, copy_distribution, gate_scores 47 | 
class DualLearning(nn.Module):
    """
    Dual-learning wrapper around a semantic-parsing model (sp: question ->
    logical form) and a question-generation model (qg: logical form ->
    question). Each training step runs one closed cycle (sp -> qg or
    qg -> sp) and produces policy-gradient-style losses built from a validity
    reward on the intermediate output and a reconstruction reward on the
    round trip.
    """

    def __init__(self, sp_model, qg_model, reward_model, sp_vocab, qg_vocab,
            alpha=0.5, beta=0.5, sample=5, reduction='sum', sp_device=None, qg_device=None, **kargs):
        """
        @args:
            1. alpha: reward for cycle starting from sp_reward = val_reward * alpha + rec_reward * (1 - alpha)
            2. beta: reward for cycle starting from qg_reward = val_reward * beta + rec_reward * (1 - beta)
            3. sample: beam search and sample size for training in dual learning cycles
        """
        super(DualLearning, self).__init__()
        # the two directions may live on different devices
        self.sp_device = sp_device
        self.qg_device = qg_device
        self.sp_model = sp_model.to(self.sp_device)
        self.qg_model = qg_model.to(self.qg_device)
        self.reward_model = reward_model
        self.alpha, self.beta, self.sample = alpha, beta, sample
        self.reduction = reduction  # 'sum' sums losses over the batch, anything else averages
        self.sp_vocab = sp_vocab
        self.qg_vocab = qg_vocab

    def forward(self, *args, start_from='semantic_parsing', **kargs):
        """
        @args:
            *args(tensors): positional arguments for semantic parsing or question generation
            start_from(enum): semantic_parsing or question_generation
        """
        if start_from == 'semantic_parsing':
            return self.cycle_start_from_sp(*args, **kargs)
        elif start_from == 'question_generation':
            return self.cycle_start_from_qg(*args, **kargs)
        else:
            raise ValueError('[Error]: dual learning cycle with unknown starting point !')

    def cycle_start_from_sp(self, inputs, lens, copy_tokens, oov_list, raw_in):
        """
        One dual cycle question -> logical form -> question.
        Returns (sp_loss, qg_loss) scalars.
        """
        domain = Example.domain
        # primal model: decode `sample` logical-form candidates per input
        results = self.sp_model.decode_batch(inputs, lens, self.sp_vocab.lf2id, copy_tokens, self.sample, self.sample)
        predictions, sp_scores = results['predictions'], results['scores']
        predictions = [idx for each in predictions for idx in each]  # flatten to bsize * sample candidates
        predictions = domain.reverse(predictions, self.sp_vocab.id2lf, oov_list=oov_list)
        raw_in = [each for each in raw_in for _ in range(self.sample)] # repeat sample times

        # validity reward for each candidate (reward model choice 'sp_val'),
        # with the per-input mean subtracted as a baseline to reduce variance
        sp_val_reward = self.reward_model(predictions, choice='sp_val').contiguous().view(-1, self.sample)
        baseline = sp_val_reward.mean(dim=-1, keepdim=True)
        sp_val_reward -= baseline

        # dual model: score reconstruction of the original question from each candidate
        qg_inputs, qg_lens, qg_dec_inputs, qg_dec_outputs, qg_out_lens, qg_copy_tokens = \
            self.sp2qg(predictions, raw_in, vocab=self.qg_vocab, device=self.qg_device)
        logscore = self.qg_model(qg_inputs, qg_lens, qg_dec_inputs[:, :-1], qg_copy_tokens)

        # reconstruction reward: the detached CPU copy is the reward signal for the
        # sp policy, while the differentiable rec_reward trains qg directly below
        rec_reward = self.reward_model(logscore, qg_dec_outputs[:, 1:], qg_out_lens - 1, choice='sp_rec').contiguous().view(-1, self.sample)
        sp_rec_reward = rec_reward.detach().cpu()
        baseline = sp_rec_reward.mean(dim=-1, keepdim=True)
        sp_rec_reward = sp_rec_reward - baseline

        # reward-weighted candidate scores (policy-gradient style objective)
        total_reward = self.alpha * sp_val_reward + (1 - self.alpha) * sp_rec_reward
        sp_loss = - torch.mean(total_reward.to(self.sp_device) * sp_scores, dim=1)
        sp_loss = torch.sum(sp_loss) if self.reduction == 'sum' else torch.mean(sp_loss)
        qg_loss = - torch.mean((1 - self.alpha) * rec_reward, dim=1)
        qg_loss = torch.sum(qg_loss) if self.reduction == 'sum' else torch.mean(qg_loss)
        return sp_loss, qg_loss

    def sp2qg(self, lf_list, utterances, vocab, device):
        # re-batch (logical form, question) pairs as qg examples on the qg device
        ex_list = [Example(' '.join(sent), ' '.join(lf)) for sent, lf in zip(utterances, lf_list)]
        inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = \
            get_minibatch_qg(ex_list, vocab, device, copy=self.qg_model.copy)
        return inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens

    def cycle_start_from_qg(self, inputs, lens, copy_tokens, oov_list, raw_in):
        """
        One dual cycle logical form -> question -> logical form; mirror of
        cycle_start_from_sp, weighted by beta instead of alpha.
        """
        domain = Example.domain
        # primal model
        results = self.qg_model.decode_batch(inputs, lens, self.qg_vocab.word2id, copy_tokens, self.sample, self.sample)
        predictions, qg_scores = results['predictions'], results['scores']
        predictions = [idx for each in predictions for idx in each]
        predictions = domain.reverse(predictions, self.qg_vocab.id2word, oov_list=oov_list)
        raw_in = [each for each in raw_in for _ in range(self.sample)] # repeat sample times

        # validity reward for each generated question, baseline-subtracted
        qg_val_reward = self.reward_model(predictions, choice='qg_val').contiguous().view(-1, self.sample)
        baseline = qg_val_reward.mean(dim=-1, keepdim=True)
        qg_val_reward -= baseline

        # dual model
        sp_inputs, sp_lens, sp_dec_inputs, sp_dec_outputs, sp_out_lens, sp_copy_tokens = \
            self.qg2sp(predictions, raw_in, self.sp_vocab, self.sp_device)
        logscore = self.sp_model(sp_inputs, sp_lens, sp_dec_inputs[:, :-1], sp_copy_tokens)

        # reconstruction reward (detached copy rewards qg; differentiable copy trains sp)
        rec_reward = self.reward_model(logscore, sp_dec_outputs[:, 1:], sp_out_lens - 1, choice='qg_rec').contiguous().view(-1, self.sample)
        qg_rec_reward = rec_reward.detach().cpu()
        baseline = qg_rec_reward.mean(dim=-1, keepdim=True)
        qg_rec_reward = qg_rec_reward - baseline

        total_reward = self.beta * qg_val_reward + (1 - self.beta) * qg_rec_reward
        qg_loss = - torch.mean(total_reward.to(self.qg_device) * qg_scores, dim=1)
        qg_loss = torch.sum(qg_loss) if self.reduction == 'sum' else torch.mean(qg_loss)
        sp_loss = - torch.mean((1 - self.beta) * rec_reward, dim=1)
        sp_loss = torch.sum(sp_loss) if self.reduction == 'sum' else torch.mean(sp_loss)
        return sp_loss, qg_loss

    def qg2sp(self, utterances, lf_list, vocab, device):
        # re-batch (question, logical form) pairs as sp examples on the sp device
        ex_list = [Example(' '.join(sent), ' '.join(lf)) for sent, lf in zip(utterances, lf_list)]
        inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = \
            get_minibatch_sp(ex_list, vocab, device, copy=self.sp_model.copy)
        return inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens

    def decode_batch(self, *args, task='semantic_parsing', **kargs):
        # evaluation-time decoding, dispatched to the requested direction
        if task == 'semantic_parsing':
            return self.sp_model.decode_batch(*args, **kargs)
        elif task == 'question_generation':
            return self.qg_model.decode_batch(*args, **kargs)
        else:
            raise ValueError('[Error]: unknown task name !')

    def pad_embedding_grad_zero(self):
        # keep padding embeddings frozen in both sub-models
        self.sp_model.pad_embedding_grad_zero()
        self.qg_model.pad_embedding_grad_zero()

    def load_model(self, sp_load_dir=None, qg_load_dir=None):
        # each sub-model can be (re)loaded independently
        if sp_load_dir is not None:
            self.sp_model.load_model(sp_load_dir)
        if qg_load_dir is not None:
            self.qg_model.load_model(qg_load_dir)

    def save_model(self, sp_save_dir=None, qg_save_dir=None):
        # each sub-model can be saved independently
        if sp_save_dir is not None:
            self.sp_model.save_model(sp_save_dir)
        if qg_save_dir is not None:
            self.qg_model.save_model(qg_save_dir)
class StateTransition(nn.Module):
    """
    Maps the encoder's final hidden states to the decoder's initial hidden
    states, handling both LSTM (h, c) tuples and single-tensor states (GRU/RNN).
    """

    METHODS = ['affine', 'reverse', 'tanh(affine)', 'empty']

    def __init__(self, num_layers, cell='lstm', bidirectional=True, hidden_dim=None, method='empty'):
        """
        Transform encoder final hidden states to decoder initial hidden states

        method:
            'empty'        - start the decoder from all-zero states
            'reverse'      - take the backward-direction states of a bidirectional encoder
            'affine'       - linear map from concatenated directions down to hidden_dim
            'tanh(affine)' - same, followed by tanh
        """
        super(StateTransition, self).__init__()
        self.cell = cell.upper()
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        assert method in StateTransition.METHODS
        self.method = method
        if 'affine' in self.method:
            # hidden_dim is needed to size the projection layers
            assert hidden_dim
            self.h_affine = nn.Linear(hidden_dim * self.num_directions, hidden_dim)
            if self.cell == 'LSTM':
                # LSTM also carries a cell state, projected separately
                self.c_affine = nn.Linear(hidden_dim * self.num_directions, hidden_dim)

    def forward(self, hidden_states):
        """
        hidden_states: encoder final states; an (h, c) tuple for LSTM, a single
        tensor otherwise — presumably shaped
        (num_layers * num_directions, bsize, hidden_dim) as nn.LSTM/nn.GRU return.
        Returns decoder initial states with a leading dim of num_layers.
        """
        if self.method == 'empty':
            # fresh zero states sized to the decoder's layer count
            if 'LSTM' in self.cell:
                enc_h, enc_c = hidden_states
                dec_h = enc_h.new_zeros(self.num_layers, enc_h.size(1), enc_h.size(2))
                dec_c = enc_c.new_zeros(self.num_layers, enc_c.size(1), enc_c.size(2))
                hidden_states = (dec_h, dec_c)
            else:
                enc_h = hidden_states
                dec_h = enc_h.new_zeros(self.num_layers, enc_h.size(1), enc_h.size(2))
                hidden_states = dec_h
        elif self.method == 'reverse':
            if self.num_directions == 2:
                # odd slices hold the backward direction of each layer
                index_slices = [2 * i + 1 for i in range(self.num_layers)] # from reversed path
                index_slices = torch.tensor(index_slices, dtype=torch.long, device=hidden_states[0].device)
                if self.cell == 'LSTM':
                    enc_h, enc_c = hidden_states
                    dec_h = torch.index_select(enc_h, 0, index_slices)
                    dec_c = torch.index_select(enc_c, 0, index_slices)
                    hidden_states = (dec_h.contiguous(), dec_c.contiguous())
                else:
                    enc_h = hidden_states
                    dec_h = torch.index_select(enc_h, 0, index_slices)
                    hidden_states = dec_h.contiguous()
            else:
                pass # do nothing, pass states directly
        else:
            # 'affine' / 'tanh(affine)': project each layer's concatenated
            # direction states down to hidden_dim
            if self.cell == 'LSTM':
                enc_h, enc_c = hidden_states
                batches = enc_h.size(1)
                # (layers*dirs, bsize, hid) -> (bsize*layers, dirs*hid); the view
                # relies on the layout grouping num_directions slices per layer
                dec_h = self.h_affine(enc_h.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                dec_c = self.c_affine(enc_c.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                if "tanh" in self.method:
                    dec_h, dec_c = torch.tanh(dec_h), torch.tanh(dec_c)
                dec_h = dec_h.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                dec_c = dec_c.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                hidden_states = (dec_h, dec_c)
            else:
                enc_h, batches = hidden_states, hidden_states.size(1)
                dec_h = self.h_affine(enc_h.transpose(0, 1).contiguous().view(batches * self.num_layers, -1))
                if "tanh" in self.method:
                    dec_h = torch.tanh(dec_h)
                dec_h = dec_h.contiguous().view(batches, self.num_layers, -1).transpose(0, 1).contiguous()
                hidden_states = dec_h
        return hidden_states
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture.

    Holds the six sub-modules (src_embed, encoder, enc2dec, tgt_embed, decoder,
    generator); subclasses implement forward and the decode_* methods.
    """
    def __init__(self, src_embed, encoder, tgt_embed, decoder, enc2dec, generator):
        """
        All the arguments are of type nn.Module
        """
        super(EncoderDecoder, self).__init__()
        self.src_embed = src_embed
        self.encoder = encoder
        self.enc2dec = enc2dec
        self.tgt_embed = tgt_embed
        self.decoder = decoder
        self.generator = generator

    def forward(self, *args, **kargs):
        raise NotImplementedError

    def decode_batch(self, *args, **kargs):
        raise NotImplementedError

    def decode_greed(self, *args, **kargs):
        raise NotImplementedError

    def decode_beam_search(self, *args, **kargs):
        raise NotImplementedError

    def pad_embedding_grad_zero(self):
        # keep padding embedding rows fixed at zero after backward
        self.src_embed.pad_embedding_grad_zero()
        self.tgt_embed.pad_embedding_grad_zero()

    def load_model(self, load_dir):
        """Load parameters from the file path `load_dir`, mapping onto CPU."""
        # Pass the path straight to torch.load: the previous open(load_dir, 'rb')
        # created a file handle that was never closed (resource leak).
        self.load_state_dict(torch.load(load_dir, map_location=lambda storage, loc: storage))

    def save_model(self, save_dir):
        """Serialize the current state_dict to the file path `save_dir`."""
        # torch.save opens and closes the file itself; the previous
        # open(save_dir, 'wb') leaked the handle.
        torch.save(self.state_dict(), save_dir)
class GeneratorPointer(nn.Module):
    """
    Output layer mixing a vocabulary softmax with a copy (pointer)
    distribution through a per-step soft gate.
    """
    def __init__(self, feats, vocab, dropout=0.5):
        super(GeneratorPointer, self).__init__()
        self.proj = nn.Linear(feats, vocab)
        self.dropout_layer = nn.Dropout(p=dropout)

    def forward(self, x, copy_distribution, gate_scores):
        """
        x: bsize x tgt_lens x (dec_dim + enc_dim)
        copy_distribution: bsize x tgt_lens x (vocab_size + MAX_OOV_NUM)
        gate_scores: bsize x tgt_lens x 1
        @return: log of the gated mixture, same shape as copy_distribution
        """
        vocab_dist = F.softmax(self.proj(self.dropout_layer(x)), dim=-1)
        # generation can never produce OOV ids, so pad with zero probability
        oov_pad = torch.zeros(x.size(0), x.size(1), MAX_OOV_NUM, dtype=torch.float, device=x.device)
        gen_dist = torch.cat([vocab_dist, oov_pad], dim=-1)
        mixture = gate_scores * gen_dist + (1 - gate_scores) * copy_distribution
        # 1e-20 keeps the log finite where both distributions assign zero mass
        return torch.log(mixture + 1e-20)
Container module with an encoder, a recurrent module, and a decoder. 13 | """ 14 | def __init__(self, vocab_size=950, emb_size=1024, hidden_dim=256, 15 | num_layers=1, cell='lstm', pad_token_idxs=[], dropout=0.5, 16 | decoder_tied=False, init=0.2, **kargs): 17 | super(LanguageModel, self).__init__() 18 | self.dropout_layer = nn.Dropout(dropout) 19 | self.encoder = nn.Embedding(vocab_size, emb_size) 20 | self.cell = cell.upper() # RNN/LSTM/GRU 21 | self.rnn = getattr(nn, self.cell)( 22 | emb_size, hidden_dim, num_layers, 23 | batch_first=True, dropout=(dropout if num_layers > 1 else 0) 24 | ) 25 | self.affine = nn.Linear(hidden_dim, emb_size) 26 | self.decoder = nn.Linear(emb_size, vocab_size) 27 | 28 | if decoder_tied: 29 | self.decoder.weight = self.encoder.weight # shape: vocab_size, emb_size 30 | 31 | self.hidden_dim = hidden_dim 32 | self.num_layers = num_layers 33 | self.pad_token_idxs = list(pad_token_idxs) 34 | 35 | if init: 36 | for p in self.parameters(): 37 | p.data.uniform_(-init, init) 38 | for pad_token_idx in pad_token_idxs: 39 | self.encoder.weight.data[pad_token_idx].zero_() 40 | 41 | def pad_embedding_grad_zero(self): 42 | for pad_token_idx in self.pad_token_idxs: 43 | self.encoder.weight.grad[pad_token_idx].zero_() 44 | 45 | def forward(self, input_feats, lens): 46 | input_feats, lens = input_feats[:, :-1], lens - 1 47 | emb = self.dropout_layer(self.encoder(input_feats)) # bsize, seq_length, emb_size 48 | output, _ = rnn_wrapper(self.rnn, emb, lens, self.cell) 49 | decoded = self.decoder(self.affine(self.dropout_layer(output))) 50 | scores = F.log_softmax(decoded, dim=-1) 51 | return scores 52 | 53 | def sent_logprobability(self, input_feats, lens): 54 | ''' 55 | Given sentences, calculate its length-normalized log-probability 56 | Sequence must contain and symbol 57 | lens: length tensor 58 | ''' 59 | lens = lens - 1 60 | input_feats, output_feats = input_feats[:, :-1], input_feats[:, 1:] 61 | emb = self.dropout_layer(self.encoder(input_feats)) 
# bsize, seq_len, emb_size 62 | output, _ = rnn_wrapper(self.rnn, emb, lens, self.cell) 63 | decoded = self.decoder(self.affine(self.dropout_layer(output))) 64 | scores = F.log_softmax(decoded, dim=-1) 65 | log_prob = torch.gather(scores, 2, output_feats.unsqueeze(-1)).contiguous().view(output.size(0), output.size(1)) 66 | sent_log_prob = torch.sum(log_prob * lens2mask(lens).float(), dim=-1) 67 | return sent_log_prob / lens.float() 68 | 69 | def load_model(self, load_dir): 70 | self.load_state_dict(torch.load(open(load_dir, 'rb'), map_location=lambda storage, loc: storage)) 71 | 72 | def save_model(self, save_dir): 73 | torch.save(self.state_dict(), open(save_dir, 'wb')) 74 | -------------------------------------------------------------------------------- /models/model_attn.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import torch 3 | import torch.nn as nn 4 | from utils.constants import BOS, EOS, MAX_DECODE_LENGTH 5 | from models.model_utils import tile, lens2mask 6 | from models.Beam import Beam, GNMTGlobalScorer 7 | from models.encoder_decoder import EncoderDecoder 8 | 9 | class AttnModel(EncoderDecoder): 10 | 11 | def __init__(self, *args, **kargs): 12 | super(AttnModel, self).__init__(*args, **kargs) 13 | self.copy = False 14 | 15 | """ 16 | We use copy_tokens, just to be compatible with Attention Pointer Model 17 | """ 18 | def forward(self, src_inputs, src_lens, tgt_inputs, copy_tokens=None): 19 | """ 20 | Used during training time. 
21 | """ 22 | enc_out, hidden_states = self.encoder(self.src_embed(src_inputs), src_lens) 23 | hidden_states = self.enc2dec(hidden_states) 24 | src_mask = lens2mask(src_lens) 25 | dec_out, _ = self.decoder(self.tgt_embed(tgt_inputs), hidden_states, enc_out, src_mask, copy_tokens) 26 | out = self.generator(dec_out) 27 | return out 28 | 29 | def decode_batch(self, src_inputs, src_lens, vocab, copy_tokens=None, 30 | beam_size=5, n_best=1, alpha=0.6, length_pen='avg'): 31 | enc_out, hidden_states = self.encoder(self.src_embed(src_inputs), src_lens) 32 | hidden_states = self.enc2dec(hidden_states) 33 | src_mask = lens2mask(src_lens) 34 | if beam_size == 1: 35 | return self.decode_greed(hidden_states, enc_out, src_mask, vocab, copy_tokens) 36 | else: 37 | return self.decode_beam_search(hidden_states, enc_out, src_mask, vocab, copy_tokens, 38 | beam_size=beam_size, n_best=n_best, alpha=alpha, length_pen=length_pen) 39 | 40 | def decode_greed(self, hidden_states, memory, src_mask, vocab, copy_tokens=None): 41 | """ 42 | hidden_states: hidden_states from encoder 43 | memory: encoder output, bsize x src_len x enc_dim 44 | src_mask: ByteTensor, bsize x max_src_len 45 | vocab: tgt word2idx dict containing BOS, EOS 46 | """ 47 | results = {"scores":[], "predictions":[]} 48 | 49 | # first target token is BOS 50 | batches = memory.size(0) 51 | ys = torch.ones(batches, 1, dtype=torch.long).fill_(vocab[BOS]).to(memory.device) 52 | # record whether each sample is finished 53 | all_done = torch.tensor([False] * batches, dtype=torch.uint8, device=memory.device) 54 | scores = torch.zeros(batches, 1, dtype=torch.float, device=memory.device) 55 | predictions = [[] for i in range(batches)] 56 | 57 | for i in range(MAX_DECODE_LENGTH): 58 | logprob, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens) 59 | maxprob, ys = torch.max(logprob, dim=1, keepdim=True) 60 | for i in range(batches): 61 | if not all_done[i]: 62 | scores[i] += maxprob[i] 63 | 
predictions[i].append(ys[i]) 64 | done = ys.squeeze(dim=1) == vocab[EOS] 65 | all_done |= done 66 | if all_done.all(): 67 | break 68 | results["predictions"], results["scores"] = [[torch.cat(pred).tolist()] for pred in predictions], scores 69 | return results 70 | 71 | def decode_one_step(self, ys, hidden_states, memory, src_mask, copy_tokens=None): 72 | """ 73 | ys: bsize x 1 74 | """ 75 | dec_out, hidden_states = self.decoder(self.tgt_embed(ys), hidden_states, memory, src_mask, copy_tokens) 76 | out = self.generator(dec_out) 77 | return out.squeeze(dim=1), hidden_states 78 | 79 | def decode_beam_search(self, hidden_states, memory, src_mask, vocab, copy_tokens=None, 80 | beam_size=5, n_best=1, alpha=0.6, length_pen='avg'): 81 | """ 82 | Beam search decoding 83 | """ 84 | results = {"scores":[], "predictions":[]} 85 | 86 | # Construct beams, we donot use stepwise coverage penalty nor ngrams block 87 | remaining_sents = memory.size(0) 88 | global_scorer = GNMTGlobalScorer(alpha, length_pen) 89 | beam = [ Beam(beam_size, vocab, global_scorer=global_scorer, device=memory.device) 90 | for _ in range(remaining_sents) ] 91 | 92 | # repeat beam_size times 93 | memory, src_mask, copy_tokens = tile([memory, src_mask, copy_tokens], beam_size, dim=0) 94 | hidden_states = tile(hidden_states, beam_size, dim=1) 95 | h_c = type(hidden_states) in [list, tuple] 96 | batch_idx = list(range(remaining_sents)) 97 | 98 | for i in range(MAX_DECODE_LENGTH): 99 | # (a) construct beamsize * remaining_sents next words 100 | ys = torch.stack([b.get_current_state() for b in beam if not b.done()]).contiguous().view(-1,1) 101 | 102 | # (b) pass through the decoder network 103 | out, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens) 104 | out = out.contiguous().view(remaining_sents, beam_size, -1) 105 | 106 | # (c) advance each beam 107 | active, select_indices_array = [], [] 108 | # Loop over the remaining_batch number of beam 109 | for b in 
    def decode_beam_search(self, hidden_states, memory, src_mask, vocab, copy_tokens=None,
            beam_size=5, n_best=1, alpha=0.6, length_pen='avg'):
        """
        Beam search decoding.
        hidden_states: initial decoder hidden states; memory: encoder output
        (bsize x src_len x enc_dim); src_mask: bsize x max_src_len; vocab: tgt
        word2idx dict containing BOS/EOS; alpha/length_pen configure the GNMT
        length penalty. Returns {"predictions": per-sample list of n_best
        hypotheses, "scores": bsize x n_best FloatTensor}.
        """
        results = {"scores":[], "predictions":[]}

        # Construct beams, we donot use stepwise coverage penalty nor ngrams block
        remaining_sents = memory.size(0)
        global_scorer = GNMTGlobalScorer(alpha, length_pen)
        beam = [ Beam(beam_size, vocab, global_scorer=global_scorer, device=memory.device)
            for _ in range(remaining_sents) ]

        # repeat beam_size times: batch dim for memory/mask, dim 1 for RNN states
        memory, src_mask, copy_tokens = tile([memory, src_mask, copy_tokens], beam_size, dim=0)
        hidden_states = tile(hidden_states, beam_size, dim=1)
        h_c = type(hidden_states) in [list, tuple]  # LSTM returns an (h, c) pair
        batch_idx = list(range(remaining_sents))

        for i in range(MAX_DECODE_LENGTH):
            # (a) construct beamsize * remaining_sents next words
            ys = torch.stack([b.get_current_state() for b in beam if not b.done()]).contiguous().view(-1,1)

            # (b) pass through the decoder network
            out, hidden_states = self.decode_one_step(ys, hidden_states, memory, src_mask, copy_tokens)
            out = out.contiguous().view(remaining_sents, beam_size, -1)

            # (c) advance each beam
            active, select_indices_array = [], []
            # Loop over the remaining_batch number of beam
            for b in range(remaining_sents):
                idx = batch_idx[b] # idx represent the original order in minibatch_size
                beam[idx].advance(out[b])
                if not beam[idx].done():
                    active.append((idx, b))
                # origins are collected for every beam (finished ones too) so the
                # reorder below keeps hidden_states at remaining_sents * beam_size rows
                select_indices_array.append(beam[idx].get_current_origin() + b * beam_size)

            # (d) update hidden_states history: reorder along the beam axis to
            # follow each surviving hypothesis' backpointer
            select_indices_array = torch.cat(select_indices_array, dim=0)
            if h_c:
                hidden_states = (hidden_states[0].index_select(1, select_indices_array), hidden_states[1].index_select(1, select_indices_array))
            else:
                hidden_states = hidden_states.index_select(1, select_indices_array)

            if not active:
                break

            # (e) reserve un-finished batches
            active_idx = torch.tensor([item[1] for item in active], dtype=torch.long, device=memory.device) # original order in remaining batch
            batch_idx = { idx: item[0] for idx, item in enumerate(active) } # order for next remaining batch

            def update_active(t):
                # drop finished sentences from a (remaining_sents*beam_size, ...) tensor
                if t is None: return t
                t_reshape = t.contiguous().view(remaining_sents, beam_size, -1)
                new_size = list(t.size())
                new_size[0] = -1
                return t_reshape.index_select(0, active_idx).view(*new_size)

            if h_c:
                hidden_states = (
                    update_active(hidden_states[0].transpose(0, 1)).transpose(0, 1).contiguous(),
                    update_active(hidden_states[1].transpose(0, 1)).transpose(0, 1).contiguous()
                )
            else:
                hidden_states = update_active(hidden_states.transpose(0, 1)).transpose(0, 1).contiguous()
            memory = update_active(memory)
            src_mask = update_active(src_mask)
            copy_tokens = update_active(copy_tokens)
            remaining_sents = len(active)

        # collect the n_best finished hypotheses from every beam
        for b in beam:
            scores, ks = b.sort_finished(minimum=n_best)
            hyps = []
            for i, (times, k) in enumerate(ks[:n_best]):
                hyp = b.get_hyp(times, k)
                hyps.append(hyp.tolist()) # NOTE(review): original comment was truncated; hyp presumably includes EOS but not BOS — confirm against Beam.get_hyp
            results["predictions"].append(hyps) # batch list of variable_tgt_len
            results["scores"].append(torch.stack(scores)[:n_best]) # list of [n_best], torch.FloatTensor
        results["scores"] = torch.stack(results["scores"])
        return results
class AttnPtrModel(AttnModel):
    """Attention-based seq2seq model extended with a pointer (copy) mechanism.

    Wiring is inherited from AttnModel; the decoder and generator here
    additionally exchange a copy distribution and mixing gates.
    """

    def __init__(self, *args, **kargs):
        super(AttnPtrModel, self).__init__(*args, **kargs)
        # flag this model as copy-enabled for the rest of the pipeline
        self.copy = True

    def forward(self, src_inputs, src_lens, tgt_inputs, copy_tokens):
        """Teacher-forcing forward pass used during training time."""
        src_mask = lens2mask(src_lens)
        memory, enc_states = self.encoder(self.src_embed(src_inputs), src_lens)
        dec_init = self.enc2dec(enc_states)
        dec_out, _, copy_dist, gates = self.decoder(self.tgt_embed(tgt_inputs), dec_init, memory, src_mask, copy_tokens)
        return self.generator(dec_out, copy_dist, gates)

    def decode_one_step(self, ys, hidden_states, memory, src_mask, copy_tokens):
        """Advance decoding by one target token ys (bsize x 1)."""
        dec_out, hidden_states, copy_dist, gates = self.decoder(self.tgt_embed(ys), hidden_states, memory, src_mask, copy_tokens)
        step_scores = self.generator(dec_out, copy_dist, gates).squeeze(dim=1)
        return step_scores, hidden_states
def lens2mask(lens):
    """Build a boolean padding mask from a 1-D length tensor.

    @args:
        lens(torch.LongTensor): sequence length per sample, shape (bsize,)
    @return:
        masks: shape (bsize, max_len); True marks real (non-padding) positions
    """
    num_seqs, longest = lens.numel(), lens.max()
    positions = torch.arange(0, longest).type_as(lens).to(lens.device)
    # one row of positions per sequence, compared against that sequence's length
    masks = positions.repeat(num_seqs, 1).lt(lens.unsqueeze(1))
    masks.requires_grad = False
    return masks
class PenaltyBuilder(object):
    """Factory mapping a length-penalty option name to its scoring function.

    Args:
        length_pen (str): option name of length pen ('wu', 'avg';
            anything else selects the identity penalty)
    """

    def __init__(self, length_pen):
        self.length_pen = length_pen

    def length_penalty(self):
        """Return the penalty callable selected by ``self.length_pen``."""
        dispatch = {"wu": self.length_wu, "avg": self.length_average}
        return dispatch.get(self.length_pen, self.length_none)

    # ------------------------------------------------------------------
    # The individual penalty terms implemented so far
    # ------------------------------------------------------------------

    def length_wu(self, beam, logprobs, alpha=0.):
        """NMT length re-ranking score from
        "Google's Neural Machine Translation System" :cite:`wu2016google`.
        """
        modifier = ((5 + len(beam.next_ys)) ** alpha) / ((5 + 1) ** alpha)
        return logprobs / modifier

    def length_average(self, beam, logprobs, alpha=0.):
        """Average log-probability per token in the sequence."""
        return logprobs / len(beam.next_ys)

    def length_none(self, beam, logprobs, alpha=0.):
        """Identity: return the scores unmodified."""
        return logprobs
class RewardModel():
    """Reward provider for dual learning.

    Combines language-model fluency scores for questions / logical forms, a
    grammar (executability) check for logical forms, and reconstruction
    rewards computed from the dual cycle's log-probabilities.
    """

    def __init__(self, dataset, qlm, lflm, lm_vocab, sp_device='cpu', qg_device='cpu'):
        super(RewardModel, self).__init__()
        self.dataset = dataset
        # the question LM is queried during the sp cycle (sp_device), the
        # logical form LM during the qg cycle (qg_device)
        self.qlm = qlm.to(sp_device)
        self.lflm = lflm.to(qg_device)
        self.vocab = lm_vocab
        self.sp_device = sp_device
        self.qg_device = qg_device

    def forward(self, *args, choice='sp_val'):
        """Dispatch to a reward: 'sp_val', 'qg_val', or any choice containing 'rec'."""
        if choice == 'sp_val':
            return self.sp_validity_reward(*args)
        elif choice == 'qg_val':
            return self.qg_validity_reward(*args)
        elif 'rec' in choice:
            return self.reconstruction_reward(*args)
        else:
            raise ValueError('[Error]: unknown reward choice !')

    def sp_validity_reward(self, lf_list):
        # calculate logical form language model length normalized log probability
        # bugfix: EOS must be looked up in the logical form vocab (lf2id); the
        # original used word2id[EOS], mixing the question vocab into lf indices
        input_idxs = [[self.vocab.lf2id[BOS]] + [self.vocab.lf2id[word] if word in self.vocab.lf2id else self.vocab.lf2id[UNK] for word in sent] + [self.vocab.lf2id[EOS]] for sent in lf_list]
        lens = [len(each) for each in input_idxs]
        max_len = max(lens)
        input_idxs = [sent + [self.vocab.lf2id[PAD]] * (max_len - len(sent)) for sent in input_idxs]
        input_tensor = torch.tensor(input_idxs, dtype=torch.long, device=self.qg_device)
        lens = torch.tensor(lens, dtype=torch.long, device=self.qg_device)
        self.lflm.eval()
        with torch.no_grad():
            logprob = self.lflm.sent_logprobability(input_tensor, lens).cpu()
        # grammar check: does the normalized logical form yield a valid denotation
        domain = Example.domain
        ans = domain.is_valid(domain.obtain_denotations(domain.normalize(lf_list)))
        grammar = torch.tensor(ans, dtype=torch.float, requires_grad=False)
        # equally weight LM fluency and executability
        val_reward = 0.5 * logprob + 0.5 * grammar
        return val_reward

    def qg_validity_reward(self, utterances):
        # calculate language model length normalized log probability
        input_idxs = [[self.vocab.word2id[BOS]] + [self.vocab.word2id[word] if word in self.vocab.word2id else self.vocab.word2id[UNK] for word in sent] + [self.vocab.word2id[EOS]] for sent in utterances]
        lens = [len(each) for each in input_idxs]
        max_len = max(lens)
        input_idxs = [sent + [self.vocab.word2id[PAD]] * (max_len - len(sent)) for sent in input_idxs]
        input_tensor = torch.tensor(input_idxs, dtype=torch.long, device=self.sp_device)
        lens = torch.tensor(lens, dtype=torch.long, device=self.sp_device)
        self.qlm.eval()
        with torch.no_grad():
            logprob = self.qlm.sent_logprobability(input_tensor, lens).cpu()
        return logprob

    def reconstruction_reward(self, logscores, references, lens):
        """
        logscores: bsize x max_out_len x vocab_size[ + MAX_OOV_NUM]
        references: bsize x max_out_len
        lens: len for each sample
        Returns the masked sum of reference token log-scores per sample (bsize,).
        """
        mask = lens2mask(lens)
        pick_score = torch.gather(logscores, dim=-1, index=references.unsqueeze(dim=-1)).squeeze(dim=-1)
        masked_score = mask.float() * pick_score
        reward = masked_score.sum(dim=1)
        return reward

    def __call__(self, *args, **kargs):
        return self.forward(*args, **kargs)
# Fetch the third-party evaluator, runtime libraries and GloVe embeddings.
# Safe to re-run: downloads resume (wget -c) and directory creation is idempotent.

evaluator=evaluator.tar.gz
lib=lib.tar.gz

if [ ! -e "$evaluator" ]; then
    echo "Start downloading evaluator for overnight and geo datasets ..."
    wget -c https://worksheets.codalab.org/rest/bundles/0xbfbf0d1d8ab94874a68646a7d66c478e/contents/blob/ -O "$evaluator"
fi

if [ ! -e "$lib" ] ; then
    echo "Start downloading libraries for evaluation..."
    wget -c https://worksheets.codalab.org/rest/bundles/0xc6821b4f13f445d1b54e9da63019da1d/contents/blob/ -O "$lib"
fi

# -p keeps re-runs from aborting when the directories already exist
# (bare `mkdir` failed on a second invocation)
mkdir -p evaluator
mkdir -p lib
tar -zxf "$evaluator" -C evaluator
tar -zxf "$lib" -C lib
rm -rf "$evaluator"
rm -rf "$lib"
cp evaluator/sempre/module-classes.txt .

wget -c http://nlp.stanford.edu/data/glove.6B.zip
mkdir -p data/.cache
# -o overwrites quietly on re-extraction instead of prompting
unzip -o glove.6B.zip -d data/.cache/
rm glove.6B.zip
#!/bin/bash
# Train/evaluate an RNN language model on one side of the parallel data.
# Usage: run/run_language_model.sh <dataset> <side>
#   <dataset>: atis / geo / overnight subdomain
#   <side>: question or logical_form
task='language_model'
dataset=$1
side=$2 # question, logical_form
# read_model_path=''

# model hyper-parameters
num_layers=1
hidden_dim=200
emb_size=100
cell=lstm # lstm, gru
decoder_tied='' # '--decoder_tied', ''

# training hyper-parameters
batchSize=16
test_batchSize=128
lr=0.001
dropout=0.5
max_norm=5
l2=1e-5
max_epoch=100
labeled=1.0
deviceId=0

python scripts/language_model.py --task $task --dataset $dataset --side $side \
    --num_layers $num_layers --hidden_dim $hidden_dim --emb_size $emb_size --cell $cell \
    --batchSize $batchSize --test_batchSize $test_batchSize --lr $lr --dropout $dropout --max_norm $max_norm --l2 $l2 \
    --labeled $labeled --max_epoch $max_epoch --deviceId $deviceId $decoder_tied
#!/bin/bash
# Pseudo-labeling baseline: bootstrap from pretrained semantic parsing (sp)
# and question generation (qg) models.
# Usage: run/run_pseudo_method.sh <dataset> <attn|attnptr> <labeled_ratio>

task='pseudo_method'
dataset=$1
# model paths are keyed by whether the pretrained models used the copy mechanism
if [ "$2" = 'attnptr' ] ; then
    copy='copy__'
else
    copy=''
fi
read_sp_model_path=exp/task_semantic_parsing/dataset_${1}/labeled_${3}/${copy}cell_lstm__emb_100__hidden_200_x_1__dropout_0.5__reduce_sum__lr_0.001__mn_5.0__l2_1e-05__bsize_16__me_100__beam_5__nbest_1/
read_qg_model_path=exp/task_question_generation/dataset_${1}/labeled_${3}/${copy}cell_lstm__emb_100__hidden_200_x_1__dropout_0.5__reduce_sum__lr_0.001__mn_5.0__l2_1e-05__bsize_16__me_100__beam_5__nbest_1/

# training paras
reduction=sum # sum, mean
lr=0.001
l2=1e-5
batchSize=16
test_batchSize=128
max_norm=5
max_epoch=100
beam=5
n_best=1

# special paras
discount=0.5
method=constant # constant, linear
labeled=$3
unlabeled=1.0
deviceId="0 1"
seed=999
extra='--extra'

python3 scripts/pseudo_method.py --task $task --dataset $dataset \
    --read_sp_model_path $read_sp_model_path --read_qg_model_path $read_qg_model_path \
    --reduction $reduction --lr $lr --l2 $l2 --batchSize $batchSize --test_batchSize $test_batchSize \
    --discount $discount --method $method --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \
    --labeled $labeled --unlabeled $unlabeled --seed $seed --deviceId $deviceId $extra
reduction=sum # sum, mean 20 | lr=0.001 21 | l2=1e-5 22 | dropout=0.5 23 | batchSize=16 24 | test_batchSize=128 25 | init_weight=0.2 26 | max_norm=5 27 | max_epoch=100 28 | beam=5 29 | n_best=1 30 | 31 | # special paras 32 | labeled=$3 33 | deviceId=0 34 | seed=999 35 | 36 | python3 scripts/question_generation.py --task $task $copy --emb_size $emb_size --hidden_dim $hidden_dim --num_layers $num_layers \ 37 | --dataset $dataset --cell $cell --reduction $reduction --lr $lr --l2 $l2 --dropout $dropout --batchSize $batchSize --test_batchSize $test_batchSize \ 38 | --init_weight $init_weight --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \ 39 | --labeled $labeled --deviceId $deviceId --seed $seed 40 | -------------------------------------------------------------------------------- /run/run_semantic_parsing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | task='semantic_parsing' 4 | dataset=$1 5 | # read_model_path='' 6 | 7 | # model paras 8 | if [ "$2" = "attnptr" ] ; then 9 | copy='--copy' 10 | else 11 | copy='' 12 | fi 13 | emb_size=100 14 | hidden_dim=200 15 | num_layers=1 16 | cell=lstm # lstm, gru 17 | 18 | # training paras 19 | reduction=sum # sum, mean 20 | lr=0.001 21 | l2=1e-5 22 | dropout=0.5 23 | batchSize=16 24 | test_batchSize=128 25 | init_weight=0.2 26 | max_norm=5 27 | max_epoch=100 28 | beam=5 29 | n_best=1 30 | 31 | # special paras 32 | labeled=$3 33 | deviceId=0 34 | seed=999 35 | 36 | python3 scripts/semantic_parsing.py --task $task $copy --emb_size $emb_size --hidden_dim $hidden_dim --num_layers $num_layers \ 37 | --dataset $dataset --cell $cell --reduction $reduction --lr $lr --l2 $l2 --dropout $dropout --batchSize $batchSize --test_batchSize $test_batchSize \ 38 | --init_weight $init_weight --max_norm $max_norm --max_epoch $max_epoch --beam $beam --n_best $n_best \ 39 | --labeled $labeled --deviceId $deviceId --seed $seed 40 | 
def main(args=sys.argv[1:]):
    """Parse and validate command line arguments for the dual learning script.

    @args:
        args: list of command line tokens (defaults to sys.argv[1:])
    @return:
        argparse.Namespace holding all hyper-parameters
    """
    parser = argparse.ArgumentParser()
    # fixed help text: this script runs dual learning, not the pseudo method
    parser.add_argument('--task', required=True, help='dual learning for semantic parsing')
    parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)')
    parser.add_argument('--dataset', required=True, help='which dataset to experiment on')
    parser.add_argument('--read_model_path', help='Testing mode, load sp and qg model path')
    # model params
    parser.add_argument('--read_sp_model_path', required=True, help='pretrained sp model')
    parser.add_argument('--read_qg_model_path', required=True, help='pretrained qg model path')
    parser.add_argument('--read_qlm_path', required=True, help='language model for natural language questions')
    parser.add_argument('--read_lflm_path', required=True, help='language model for logical form')
    # training paras
    parser.add_argument('--reduction', choices=['sum', 'mean'], default='sum')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)')
    parser.add_argument('--batchSize', type=int, default=16, help='input batch size')
    parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding')
    parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)")
    parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for')
    # special paras
    parser.add_argument('--sample', type=int, default=5, help='size of sampling during training in dual learning')
    parser.add_argument('--beam', default=5, type=int, help='used during decoding time')
    parser.add_argument('--n_best', default=1, type=int, help='used during decoding time')
    parser.add_argument('--alpha', type=float, default=0.5, help='coefficient which combines sp valid and reconstruction reward')
    parser.add_argument('--beta', type=float, default=0.5, help='coefficient which combines qg valid and reconstruction reward')
    parser.add_argument('--cycle', choices=['sp', 'qg', 'sp+qg'], default='sp+qg', help='whether use cycle starts from sp/qg')
    parser.add_argument('--labeled', type=float, default=1.0, help='ratio of labeled samples')
    parser.add_argument('--unlabeled', type=float, default=1.0, help='ratio of unlabeled samples')
    parser.add_argument('--deviceId', type=int, nargs=2, default=[-1, -1], help='device for semantic parsing and question generation model respectively')
    parser.add_argument('--seed', type=int, default=999, help='set initial random seed')
    parser.add_argument('--extra', action='store_true', help='whether use synthesized logical forms')
    opt = parser.parse_args(args)

    # Some Arguments Check: report bad values through the parser instead of
    # bare `assert`, which is silently stripped under `python -O`
    if not opt.labeled > 0.:
        parser.error('--labeled must be a positive ratio')
    if not (0. <= opt.unlabeled <= 1.0):
        parser.error('--unlabeled must lie in [0, 1]')
    return opt
# Data reading: bind the target domain, then build labeled/unlabeled splits.
Example.set_domain(opt.dataset)
if not opt.testing:
    train_dataset, dev_dataset = Example.load_dataset(choice='train')
    # keep `labeled` ratio as supervised data; the remainder acts as unlabeled
    labeled_train_dataset, unlabeled_train_dataset = split_dataset(train_dataset, opt.labeled)
    unlabeled_train_dataset, _ = split_dataset(unlabeled_train_dataset, opt.unlabeled)
    # labeled samples also participate in the unlabeled (cycle) objective
    unlabeled_train_dataset += labeled_train_dataset
    if opt.extra:
        q_unlabeled_train_dataset = unlabeled_train_dataset
        # synthesized logical forms enlarge only the logical-form side
        lf_unlabeled_train_dataset = unlabeled_train_dataset + Example.load_dataset(choice='extra')
    else:
        q_unlabeled_train_dataset, lf_unlabeled_train_dataset = unlabeled_train_dataset, unlabeled_train_dataset
    logger.info("Labeled/Unlabeled train dataset size is: %s and %s" % (len(labeled_train_dataset), len(lf_unlabeled_train_dataset)))
    logger.info("Dev dataset size is: %s" % (len(dev_dataset)))
test_dataset = Example.load_dataset(choice='test')
logger.info("Test dataset size is: %s" % (len(test_dataset)))

###################################### Model Construction ########################################

# Training mode persists the construction params; testing mode reloads them.
if not opt.testing:
    params = {
        "read_sp_model_path": opt.read_sp_model_path, "read_qg_model_path": opt.read_qg_model_path,
        "read_qlm_path": opt.read_qlm_path, "read_lflm_path": opt.read_lflm_path,
        "sample": opt.sample, "alpha": opt.alpha, "beta": opt.beta, "reduction": opt.reduction
    }
    json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4)
else:
    params = json.load(open(os.path.join(exp_path, "params.json"), 'r'))
# rebuild sp/qg models from their saved hyper-parameters
sp_params = json.load(open(os.path.join(params['read_sp_model_path'], 'params.json'), 'r'))
sp_model = model(**sp_params)
qg_params = json.load(open(os.path.join(params['read_qg_model_path'], 'params.json'), 'r'))
qg_model = model(**qg_params)
if not opt.testing:
    # warm-start from the pretrained supervised checkpoints
    sp_model.load_model(os.path.join(params['read_sp_model_path'], 'model.pkl'))
    logger.info("Load Semantic Parsing model from path %s" % (params['read_sp_model_path']))
    qg_model.load_model(os.path.join(params['read_qg_model_path'], 'model.pkl'))
    logger.info("Load Question Generation model from path %s" % (params['read_qg_model_path']))
    qlm_params = json.load(open(os.path.join(params['read_qlm_path'], 'params.json'), 'r'))
    qlm_model = LanguageModel(**qlm_params)
    qlm_model.load_model(os.path.join(params['read_qlm_path'], 'model.pkl'))
    logger.info("Load Question Language Model from path %s" % (params['read_qlm_path']))
    lflm_params = json.load(open(os.path.join(params['read_lflm_path'], 'params.json'), 'r'))
    lflm_model = LanguageModel(**lflm_params)
    lflm_model.load_model(os.path.join(params['read_lflm_path'], 'model.pkl'))
    logger.info("Load Logical Form Language Model from path %s" % (params['read_lflm_path']))
    reward_model = RewardModel(opt.dataset, qlm_model, lflm_model, lm_vocab, sp_device=sp_device, qg_device=qg_device)
else:
    # testing restores the dual-learning fine-tuned checkpoints instead
    sp_model.load_model(os.path.join(exp_path, 'sp_model.pkl'))
    logger.info("Load Semantic Parsing model from path %s" % (exp_path))
    qg_model.load_model(os.path.join(exp_path, 'qg_model.pkl'))
    logger.info("Load Question Generation model from path %s" % (exp_path))
    reward_model = None  # rewards are only needed during training
train_model = DualLearning(sp_model, qg_model, reward_model, sp_vocab, qg_vocab,
    alpha=params['alpha'], beta=params['beta'], sample=params['sample'],
    reduction=params["reduction"], sp_device=sp_device, qg_device=qg_device)

# separate loss functions per direction, ignoring each side's PAD index
loss_function = {'sp': {}, 'qg': {}}
loss_function['sp'] = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction)
loss_function['qg'] = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction)
optimizer = set_optimizer(sp_model, qg_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm)

###################################### Training and Decoding #######################################

vocab = {'sp': sp_vocab, 'qg': qg_vocab}
device = {'sp': sp_device, 'qg': qg_device}
solver = DualLearningSolver(train_model, vocab, loss_function, optimizer, exp_path, logger, device=device)
if not opt.testing:
    logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time()))))
    solver.train_and_decode(labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset,
        batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, cycle=opt.cycle,
        max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best)
else:
    logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time()))))
    start_time = time.time()
    acc, bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize, beam=opt.beam, n_best=opt.n_best)
    logger.info('Evaluation cost: %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu: %.4f)'
        % (time.time() - start_time, acc, bleu))
############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='language_model', help='language model') 22 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 23 | parser.add_argument('--dataset', required=True, help='which dataset to experiemnt on') 24 | parser.add_argument('--side', choices=['question', 'logical_form'], help='which side to build language model') 25 | # pretrained models 26 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 27 | # model paras 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping 
(2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | # special paras 43 | parser.add_argument('--decoder_tied', action='store_true', help='whether use the same embedding weights and output matrix') 44 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 45 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1:cpu') 46 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 47 | opt = parser.parse_args(args) 48 | if opt.testing: 49 | assert opt.read_model_path 50 | return opt 51 | 52 | opt = main() 53 | 54 | ####################### Output path, logger, device and random seed configuration ################# 55 | 56 | exp_path = opt.read_model_path if opt.testing else hyperparam_lm(opt) 57 | if not os.path.exists(exp_path): 58 | os.makedirs(exp_path) 59 | 60 | logger = set_logger(exp_path, testing=opt.testing) 61 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 62 | logger.info("Experiment path: %s" % (exp_path)) 63 | opt.device = set_torch_device(opt.deviceId) 64 | set_random_seed(opt.seed, device=opt.device.type) 65 | 66 | ################################ Vocab and Data Reader ########################### 67 | 68 | lm_vocab = Vocab(opt.dataset, task='language_model') 69 | if opt.side == 'question': 70 | word2id = lm_vocab.word2id 71 | logger.info("Vocab size for natural language sentence is: %s" % (len(word2id))) 72 | else: 73 | word2id = lm_vocab.lf2id 74 | logger.info("Vocab size for logical form is: %s" % (len(word2id))) 75 | 76 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 77 | Example.set_domain(opt.dataset) 78 | if not opt.testing: 79 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 80 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 81 | logger.info("Train 
and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 82 | test_dataset = Example.load_dataset(choice='test') 83 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 84 | 85 | ###################################### Model Construction ######################################## 86 | 87 | if not opt.testing: 88 | params = { 89 | 'emb_size': opt.emb_size, 'vocab_size': len(word2id), 'pad_token_idxs': [word2id[PAD]], 90 | 'hidden_dim': opt.hidden_dim, 'decoder_tied': opt.decoder_tied, 'num_layers': opt.num_layers, 'cell': opt.cell, 91 | 'dropout': opt.dropout, 'init': opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio = load_embeddings(train_model.encoder, word2id, opt.device) 103 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio * 100)) 104 | if opt.testing: 105 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 106 | train_model.load_model(model_path) 107 | logger.info("Load model from path %s" % (model_path)) 108 | 109 | # set loss function and optimizer 110 | loss_function = set_loss_function(ignore_index=word2id[PAD], reduction=opt.reduction) 111 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 112 | 113 | ###################################### Training and Decoding ####################################### 114 | 115 | solver = LMSolver(train_model, lm_vocab, loss_function, optimizer, exp_path, logger, device=opt.device, side=opt.side) 116 | if not opt.testing: 117 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 118 | solver.train_and_decode(train_dataset, 
dev_dataset, test_dataset, 119 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, max_epoch=opt.max_epoch) 120 | else: 121 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 122 | start_time = time.time() 123 | ppl = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize) 124 | logger.info('Evaluation cost: %.4fs\tppl : %.4f' % (time.time() - start_time, ppl)) 125 | -------------------------------------------------------------------------------- /scripts/pseudo_method.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.vocab import Vocab 5 | from utils.example import Example, split_dataset 6 | from utils.optimizer import set_optimizer 7 | from utils.loss import set_loss_function 8 | from utils.seed import set_random_seed 9 | from utils.logger import set_logger 10 | from utils.gpu import set_torch_device 11 | from utils.constants import * 12 | from utils.solver.solver_pseduo_method import PseudoSolver 13 | from utils.hyperparam import hyperparam_pseudo_method 14 | from models.construct_models import construct_model as model 15 | 16 | ############################### Arguments parsing and Preparations ############################## 17 | 18 | def main(args=sys.argv[1:]): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--task', required=True, help='pseudo method for semantic parsing') 21 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 22 | parser.add_argument('--dataset', required=True, help='which dataset to experiment on') 23 | parser.add_argument('--read_model_path', help='Testing mode, load sp and qg model path') 24 | # model params 25 | parser.add_argument('--read_sp_model_path', required=True, help='pretrained sp model') 
26 | parser.add_argument('--read_qg_model_path', required=True, help='pretrained qg model path') 27 | # pseudo training paras 28 | parser.add_argument('--reduction', choices=['sum', 'mean'], default='sum') 29 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 30 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 31 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 32 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 33 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 34 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 35 | # special paras 36 | parser.add_argument('--beam', default=5, type=int, help='used during decoding time') 37 | parser.add_argument('--n_best', default=1, type=int, help='used during decoding time') 38 | parser.add_argument('--labeled', type=float, default=1.0, help='ratio of labeled samples') 39 | parser.add_argument('--unlabeled', type=float, default=1.0, help='ratio of unlabeled samples') 40 | parser.add_argument('--method', choices=['constant', 'linear'], help='how to change confidence during training') 41 | parser.add_argument('--discount', type=float, default=1.0, help="final confidence for pseudo examples") 42 | parser.add_argument('--deviceId', type=int, nargs=2, default=[-1, -1], help='gpu indexes for slu and nlg models respectively, -1:cpu') 43 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 44 | parser.add_argument('--extra', action='store_true', help='whether use synthesized logical forms') 45 | opt = parser.parse_args(args) 46 | 47 | # Some Arguments Check 48 | assert opt.labeled > 0. and opt.labeled < 1.0 49 | assert opt.unlabeled > 0. 
and opt.unlabeled <= 1.0 50 | return opt 51 | 52 | opt = main() 53 | 54 | ####################### Output path, logger, device and random seed configuration ################# 55 | 56 | exp_path = opt.read_model_path if opt.testing else hyperparam_pseudo_method(opt) 57 | if not os.path.exists(exp_path): 58 | os.makedirs(exp_path) 59 | 60 | logger = set_logger(exp_path, testing=opt.testing) 61 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 62 | logger.info("Experiment path: %s" % (exp_path)) 63 | sp_device, qg_device = set_torch_device(opt.deviceId[0]), set_torch_device(opt.deviceId[1]) 64 | set_random_seed(opt.seed, device='cuda') 65 | 66 | ################################ Vocab and Data Reader ########################### 67 | 68 | sp_copy, qg_copy = 'copy__' in opt.read_sp_model_path, 'copy__' in opt.read_qg_model_path 69 | sp_vocab, qg_vocab = Vocab(opt.dataset, task='semantic_parsing', copy=sp_copy), Vocab(opt.dataset, task='question_generation', copy=qg_copy) 70 | logger.info("Semantic Parsing model vocabulary ...") 71 | logger.info("Vocab size for input natural language sentence is: %s" % (len(sp_vocab.word2id))) 72 | logger.info("Vocab size for output logical form is: %s" % (len(sp_vocab.lf2id))) 73 | 74 | logger.info("Question Generation model vocabulary ...") 75 | logger.info("Vocab size for input logical form is: %s" % (len(qg_vocab.lf2id))) 76 | logger.info("Vocab size for output natural language sentence is: %s" % (len(qg_vocab.word2id))) 77 | 78 | logger.info("Read dataset starts at %s" % (time.asctime(time.localtime(time.time())))) 79 | Example.set_domain(opt.dataset) 80 | if not opt.testing: 81 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 82 | labeled_train_dataset, unlabeled_train_dataset = split_dataset(train_dataset, opt.labeled) 83 | unlabeled_train_dataset, _ = split_dataset(unlabeled_train_dataset, opt.unlabeled) 84 | if opt.extra: 85 | q_unlabeled_train_dataset = unlabeled_train_dataset 86 | 
lf_unlabeled_train_dataset = unlabeled_train_dataset + Example.load_dataset(choice='extra') 87 | else: 88 | q_unlabeled_train_dataset, lf_unlabeled_train_dataset = unlabeled_train_dataset, unlabeled_train_dataset 89 | logger.info("Labeled/Unlabeled train dataset size is: %s and %s" % (len(labeled_train_dataset), len(lf_unlabeled_train_dataset))) 90 | logger.info("Dev dataset size is: %s" % (len(dev_dataset))) 91 | test_dataset = Example.load_dataset(choice='test') 92 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 93 | 94 | ###################################### Model Construction ######################################## 95 | 96 | if not opt.testing: 97 | params = { "read_sp_model_path": opt.read_sp_model_path, "read_qg_model_path": opt.read_qg_model_path } 98 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 99 | else: 100 | params = json.load(open(os.path.join(exp_path, "params.json"), 'r')) 101 | sp_params = json.load(open(os.path.join(params['read_sp_model_path'], 'params.json'), 'r')) 102 | sp_model = model(**sp_params) 103 | qg_params = json.load(open(os.path.join(params['read_qg_model_path'], 'params.json'), 'r')) 104 | qg_model = model(**qg_params) 105 | if not opt.testing: 106 | sp_model.load_model(os.path.join(params['read_sp_model_path'], 'model.pkl')) 107 | logger.info("Load Semantic Parsing model from path %s" % (params['read_sp_model_path'])) 108 | qg_model.load_model(os.path.join(params['read_qg_model_path'], 'model.pkl')) 109 | logger.info("Load Question Generation model from path %s" % (params['read_qg_model_path'])) 110 | else: 111 | sp_model.load_model(os.path.join(exp_path, 'sp_model.pkl')) 112 | logger.info("Load Semantic Parsing model from path %s" % (exp_path)) 113 | qg_model.load_model(os.path.join(exp_path, 'qg_model.pkl')) 114 | logger.info("Load Question Generation model from path %s" % (exp_path)) 115 | sp_model, qg_model = sp_model.to(sp_device), qg_model.to(qg_device) 116 | 117 | 
loss_function = {'sp': {}, 'qg': {}} 118 | loss_function['sp'] = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction) 119 | loss_function['qg'] = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction) 120 | optimizer = set_optimizer(sp_model, qg_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 121 | 122 | ###################################### Training and Decoding ####################################### 123 | 124 | train_model = {'sp': sp_model, 'qg': qg_model} 125 | vocab = {'sp': sp_vocab, 'qg': qg_vocab} 126 | device = {'sp': sp_device, 'qg': qg_device} 127 | solver = PseudoSolver(train_model, vocab, loss_function, optimizer, exp_path, logger, device=device, 128 | discount=opt.discount, method=opt.method) 129 | if not opt.testing: 130 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 131 | solver.train_and_decode(labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset, 132 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 133 | max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best) 134 | else: 135 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 136 | start_time = time.time() 137 | acc, bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), opt.test_batchSize, beam=opt.beam, n_best=opt.n_best) 138 | logger.info('Evaluation cost: %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu: %.4f)' 139 | % (time.time() - start_time, acc, bleu)) 140 | -------------------------------------------------------------------------------- /scripts/question_generation.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.hyperparam import hyperparam_seq2seq 5 | from utils.logger import 
set_logger 6 | from utils.vocab import Vocab 7 | from utils.seed import set_random_seed 8 | from utils.example import split_dataset, Example 9 | from utils.constants import PAD, UNK 10 | from utils.loss import set_loss_function 11 | from utils.optimizer import set_optimizer 12 | from utils.gpu import set_torch_device 13 | from models.construct_models import construct_model as model 14 | from utils.word2vec import load_embeddings 15 | from utils.solver.solver_question_generation import QGSolver 16 | 17 | ############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='question_generation', help='question generation') 22 | parser.add_argument('--dataset', type=str, required=True, help='which dataset to experiment on') 23 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 24 | # pretrained models 25 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 26 | # model paras 27 | parser.add_argument('--copy', action='store_true', help='attn model or attnptr model') 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, 
default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | parser.add_argument('--beam', default=5, type=int, help='beam search size') 43 | parser.add_argument('--n_best', default=1, type=int, help='return n best results') 44 | # special paras 45 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 46 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1: cpu, o.w. 
gpu index') 47 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 48 | opt = parser.parse_args(args) 49 | if opt.testing: 50 | assert opt.read_model_path 51 | return opt 52 | 53 | opt = main() 54 | 55 | ####################### Output path, logger, device and random seed configuration ################# 56 | 57 | exp_path = opt.read_model_path if opt.testing else hyperparam_seq2seq(opt) 58 | if not os.path.exists(exp_path): 59 | os.makedirs(exp_path) 60 | 61 | logger = set_logger(exp_path, testing=opt.testing) 62 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 63 | logger.info("Experiment path: %s" % (exp_path)) 64 | opt.device = set_torch_device(opt.deviceId) 65 | set_random_seed(opt.seed, device=opt.device.type) 66 | 67 | ################################ Vocab and Data Reader ########################### 68 | 69 | qg_vocab = Vocab(opt.dataset, task='question_generation', copy=opt.copy) 70 | logger.info("Vocab size for input logical form is: %s" % (len(qg_vocab.lf2id))) 71 | logger.info("Vocab size for output natural language sentence is: %s" % (len(qg_vocab.word2id))) 72 | 73 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 74 | Example.set_domain(opt.dataset) 75 | if not opt.testing: 76 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 77 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 78 | logger.info("Train and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 79 | test_dataset = Example.load_dataset(choice='test') 80 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 81 | 82 | ###################################### Model Construction ######################################## 83 | 84 | if not opt.testing: 85 | params = { 86 | "copy": opt.copy, # model attn or model attnptr 87 | "src_vocab": len(qg_vocab.lf2id), "tgt_vocab": len(qg_vocab.word2id), 88 | "src_unk_idx": 
qg_vocab.lf2id[UNK], "tgt_unk_idx": qg_vocab.word2id[UNK], 89 | "pad_src_idxs": [qg_vocab.lf2id[PAD]], "pad_tgt_idxs": [qg_vocab.word2id[PAD]], 90 | "src_emb_size": opt.emb_size, "tgt_emb_size": opt.emb_size, "hidden_dim": opt.hidden_dim, 91 | "num_layers": opt.num_layers, "cell": opt.cell, "dropout": opt.dropout, "init": opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio1 = load_embeddings(train_model.src_embed.embed, qg_vocab.lf2id, opt.device) 103 | ratio2 = load_embeddings(train_model.tgt_embed.embed, qg_vocab.word2id, opt.device) 104 | logger.info("%.2f%% token embeddings from pretrained vectors" % (ratio1 * 100)) 105 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio2 * 100)) 106 | else: 107 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 108 | train_model.load_model(model_path) 109 | logger.info("Load model from path %s" % (model_path)) 110 | 111 | # set loss function and optimizer 112 | loss_function = set_loss_function(ignore_index=qg_vocab.word2id[PAD], reduction=opt.reduction) 113 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 114 | 115 | ###################################### Training and Decoding ####################################### 116 | 117 | solver = QGSolver(train_model, qg_vocab, loss_function, optimizer, exp_path, logger, device=opt.device) 118 | if not opt.testing: 119 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 120 | solver.train_and_decode(train_dataset, dev_dataset, test_dataset, 121 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 122 | 
max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best) 123 | else: 124 | logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time())))) 125 | start_time = time.time() 126 | bleu = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'), 127 | opt.test_batchSize, beam=opt.beam, n_best=opt.n_best) 128 | logger.info('Evaluation cost: %.4fs\tAcc : %.4f' % (time.time() - start_time, bleu)) -------------------------------------------------------------------------------- /scripts/semantic_parsing.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import argparse, os, sys, time, json 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from utils.hyperparam import hyperparam_seq2seq 5 | from utils.logger import set_logger 6 | from utils.vocab import Vocab 7 | from utils.seed import set_random_seed 8 | from utils.example import split_dataset, Example 9 | from utils.constants import PAD, UNK 10 | from utils.loss import set_loss_function 11 | from utils.optimizer import set_optimizer 12 | from utils.gpu import set_torch_device 13 | from models.construct_models import construct_model as model 14 | from utils.word2vec import load_embeddings 15 | from utils.solver.solver_semantic_parsing import SPSolver 16 | 17 | ############################### Arguments parsing and Preparations ############################## 18 | 19 | def main(args=sys.argv[1:]): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--task', type=str, default='semantic_parsing', help='semantic parsing') 22 | parser.add_argument('--dataset', type=str, required=True, help='which dataset to experiment on') 23 | parser.add_argument('--testing', action='store_true', help='Only test your model (default is training && testing)') 24 | # pretrained models 25 | parser.add_argument('--read_model_path', required=False, help='Read model and hyperparams from this path') 26 | # model paras 27 | 
parser.add_argument('--copy', action='store_true', help='attn model or attnptr model') 28 | parser.add_argument('--emb_size', type=int, default=100, help='embedding size') 29 | parser.add_argument('--hidden_dim', type=int, default=200, help='hidden layer dimension') 30 | parser.add_argument('--num_layers', type=int, default=1, help='number of hidden layers') 31 | parser.add_argument('--cell', default='lstm', choices=['lstm', 'gru'], help='rnn cell choice') 32 | # training paras 33 | parser.add_argument('--reduction', default='sum', choices=['mean', 'sum'], help='loss function argument') 34 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 35 | parser.add_argument('--l2', type=float, default=1e-5, help='weight decay (L2 penalty)') 36 | parser.add_argument('--dropout', type=float, default=0.5, help='dropout rate at each non-recurrent layer') 37 | parser.add_argument('--batchSize', type=int, default=16, help='input batch size') 38 | parser.add_argument('--test_batchSize', type=int, default=128, help='input batch size in decoding') 39 | parser.add_argument('--init_weight', type=float, default=0.2, help='all weights will be set to [-init_weight, init_weight] during initialization') 40 | parser.add_argument('--max_norm', type=float, default=5, help="threshold of gradient clipping (2-norm)") 41 | parser.add_argument('--max_epoch', type=int, default=100, help='max number of epochs to train for') 42 | parser.add_argument('--beam', default=5, type=int, help='beam search size') 43 | parser.add_argument('--n_best', default=1, type=int, help='return n best results') 44 | # special paras 45 | parser.add_argument('--labeled', type=float, default=1.0, help='training use only this propotion of dataset') 46 | parser.add_argument('--deviceId', type=int, default=-1, help='train model on ith gpu. -1: cpu, o.w. 
gpu index') 47 | parser.add_argument('--seed', type=int, default=999, help='set initial random seed') 48 | opt = parser.parse_args(args) 49 | if opt.testing: 50 | assert opt.read_model_path 51 | return opt 52 | 53 | opt = main() 54 | 55 | ####################### Output path, logger, device and random seed configuration ################# 56 | 57 | exp_path = opt.read_model_path if opt.testing else hyperparam_seq2seq(opt) 58 | if not os.path.exists(exp_path): 59 | os.makedirs(exp_path) 60 | 61 | logger = set_logger(exp_path, testing=opt.testing) 62 | logger.info("Parameters: " + str(json.dumps(vars(opt), indent=4))) 63 | logger.info("Experiment path: %s" % (exp_path)) 64 | opt.device = set_torch_device(opt.deviceId) 65 | set_random_seed(opt.seed, device=opt.device.type) 66 | 67 | ################################ Vocab and Data Reader ########################### 68 | 69 | sp_vocab = Vocab(opt.dataset, task='semantic_parsing', copy=opt.copy) 70 | logger.info("Vocab size for input natural language sentence is: %s" % (len(sp_vocab.word2id))) 71 | logger.info("Vocab size for output logical form is: %s" % (len(sp_vocab.lf2id))) 72 | 73 | logger.info("Read dataset %s starts at %s" % (opt.dataset, time.asctime(time.localtime(time.time())))) 74 | Example.set_domain(opt.dataset) 75 | if not opt.testing: 76 | train_dataset, dev_dataset = Example.load_dataset(choice='train') 77 | train_dataset, _ = split_dataset(train_dataset, opt.labeled) 78 | logger.info("Train and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) 79 | test_dataset = Example.load_dataset(choice='test') 80 | logger.info("Test dataset size is: %s" % (len(test_dataset))) 81 | 82 | ###################################### Model Construction ######################################## 83 | 84 | if not opt.testing: 85 | params = { 86 | "copy": opt.copy, # model attn or model attnptr 87 | "src_vocab": len(sp_vocab.word2id), "tgt_vocab": len(sp_vocab.lf2id), 88 | "src_unk_idx": 
sp_vocab.word2id[UNK], "tgt_unk_idx": sp_vocab.lf2id[UNK], 89 | "pad_src_idxs": [sp_vocab.word2id[PAD]], "pad_tgt_idxs": [sp_vocab.lf2id[PAD]], 90 | "src_emb_size": opt.emb_size, "tgt_emb_size": opt.emb_size, "hidden_dim": opt.hidden_dim, 91 | "num_layers": opt.num_layers, "cell": opt.cell, "dropout": opt.dropout, "init": opt.init_weight 92 | } 93 | json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) 94 | else: 95 | params = json.load(open(os.path.join(exp_path, 'params.json'), 'r')) 96 | train_model = model(**params) 97 | train_model = train_model.to(opt.device) 98 | 99 | ##################################### Model Initialization ######################################### 100 | 101 | if not opt.testing: 102 | ratio1 = load_embeddings(train_model.src_embed.embed, sp_vocab.word2id, opt.device) 103 | ratio2 = load_embeddings(train_model.tgt_embed.embed, sp_vocab.lf2id, opt.device) 104 | logger.info("%.2f%% word embeddings from pretrained vectors" % (ratio1 * 100)) 105 | logger.info("%.2f%% token embeddings from pretrained vectors" % (ratio2 * 100)) 106 | else: 107 | model_path = os.path.join(opt.read_model_path, 'model.pkl') 108 | train_model.load_model(model_path) 109 | logger.info("Load model from path %s" % (model_path)) 110 | 111 | # set loss function and optimizer 112 | loss_function = set_loss_function(ignore_index=sp_vocab.lf2id[PAD], reduction=opt.reduction) 113 | optimizer = set_optimizer(train_model, lr=opt.lr, l2=opt.l2, max_norm=opt.max_norm) 114 | 115 | ###################################### Training and Decoding ####################################### 116 | 117 | solver = SPSolver(train_model, sp_vocab, loss_function, optimizer, exp_path, logger, device=opt.device) 118 | if not opt.testing: 119 | logger.info("Training starts at %s" % (time.asctime(time.localtime(time.time())))) 120 | solver.train_and_decode(train_dataset, dev_dataset, test_dataset, 121 | batchSize=opt.batchSize, test_batchSize=opt.test_batchSize, 122 | 
        max_epoch=opt.max_epoch, beam=opt.beam, n_best=opt.n_best)
else:
    logger.info("Testing starts at %s" % (time.asctime(time.localtime(time.time()))))
    start_time = time.time()
    # decode returns exact-match accuracy on the test set
    accuracy = solver.decode(test_dataset, os.path.join(exp_path, 'test.eval'),
        opt.test_batchSize, beam=opt.beam, n_best=opt.n_best)
    logger.info('Evaluation cost: %.4fs\tAcc : %.4f' % (time.time() - start_time, accuracy))
--------------------------------------------------------------------------------
/utils/batch.py:
--------------------------------------------------------------------------------
#coding=utf8
import sys, os, random
import torch
from utils.constants import *

def get_minibatch(data_list, vocab, task='semantic_parsing', data_index=None, index=0, batch_size=16, device=None, **kargs):
    # Dispatch to the task-specific batching function registered in BATCH_FUNC.
    # `data_index` is a permutation of example indices (presumably shuffled by the
    # caller -- confirm against the solver) and `index` is the batch start offset.
    index = index % len(data_list)
    batch_data_list = [data_list[idx] for idx in data_index[index: index + batch_size]]
    return BATCH_FUNC[task](batch_data_list, vocab, device, **kargs)

def get_minibatch_sp(ex_list, vocab, device, copy=False, **kargs):
    # Build padded index tensors for a semantic-parsing batch (question -> logical form).
    inputs = [ex.question for ex in ex_list]
    lens = [len(ex) for ex in inputs]
    lens_tensor = torch.tensor(lens, dtype=torch.long, device=device)

    # pad questions to the max length in the batch; OOV words map to UNK
    max_len = max(lens)
    padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs]
    inputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_inputs]
    inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device)

    # wrap logical forms with BOS/EOS before padding so the decoder sees start/stop markers
    outputs = [ex.logical_form for ex in ex_list]
    bos_eos_outputs = [[BOS] + sent + [EOS] for sent in outputs]
    out_lens = [len(each) for each in bos_eos_outputs]
    max_out_len = max(out_lens)
    padded_outputs = [sent + [PAD] * (max_out_len - len(sent)) for sent in bos_eos_outputs]
    outputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_outputs]
27
| outputs_tensor = torch.tensor(outputs_idx, dtype=torch.long, device=device) 28 | out_lens_tensor = torch.tensor(out_lens, dtype=torch.long, device=device) 29 | 30 | if copy: # pointer network need additional information 31 | mapped_inputs = [ex.mapped_question for ex in ex_list] 32 | oov_list, copy_inputs = [], [] 33 | for sent in mapped_inputs: 34 | tmp_oov_list, tmp_copy_inputs = [], [] 35 | for idx, word in enumerate(sent): 36 | if word not in vocab.lf2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 37 | tmp_oov_list.append(word) 38 | tmp_copy_inputs.append( 39 | ( 40 | vocab.lf2id.get(word, vocab.lf2id[UNK]) if word in vocab.lf2id or word not in tmp_oov_list \ 41 | else len(vocab.lf2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 42 | ) 43 | ) 44 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 45 | oov_list.append(tmp_oov_list) 46 | copy_inputs.append(tmp_copy_inputs) 47 | 48 | copy_tokens = [ 49 | torch.cat([ 50 | torch.zeros(len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float)\ 51 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 52 | torch.zeros(max_len - len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float) 53 | ], dim=0) 54 | for each in copy_inputs 55 | ] 56 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 57 | 58 | dec_outputs = [ 59 | [ 60 | len(vocab.lf2id) + oov_list[idx].index(tok) 61 | if tok not in vocab.lf2id and tok in oov_list[idx] \ 62 | else vocab.lf2id.get(tok, vocab.lf2id[UNK]) 63 | for tok in sent 64 | ] + [vocab.lf2id[PAD]] * (max_out_len - len(sent)) 65 | for idx, sent in enumerate(bos_eos_outputs) 66 | ] 67 | dec_outputs_tensor = torch.tensor(dec_outputs, dtype=torch.long, device=device) 68 | else: 69 | dec_outputs_tensor, copy_tokens, oov_list = outputs_tensor, None, [] 70 | 71 | return inputs_tensor, lens_tensor, outputs_tensor, dec_outputs_tensor, out_lens_tensor, copy_tokens, oov_list, (inputs, 
outputs) 72 | 73 | def get_minibatch_qg(ex_list, vocab, device, copy=False, **kargs): 74 | raw_inputs = [ex.logical_form for ex in ex_list] 75 | inputs = [ex.mapped_logical_form for ex in ex_list] if copy else raw_inputs 76 | lens = [len(ex) for ex in inputs] 77 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 78 | 79 | max_len = max(lens) 80 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 81 | inputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_inputs] 82 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 83 | 84 | outputs = [ex.question for ex in ex_list] 85 | bos_eos_outputs = [[BOS] + sent + [EOS] for sent in outputs] 86 | out_lens = [len(each) for each in bos_eos_outputs] 87 | max_out_len = max(out_lens) 88 | padded_outputs = [sent + [PAD] * (max_out_len - len(sent)) for sent in bos_eos_outputs] 89 | outputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_outputs] 90 | outputs_tensor = torch.tensor(outputs_idx, dtype=torch.long, device=device) 91 | out_lens_tensor = torch.tensor(out_lens, dtype=torch.long, device=device) 92 | 93 | if copy: # pointer network need additional information 94 | oov_list, copy_inputs = [], [] 95 | for sent in inputs: 96 | tmp_oov_list, tmp_copy_inputs = [], [] 97 | for idx, word in enumerate(sent): 98 | if word not in vocab.word2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 99 | tmp_oov_list.append(word) 100 | tmp_copy_inputs.append( 101 | ( 102 | vocab.word2id.get(word, vocab.word2id[UNK]) if word in vocab.word2id or word not in tmp_oov_list \ 103 | else len(vocab.word2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 104 | ) 105 | ) 106 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 107 | oov_list.append(tmp_oov_list) 108 | copy_inputs.append(tmp_copy_inputs) 109 | 110 | copy_tokens = [ 111 | torch.cat([ 112 
| torch.zeros(len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float)\ 113 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 114 | torch.zeros(max_len - len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float) 115 | ], dim=0) 116 | for each in copy_inputs 117 | ] 118 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 119 | 120 | dec_outputs = [ 121 | [ 122 | len(vocab.word2id) + oov_list[idx].index(tok) 123 | if tok not in vocab.word2id and tok in oov_list[idx] \ 124 | else vocab.word2id.get(tok, vocab.word2id[UNK]) 125 | for tok in sent 126 | ] + [vocab.word2id[PAD]] * (max_out_len - len(sent)) 127 | for idx, sent in enumerate(bos_eos_outputs) 128 | ] 129 | dec_outputs_tensor = torch.tensor(dec_outputs, dtype=torch.long, device=device) 130 | else: 131 | dec_outputs_tensor, copy_tokens, oov_list = outputs_tensor, None, [] 132 | 133 | return inputs_tensor, lens_tensor, outputs_tensor, dec_outputs_tensor, out_lens_tensor, copy_tokens, oov_list, (raw_inputs, outputs) 134 | 135 | def get_minibatch_unlabeled_sp(ex_list, vocab, device, copy=False, **kargs): 136 | inputs = [ex.question for ex in ex_list] 137 | lens = [len(ex) for ex in inputs] 138 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 139 | 140 | max_len = max(lens) 141 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 142 | inputs_idx = [[vocab.word2id[w] if w in vocab.word2id else vocab.word2id[UNK] for w in sent] for sent in padded_inputs] 143 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 144 | 145 | if copy: # pointer network need additional information 146 | mapped_inputs = [ex.mapped_question for ex in ex_list] 147 | oov_list, copy_inputs = [], [] 148 | for sent in mapped_inputs: 149 | tmp_oov_list, tmp_copy_inputs = [], [] 150 | for idx, word in enumerate(sent): 151 | if word not in vocab.lf2id and word not in tmp_oov_list and 
len(tmp_oov_list) < MAX_OOV_NUM: 152 | tmp_oov_list.append(word) 153 | tmp_copy_inputs.append( 154 | ( 155 | vocab.lf2id.get(word, vocab.lf2id[UNK]) if word in vocab.lf2id or word not in tmp_oov_list \ 156 | else len(vocab.lf2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 157 | ) 158 | ) 159 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 160 | oov_list.append(tmp_oov_list) 161 | copy_inputs.append(tmp_copy_inputs) 162 | 163 | copy_tokens = [ 164 | torch.cat([ 165 | torch.zeros(len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float)\ 166 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 167 | torch.zeros(max_len - len(each), len(vocab.lf2id) + MAX_OOV_NUM, dtype=torch.float) 168 | ], dim=0) 169 | for each in copy_inputs 170 | ] 171 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 172 | else: 173 | copy_tokens, oov_list = None, [] 174 | 175 | return inputs_tensor, lens_tensor, copy_tokens, oov_list, inputs 176 | 177 | def get_minibatch_unlabeled_qg(ex_list, vocab, device, copy=False, **kargs): 178 | raw_inputs = [ex.logical_form for ex in ex_list] 179 | inputs = [ex.mapped_logical_form for ex in ex_list] if copy else raw_inputs 180 | lens = [len(ex) for ex in inputs] 181 | lens_tensor = torch.tensor(lens, dtype=torch.long, device=device) 182 | 183 | max_len = max(lens) 184 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in inputs] 185 | inputs_idx = [[vocab.lf2id[w] if w in vocab.lf2id else vocab.lf2id[UNK] for w in sent] for sent in padded_inputs] 186 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 187 | 188 | if copy: # pointer network need additional information 189 | oov_list, copy_inputs = [], [] 190 | for sent in inputs: 191 | tmp_oov_list, tmp_copy_inputs = [], [] 192 | for idx, word in enumerate(sent): 193 | if word not in vocab.word2id and word not in tmp_oov_list and len(tmp_oov_list) < MAX_OOV_NUM: 194 | 
tmp_oov_list.append(word) 195 | tmp_copy_inputs.append( 196 | ( 197 | vocab.word2id.get(word, vocab.word2id[UNK]) if word in vocab.word2id or word not in tmp_oov_list \ 198 | else len(vocab.word2id) + tmp_oov_list.index(word) # tgt_vocab_size + oov_id 199 | ) 200 | ) 201 | tmp_oov_list += [UNK] * (MAX_OOV_NUM - len(tmp_oov_list)) 202 | oov_list.append(tmp_oov_list) 203 | copy_inputs.append(tmp_copy_inputs) 204 | 205 | copy_tokens = [ 206 | torch.cat([ 207 | torch.zeros(len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float)\ 208 | .scatter_(-1, torch.tensor(each, dtype=torch.long).unsqueeze(-1), 1.0), 209 | torch.zeros(max_len - len(each), len(vocab.word2id) + MAX_OOV_NUM, dtype=torch.float) 210 | ], dim=0) 211 | for each in copy_inputs 212 | ] 213 | copy_tokens = torch.stack(copy_tokens, dim=0).to(device) # bsize x src_len x (tgt_vocab + MAX_OOV_NUM) 214 | else: 215 | copy_tokens, oov_list = None, [] 216 | 217 | return inputs_tensor, lens_tensor, copy_tokens, oov_list, raw_inputs 218 | 219 | def get_minibatch_pseudo_sp(ex_list, vocab, device, copy=False, **kargs): 220 | inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch_sp(ex_list, vocab, device, copy=copy, **kargs) 221 | conf = torch.tensor([ex.conf for ex in ex_list], dtype=torch.float, device=device) 222 | return inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, conf 223 | 224 | def get_minibatch_pseudo_qg(ex_list, vocab, device, copy=False, **kargs): 225 | inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch_qg(ex_list, vocab, device, copy=copy, **kargs) 226 | conf = torch.tensor([ex.conf for ex in ex_list], dtype=torch.float, device=device) 227 | return inputs, lens, outputs, dec_outputs, out_lens, copy_tokens, conf 228 | 229 | def get_minibatch_lm(ex_list, vocab, device, side='question', **kargs): 230 | if side == 'question': 231 | word2id = vocab.word2id 232 | inputs = [ex.question for ex in ex_list] 233 | else: 234 | word2id = 
vocab.lf2id 235 | inputs = [ex.logical_form for ex in ex_list] 236 | bos_eos_inputs = [[BOS] + sent + [EOS] for sent in inputs] 237 | lens = [len(each) for each in bos_eos_inputs] 238 | max_len = max(lens) 239 | padded_inputs = [sent + [PAD] * (max_len - len(sent)) for sent in bos_eos_inputs] 240 | inputs_idx = [[word2id[w] if w in word2id else word2id[UNK] for w in sent] for sent in padded_inputs] 241 | inputs_tensor = torch.tensor(inputs_idx, dtype=torch.long, device=device) 242 | lens = torch.tensor(lens, dtype=torch.long, device=device) 243 | return inputs_tensor, lens, inputs 244 | 245 | BATCH_FUNC = { 246 | "semantic_parsing": get_minibatch_sp, 247 | "question_generation": get_minibatch_qg, 248 | "unlabeled_semantic_parsing": get_minibatch_unlabeled_sp, 249 | "unlabeled_question_generation": get_minibatch_unlabeled_qg, 250 | "pseudo_semantic_parsing": get_minibatch_pseudo_sp, 251 | "pseudo_question_generation": get_minibatch_pseudo_qg, 252 | "language_model": get_minibatch_lm 253 | } 254 | -------------------------------------------------------------------------------- /utils/bleu.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import os, sys, nltk 3 | from nltk.translate.bleu_score import sentence_bleu, corpus_bleu 4 | from nltk.translate.bleu_score import SmoothingFunction 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | 7 | def get_bleu_score(candidate_list, references_list, method=0, weights=(0.25, 0.25, 0.25, 0.25)): 8 | ''' 9 | @args: 10 | if candidate_list is words list, e.g. ['which','flight'] 11 | references_list is list of words list, e.g. [ ['which','flight'] , ['what','flight'] ] 12 | calculate bleu score of one sentence 13 | if candidate_list is list of words list, e.g. [ ['which','flight'] , ['when','to','flight'] ] 14 | references_list is list of list of words list, e.g. 
15 | [ [ ['which','flight'] , ['what','flight'] ] , [ ['when','to','flight'] , ['when','to','go'] ] ] 16 | calculate bleu score of multiple sentences, a whole corpus 17 | method(int): chencherry smoothing methods choice 18 | ''' 19 | chencherry = SmoothingFunction() 20 | if len(candidate_list) == 0: 21 | raise ValueError('[Error]: there is no candidate sentence!') 22 | if type(candidate_list[0]) == str: 23 | return sentence_bleu( 24 | references_list, 25 | candidate_list, 26 | weights, 27 | eval('chencherry.method' + str(method)) 28 | ) 29 | else: 30 | return corpus_bleu( 31 | references_list, 32 | candidate_list, 33 | weights, 34 | eval('chencherry.method' + str(method)) 35 | ) -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | BOS = '' 3 | EOS = '' 4 | PAD = '' 5 | UNK = '' 6 | MAX_DECODE_LENGTH = 100 7 | MAX_OOV_NUM = 50 8 | VECTORCACHE = lambda emb_dim: 'data/.cache/glove.6B.' 
#coding=utf8
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from utils.domain.domain_base import Domain
from utils.domain.atis_evaluator import ATISEvaluator

class ATISDomain(Domain):
    """Domain for ATIS lambda-calculus logical forms.

    Validity is judged structurally (bracket matching + lisp-tree parsing) plus a
    type-consistency check from ATISEvaluator; denotations are never executed
    (self.denotation is False).
    """

    def __init__(self):

        self.dataset = 'atis'
        self.denotation = False    # compare logical forms directly, not denotations
        self.evaluator = ATISEvaluator()

    def to_lisp_tree(self, toks):
        '''
        Parse a flat token list into a recursive lisp tree; returns None on failure.
        input(list): ['lambda', '$0', 'e', '(', 'flight', '$0', ')']
        return(recursive list): ['lambda', '$0', 'e', ['flight', '$0']]
        '''
        def recurse(i):
            # Parse the (sub)expression starting at token i.
            # Returns (subtree, index of the first token after it).
            if toks[i] == '(':
                subtrees = []
                j = i + 1
                while True:
                    subtree, j = recurse(j)
                    subtrees.append(subtree)
                    if toks[j] == ')':
                        return subtrees, j + 1
            else:
                return toks[i], i+1

        try:
            lisp_tree, final_ind = recurse(0)
            return lisp_tree
        except Exception as e:
            # Unbalanced brackets / index overrun on malformed input: signal with None
            return None

    def sort_args(self, lf):
        # Canonicalize a logical form (token list) by sorting the operands of
        # and/or subtrees, so logically equivalent forms compare equal as strings.
        lisp_tree = self.to_lisp_tree(lf)
        if lisp_tree is None: # failed to convert to logical tree
            return ' '.join(lf)

        def recurse(node): # Post-order traversal, sort and/or subtrees
            if isinstance(node, str):
                return
            for child in node:
                recurse(child)
            if node[0] in ('_and', '_or', 'and', 'or'):
                node[1:] = sorted(node[1:], key=lambda x: str(x))

        recurse(lisp_tree)

        def tree_to_str(node):
            # Flatten the (sorted) tree back into a bracketed token string
            if isinstance(node, str):
                return node
            else:
                return '( %s )' % ' '.join(tree_to_str(child) for child in node)

        return tree_to_str(lisp_tree)

    def normalize(self, lf_list):
        # lf_list: list of token lists; returns canonicalized strings
        sorted_lf_list = [self.sort_args(lf) for lf in lf_list]
        return sorted_lf_list

    def is_valid(self, ans_list):
        # Score each logical form string in [0, 1]:
        # 0.5 for being parseable (balanced brackets + lisp tree),
        # plus 0.5 weighted by the evaluator's type-consistency result.

        def bracket_matching(lf):
            left = 0
            for each in lf:
                if each == '(':
                    left += 1
                elif each == ')':
                    left -= 1
                if left < 0:
                    return 0.0
            return 1.0 if left == 0 else 0.0

        ans_list = [[i.strip() for i in lf.split(' ') if i.strip() != ''] for lf in ans_list]
        bracket_signal = list(map(bracket_matching, ans_list))
        lisp_trees = [self.to_lisp_tree(each) if bracket_signal[idx] == 1.0 else None for idx, each in enumerate(ans_list)]
        type_consistency = [self.evaluator.eval(each) if each is not None else 0.0 for each in lisp_trees]
        return list(map(lambda x, y: (0.5 if x is not None else 0.0) + 0.5 * y, lisp_trees, type_consistency))
#coding=utf8
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from utils.constants import BOS, EOS, PAD, UNK
from utils.bleu import get_bleu_score

class Domain():
    """Base class for dataset-specific evaluation domains (ATIS, GEO, Overnight).

    Subclasses set self.dataset and self.denotation, and override normalize /
    obtain_denotations / is_valid as needed.
    """

    def __init__(self):
        super(Domain, self).__init__()
        self.dataset = None        # dataset name, filled in by subclasses
        self.denotation = False    # True if accuracy is computed on executed denotations

    @classmethod
    def from_dataset(cls, dataset):
        # Factory: pick the concrete Domain subclass for a dataset name.
        # Imports are local to avoid circular imports with the subclasses.
        if dataset == 'atis':
            from utils.domain.domain_atis import ATISDomain
            return ATISDomain()
        elif dataset == 'geo':
            from utils.domain.domain_geo import GEODomain
            return GEODomain()
        else:
            from utils.domain.domain_overnight import OvernightDomain
            return OvernightDomain(dataset)

    def reverse(self, idx_list, vocab, end_mask=EOS, oov_list=[],
            special_list=[BOS, EOS, PAD]):
        '''
        Change idx list to token list without special tokens.
        @args:
            1. idx_list: list of idx list, not tensor
            2. vocab: idx to token list
            3. end_mask: stop parsing when meets this token
            4. oov_list: out of tgt vocab words, but in src inputs; when given,
               idx_list is assumed to hold n_best hypotheses per source example
            5. special_list: remove these tokens in sequence, list of symbols
        @return:
            token list
        '''
        # ids >= len(vocab) index into the per-example OOV list (copy mechanism);
        # int(idx / n_best) maps a hypothesis back to its source example
        n_best = len(idx_list) / len(oov_list) if oov_list else None
        seq = [
            [
                oov_list[int(idx / n_best)][tok - len(vocab)] if tok >= len(vocab) else vocab[tok] for tok in tokens
            ]
            for idx, tokens in enumerate(idx_list)
        ] if oov_list else [[vocab[tok] if tok < len(vocab) else UNK for tok in tokens] for tokens in idx_list]

        def trim(s, t):
            # Keep tokens up to (excluding) the first occurrence of t
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w)
            return sentence

        result = [trim(ex, end_mask) for ex in seq]

        def filter_special(tok):
            return tok not in special_list

        result = [list(filter(filter_special, ex)) for ex in result]
        return result

    def compare_question(self, predictions, references):
        """
        predictions and references should be list of token list;
        returns a sentence-level BLEU score per prediction (references are
        repeated to cover n_best predictions per source).
        """
        n_best = int(len(predictions) / len(references))
        references = [[ref] for ref in references for _ in range(n_best)]
        bleu_list = list(map(get_bleu_score, predictions, references)) # sentence-level bleu score
        return bleu_list

    def compare_logical_form(self, predictions, references, pick=True):
        """
        predictions and references should be list of token list
        pick(bool): pick the first prediction without syntax or execution error if n_best > 1
        Returns a 1.0/0.0 exact-match (or denotation-match) list.
        """
        predictions = self.normalize(predictions)
        references = self.normalize(references)
        n_best = int(len(predictions) / len(references))
        if self.denotation:
            # Execute everything in one batch, then split back
            all_lf = predictions + references
            denotations = self.obtain_denotations(all_lf)
            predictions, references = denotations[:len(predictions)], denotations[len(predictions):]
        if pick:
            predictions, _ = self.pick_predictions(predictions, n_best)
        else:
            references = [each for each in references for _ in range(n_best)]
        return list(map(lambda x, y: 1.0 if x == y else 0.0, predictions, references))

    def normalize(self, lf_list):
        """
        Normalize each logical form, at least changes token list into string list
        """
        return [' '.join(lf) for lf in lf_list]

    def obtain_denotations(self, lf_list):
        """
        Obtain denotations for each logical form (identity by default;
        subclasses run the real executor)
        """
        return lf_list

    def pick_predictions(self, pred_ans, n_best=1):
        # From each group of n_best hypotheses, keep the first valid one
        # (falling back to the group's first hypothesis if none is valid).
        # Returns (picked answers, their indices in pred_ans).
        if n_best == 1:
            return pred_ans, [i for i in range(len(pred_ans))]
        flags = self.is_valid(pred_ans)
        batches = int(len(pred_ans) / n_best)
        return_ans, return_idx = [], []
        for idx in range(batches):
            for j in range(n_best):
                if int(flags[idx * n_best + j]) == 1:
                    return_ans.append(pred_ans[idx * n_best + j])
                    return_idx.append(idx * n_best + j)
                    break
            else:
                return_ans.append(pred_ans[idx * n_best])
                return_idx.append(idx * n_best)
        return return_ans, return_idx

    def is_valid(self, ans_list):
        """
        Check whether ans is syntax or semantic invalid
        ans_list(str list): denotation list or logical form list
        """
        # BUG FIX: the original did `raise [1.0 ...]`, which is a TypeError in
        # Python 3 (only BaseException subclasses can be raised). The default
        # behaviour is clearly to treat every answer as valid, i.e. return.
        return [1.0 for _ in range(len(ans_list))]
remove unnecessary space, except spaces in entities 22 | """ 23 | toks, quoted_toks, in_quotes = [], [], False 24 | for t in lf: 25 | if in_quotes: 26 | if t == "'": # entity ending 27 | toks.append('"%s"' % ' '.join(quoted_toks)) 28 | in_quotes, quoted_toks = False, [] 29 | else: 30 | quoted_toks.append(t) 31 | else: 32 | if t == "'": # entity start 33 | in_quotes = True 34 | else: 35 | if len(t) > 1 and t.startswith('_'): # predicate remove prefix _ 36 | toks.append(t[1:]) 37 | else: 38 | toks.append(t) 39 | return ''.join(toks) 40 | return [format_geo(lf) for lf in lf_list] 41 | 42 | def obtain_denotations(self, lf_list): 43 | tf = tempfile.NamedTemporaryFile('w+t', encoding='utf8', suffix='.dlog') 44 | tf_lines = ['_parse([query], %s).' % lf for lf in lf_list] 45 | for line in tf_lines: 46 | tf.write(line + '\n') 47 | tf.flush() 48 | msg = subprocess.check_output(['evaluator/geoquery', tf.name]) 49 | msg = msg.decode('utf8') 50 | tf.close() 51 | 52 | def get_denotation(line): 53 | m = re.search('\{[^}]*\}', line) 54 | if m: 55 | return m.group(0) 56 | else: 57 | return line.strip() 58 | 59 | denotations = [ 60 | get_denotation(line) 61 | for line in msg.split('\n') 62 | if line.startswith(' Example') 63 | ] 64 | return denotations 65 | 66 | def is_valid(self, ans_list): 67 | return list(map(lambda ans: 0.0 if 'FAILED' in ans or 'Join failed syntactically' in ans else 1.0, ans_list)) 68 | -------------------------------------------------------------------------------- /utils/domain/domain_overnight.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import sys, os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 4 | from utils.domain.domain_base import Domain 5 | import tempfile 6 | import subprocess 7 | import re 8 | 9 | class OvernightDomain(Domain): 10 | 11 | def __init__(self, dataset): 12 | self.dataset = dataset 13 | self.denotation = True 14 | 15 | def 
normalize(self, lf_list): 16 | lf_list = [' '.join(lf) for lf in lf_list] 17 | 18 | def format_overnight(lf): 19 | replacements = [ 20 | ('(', ' ( '), # make sure ( and ) must have blank space around 21 | (')', ' ) '), 22 | ('! ', '!'), 23 | ('SW', 'edu.stanford.nlp.sempre.overnight.SimpleWorld'), 24 | ] 25 | for a, b in replacements: 26 | lf = lf.replace(a, b) 27 | # remove redundant blank spaces 28 | lf = re.sub(' +', ' ', lf) 29 | return lf.strip() 30 | 31 | return [format_overnight(lf) for lf in lf_list] 32 | 33 | def obtain_denotations(self, lf_list): 34 | tf = tempfile.NamedTemporaryFile('w+t', encoding='utf8', suffix='.examples') 35 | for line in lf_list: 36 | tf.write(line + '\n') 37 | tf.flush() 38 | msg = subprocess.check_output(['evaluator/overnight', self.dataset, tf.name]) 39 | msg = msg.decode('utf8') 40 | tf.close() 41 | denotations = [ 42 | line.split('\t')[1] for line in msg.split('\n') 43 | if line.startswith('targetValue\t') 44 | ] 45 | return denotations 46 | 47 | def is_valid(self, ans_list): 48 | return list(map(lambda ans: 0.0 if 'BADJAVA' in ans or 'ERROR' in ans or ans == 'null' else 1.0, ans_list)) 49 | -------------------------------------------------------------------------------- /utils/example.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import numpy as np 3 | import sys, os 4 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 5 | from utils.lexicon import Lexicon 6 | from utils.domain.domain_base import Domain 7 | 8 | def split_dataset(dataset, split_ratio=1.0): 9 | assert split_ratio >= 0. 
def split_dataset(dataset, split_ratio=1.0):
    # Randomly split dataset into two disjoint parts: the first holding
    # int(len * split_ratio) examples, the second the remainder.
    assert split_ratio >= 0. and split_ratio <= 1.0
    index = np.arange(len(dataset))
    np.random.shuffle(index)
    splt = int(len(dataset) * split_ratio)
    first = [dataset[idx] for idx in index[:splt]]
    second = [dataset[idx] for idx in index[splt:]]
    return first, second

class Example():
    """One (question, logical form) pair plus entity-mapped variants for pointer networks.

    Example.set_domain(dataset) must be called once before constructing instances:
    it binds the lexicon, the Domain evaluator and the data file paths to the class.
    """

    __slots__ = ('question', 'logical_form', "mapped_question", "mapped_logical_form", "conf")

    @classmethod
    def set_domain(cls, dataset):
        # Bind dataset-specific resources to the class (shared by all instances)
        cls.dataset = dataset # dataset name
        cls.db = Lexicon(dataset)
        cls.domain = Domain.from_dataset(dataset) # class Domain object
        if dataset in ['geo', 'atis']:
            # geo/atis ship explicit train/dev/test splits
            cls.file_paths = [
                os.path.join('data', dataset, dataset + '_train.tsv'),
                os.path.join('data', dataset, dataset + '_dev.tsv'),
                os.path.join('data', dataset, dataset + '_test.tsv')
            ]
            cls.extra_path = os.path.join('data', dataset, dataset + '_extra.tsv')
        else: #Overnight
            # Overnight subdomains have only train/test; dev is split off later
            cls.file_paths = [
                os.path.join('data', 'overnight', dataset + '_train.tsv'),
                os.path.join('data', 'overnight', dataset + '_test.tsv')
            ]
            cls.extra_path = os.path.join('data', 'overnight', dataset + '_extra.tsv')

    def __init__(self, question='', logical_form='', conf=1.0):
        """
        @args:
            question: space-separated natural language question
            logical_form: space-separated logical form
            conf: confidence weight (1.0 for gold data; pseudo-labeled data may be lower)
        """
        super(Example, self).__init__()
        # Tokenize on single spaces, dropping empty tokens
        self.question = [each for each in question.split(' ') if each != '']
        self.logical_form = [each for each in logical_form.split(' ') if each != '']
        # Entity-mapped views consumed by the copy/pointer mechanism
        self.mapped_question = Example.db.entity_mapping(self.question)
        self.mapped_logical_form = Example.db.reverse_entity_mapping(self.logical_form, self.question)
        self.conf = conf

    @classmethod
    def load_dataset(cls, choice='train'):
        """
        return example list of train, test or extra
        """
        if choice == 'train':
            if len(cls.file_paths) == 2:
                # no dev dataset, split train dataset
                train_dataset = cls.load_dataset_from_file(cls.file_paths[0])
                train_dataset, dev_dataset = split_dataset(train_dataset, split_ratio=0.8)
            else:
                assert len(cls.file_paths) == 3
                train_dataset = cls.load_dataset_from_file(cls.file_paths[0])
                dev_dataset = cls.load_dataset_from_file(cls.file_paths[1])
            return train_dataset, dev_dataset
        elif choice == 'test':
            test_dataset = cls.load_dataset_from_file(cls.file_paths[-1])
            return test_dataset
        else:
            extra_dataset = cls.load_dataset_from_file(cls.extra_path)
            return extra_dataset

    @classmethod
    def load_dataset_from_file(cls, path):
        # Read a tab-separated "question<TAB>logical form" file, skipping blank lines
        ex_list = []
        with open(path, 'r') as infile:
            for line in infile:
                line = line.strip()
                if line == '': continue
                q, lf = line.split('\t')
                ex_list.append(cls(q.strip(), lf.strip()))
        return ex_list
-------------------------------------------------------------------------------- /utils/gpu.py:
#coding=utf8
import os, sys, math
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# special case in our remote server, just ignore
if '/cm/local/apps/cuda/libs/current/pynvml' in sys.path:
    sys.path.remove('/cm/local/apps/cuda/libs/current/pynvml')
import gpustat
import torch

def set_torch_device(deviceId):
    # Simplified version of gpu selection:
    # deviceId < 0 selects CPU, otherwise CUDA device with that index
    if deviceId < 0:
        device = torch.device("cpu")
        print('Use CPU ...')
    else:
        assert torch.cuda.device_count() >= deviceId + 1
        device = torch.device("cuda:%d" % (deviceId))
        print('Use GPU with index %d' % (deviceId))
        # os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # used when debug
        ## These two sentences are used to ensure reproducibility with cudnnbacken
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    return device

if __name__ == '__main__':

    set_torch_device(0)
#coding=utf8
'''
Construct exp directory according to hyper parameters
'''
import os

EXP_PATH = 'exp'


def _rnn_tag(options):
    """Shared encoder/decoder architecture part of an exp name (cell/emb/hidden/dropout)."""
    return ('cell_%s__' % (options.cell)
            + 'emb_%s__' % (options.emb_size)
            + 'hidden_%s_x_%s__' % (options.hidden_dim, options.num_layers)
            + 'dropout_%s__' % (options.dropout))


def _optim_tag(options):
    """Shared optimization part of an exp name (reduction/lr/max-norm/l2/batch size)."""
    return ('reduce_%s__' % (options.reduction)
            + 'lr_%s__' % (options.lr)
            + 'mn_%s__' % (options.max_norm)
            + 'l2_%s__' % (options.l2)
            + 'bsize_%s__' % (options.batchSize))


def _pretrained_tag(options):
    """Architecture tags of the pretrained sp/qg models, inferred from their checkpoint paths."""
    # A pointer (copy) model's exp dir contains the 'copy__' marker (see hyperparam_seq2seq).
    tag = 'sp_attnptr__' if 'copy__' in options.read_sp_model_path else 'sp_attn__'
    tag += 'qg_attnptr__' if 'copy__' in options.read_qg_model_path else 'qg_attn__'
    return tag


def hyperparam_seq2seq(options):
    """Hyperparam string for semantic parsing and question generation."""
    task_path = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s' % (options.labeled)

    exp_name = 'copy__' if options.copy else ''
    exp_name += _rnn_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s' % (options.n_best)
    return os.path.join(EXP_PATH, task_path, dataset_path, ratio, exp_name)


def hyperparam_lm(options):
    """Hyperparam string for the question/logical-form language models."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = '%s__labeled_%s' % (options.side, options.labeled)

    exp_name = _rnn_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s' % (options.max_epoch)
    exp_name += '__decTied' if options.decoder_tied else ''
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)


def hyperparam_pseudo_method(options):
    """Hyperparam string for the pseudo-labeling baseline."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s__unlabeled_%s' % (options.labeled, options.unlabeled)
    ratio += '__extra' if options.extra else ''

    exp_name = _pretrained_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s__' % (options.n_best)
    exp_name += 'discount_%s__method_%s' % (options.discount, options.method)
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)


def hyperparam_dual_learning(options):
    """Hyperparam string for dual learning experiments."""
    task = 'task_%s' % (options.task)
    dataset_path = 'dataset_%s' % (options.dataset)
    ratio = 'labeled_%s__unlabeled_%s' % (options.labeled, options.unlabeled)
    ratio += '__extra' if options.extra else ''

    exp_name = _pretrained_tag(options)
    exp_name += _optim_tag(options)
    exp_name += 'me_%s__' % (options.max_epoch)
    exp_name += 'beam_%s__' % (options.beam)
    exp_name += 'nbest_%s__' % (options.n_best)
    exp_name += 'cycle_%s__' % (options.cycle)
    exp_name += 'sample_%s__alpha_%s__beta_%s' % (options.sample, options.alpha, options.beta)
    return os.path.join(EXP_PATH, task, dataset_path, ratio, exp_name)
#coding=utf8
import random, os
import collections
import itertools

class Lexicon():
    """
    A Lexicon class used for entity mapping and reverse entity mapping (in pointer network)

    1. Entity mapping: mapping word phrase into entity, replaced after copy
        ['in', 'which', 'seasons', 'kob', 'bryant', 'made', '3', 'blocks'] => (kob bryant, en.player.kobe_bryant)
        ==> ['in', 'which', 'seasons', 'en.player.kobe_bryant', 'en.player.kobe_bryant', 'made', '3', 'blocks']
        For word phrases with multiple choices, entity that matches longer spans takes precedence (Longest Match First)

    2. Reverse entity mapping: transform input logical form entities into natural phrases, replaced before copy (actually before feeding into network)
        ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'flight', '$0', ')', '(', 'during_day', '$0', 'late:pd', ')'] => (late:pd, late flight|late|night)
        ==> ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'flight', '$0', ')', '(', 'during_day', '$0', 'late', 'flight', ')']
        Randomly select one natural phrase from multiple choices if question word is not available
        Otherwise, use exactly the longest word phrase in the question.
        Attention: Remember to add late, flight to logical form vocabulary
    """
    def __init__(self, dataset):
        super(Lexicon, self).__init__()
        # phrase2entity: noun phrase -> entity id; entity2phrase: entity id -> phrases (longest first)
        self.phrase2entity = collections.OrderedDict()
        self.entity2phrase = collections.OrderedDict()
        self.seen_words = set()
        self._load_lexicon(dataset)

    def _load_lexicon(self, dataset):
        """Read '<phrase> :- NP : <entity>' entries from the dataset's lexicon file."""
        entries = []
        if dataset in ['atis', 'geo']:
            lexicon_path = os.path.join('data', dataset, dataset + '_lexicon.txt')
        else:
            lexicon_path = os.path.join('data', 'overnight', dataset + '_lexicon.txt')
        print('Start load lexicon from file %s ...' % (lexicon_path))
        with open(lexicon_path, 'r') as f:
            for line in f:
                line = line.strip()
                if line == '': continue
                x, y = line.split(' :- NP : ')
                entries.append((x.strip(), y.strip()))
        self._add_entries(entries)

    def _add_entries(self, entries):
        """Index (phrase, entity) pairs into both mapping tables."""
        for name, entity in entries:
            if entity not in self.entity2phrase:
                self.entity2phrase[entity] = [name]
            elif name not in self.entity2phrase[entity]:
                self.entity2phrase[entity].append(name)
            if name in self.phrase2entity:
                if self.phrase2entity[name] != entity: # we do not handle entity disambiguation
                    # FIX: was self.entries[name] — that attribute never exists, so any
                    # lexicon collision crashed with AttributeError instead of warning.
                    print('Collision detected: %s -> %s, %s' % (name, self.phrase2entity[name], entity))
                continue
            # Update self.seen_words
            for w in name.split(' '):
                self.seen_words.add(w)
            self.phrase2entity[name] = entity
        for entity in self.entity2phrase: # sorted according to length of noun phrases
            self.entity2phrase[entity] = sorted(self.entity2phrase[entity], key=lambda x: len(x), reverse=True)

    def entity_mapping(self, words):
        """
        @args:
            words: a list of words
        @return:
            mapped_words: a list of words, where words[i] is replaced with entity if available
        """
        entities = ['' for i in range(len(words))]
        # All (i, j) spans, longest spans first (x[0] - x[1] is most negative for long spans)
        index_pairs = sorted(list(itertools.combinations(range(len(words) + 1), 2)),
            key=lambda x: x[0] - x[1])
        ret_entries = []

        for i, j in index_pairs:
            # Longest match first
            if any(x for x in entities[i: j]): continue
            span = ' '.join(words[i: j])
            if span in self.phrase2entity:
                entity = self.phrase2entity[span]
                for k in range(i, j):
                    entities[k] = entity
                ret_entries.append(((i, j), entity))
        mapped_words = [words[idx] if not item else item for idx, item in enumerate(entities)]
        return mapped_words

    def reverse_entity_mapping(self, tokens, words=None):
        """
        @args:
            tokens: a list of logical form tokens
            words: a list of words if available
        @return:
            mapped_tokens: a list of tokens, where tokens[i] is replaced with noun phrases if available,
                prefer to use raw noun phrase in words if available
        """
        entities = ['' for each in tokens]
        words = ' '.join(words) if words and words != ['none'] else None
        for idx, tok in enumerate(tokens):
            if tok in self.entity2phrase:
                if words:
                    choices = self.entity2phrase[tok]
                    # choices are sorted longest-first, so the first phrase found in the
                    # question is the longest matching one
                    among_words = list(filter(lambda item: item in words, choices))
                    if len(among_words) > 0:
                        entities[idx] = among_words[0]
                    else:
                        entities[idx] = random.choice(self.entity2phrase[tok])
                else:
                    entities[idx] = random.choice(self.entity2phrase[tok])
        mapped_words = [tokens[idx] if not item else item for idx, item in enumerate(entities)]
        # re-split because a phrase may contain several words
        return ' '.join(mapped_words).split(' ')
#coding=utf8
import sys, logging

def set_logger(exp_path, testing=False):
    """Build the shared 'mylogger' writing to exp_path/log_{test,train}.txt and stdout."""
    formatter = logging.Formatter('%(asctime)s - %(message)s') #('%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger('mylogger')
    logger.setLevel(logging.DEBUG)
    log_name = 'log_test.txt' if testing else 'log_train.txt'
    file_handler = logging.FileHandler('%s/%s' % (exp_path, log_name), mode='w')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    return logger

#coding=utf8
'''
Set loss function, allow different confidence for different training samples
'''
import torch
import torch.nn as nn

def set_loss_function(reduction='sum', ignore_index=-100):
    """Return a MyNLLLoss configured with the given reduction and ignore_index."""
    return MyNLLLoss(reduction=reduction, ignore_index=ignore_index)

class MyNLLLoss(nn.Module):
    """NLLLoss over (bsize, seq_len, voc_size) log-probs with optional per-sample confidence weights."""

    def __init__(self, *args, **kargs):
        super(MyNLLLoss, self).__init__()
        # Reduce per-element losses ourselves so each sample can be confidence-weighted.
        self.real_reduction = kargs.pop('reduction', 'sum')
        kargs['reduction'] = 'none'
        self.loss_function = nn.NLLLoss(*args, **kargs)

    def forward(self, inputs, targets, lens=None, conf=None):
        """inputs: (bsize, seq_len, voc_size) log-probs; targets: (bsize, seq_len) ids.

        lens is required only for 'mean' reduction (per-token normalization);
        conf defaults to uniform weight 1 for every sample.
        """
        if conf is None:
            conf = torch.ones(inputs.size(0), dtype=torch.float).to(inputs.device)
        bsize, seq_len, voc_size = list(inputs.size())
        flat_loss = self.loss_function(inputs.contiguous().view(-1, voc_size), targets.contiguous().view(-1))
        per_sample = flat_loss.contiguous().view(bsize, seq_len).sum(dim=1)
        if self.real_reduction == 'sum':
            return (per_sample * conf).sum()
        per_sample = per_sample / lens.float()
        return (per_sample * conf).sum() / conf.sum()
#coding=utf8
'''
Set optimizer for train_model
'''
import torch
import torch.nn as nn
from torch.optim import Adam

def set_optimizer(*args, lr=1e-3, l2=1e-5, max_norm=5):
    """Build a MyAdam optimizer over the trainable parameters of the given model(s).

    Bias parameters are exempt from weight decay; every other trainable
    parameter gets weight_decay=l2.
    @args:
        *args: one or more nn.Module instances
    @return:
        MyAdam optimizer with two parameter groups (decayed, non-decayed)
    """
    params = []
    for train_model in args:
        params += list(train_model.named_parameters())
    # FIX: dict.fromkeys deduplicates parameters shared between models while
    # preserving first-seen order; the previous list(set(...)) yielded a
    # nondeterministic parameter order across runs, undermining reproducibility.
    decay_params = list(dict.fromkeys(p for n, p in params if p.requires_grad and 'bias' not in n))
    no_decay_params = list(dict.fromkeys(p for n, p in params if p.requires_grad and 'bias' in n))
    grouped_params = [
        {'params': decay_params, 'weight_decay': l2},
        {'params': no_decay_params, 'weight_decay': 0.0}
    ]
    optimizer = MyAdam(grouped_params, lr=lr, max_norm=max_norm)
    return optimizer

class MyAdam(Adam):
    """
    Add clip_grad_norm_ for Optimizer Adam
    """
    def __init__(self, *args, **kargs):
        # max_norm <= 0 disables gradient clipping
        self.max_norm = kargs.pop('max_norm', -1)
        super(MyAdam, self).__init__(*args, **kargs)

    def step(self, *args, **kargs):
        """Clip each parameter group's gradient norm, then take a normal Adam step."""
        if self.max_norm > 0:
            for group in self.param_groups:
                torch.nn.utils.clip_grad_norm_(group['params'], self.max_norm)
        super(MyAdam, self).step(*args, **kargs)

#coding=utf8
import random, torch
import numpy as np

def set_random_seed(random_seed=999, device='cuda'):
    """Seed python, torch (CPU/CUDA) and numpy RNGs for reproducibility."""
    random.seed(random_seed)
    torch.manual_seed(random_seed)
    if torch.cuda.is_available():
        if device != 'cuda':
            print("WARNING: You have a CUDA device, so you should probably run with --deviceId [1|2|3]")
        else:
            torch.cuda.manual_seed(random_seed)
    np.random.seed(random_seed)

#coding=utf8

class Solver():
    """Base class for all task solvers; stores the shared training components."""

    def __init__(self, model, vocab, loss_function, optimizer, exp_path, logger, device=None, **kargs):
        super(Solver, self).__init__()
        self.model = model
        self.vocab = vocab
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.exp_path = exp_path
        self.logger = logger
        self.device = device

    def decode(self, data_inputs, output_path, test_batchSize, beam_size=5, n_best=1):
        # NOTE(review): subclasses override this with keyword `beam=` instead of
        # `beam_size=` — a signature inconsistency worth unifying eventually.
        raise NotImplementedError

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam_size=5, n_best=1):
        raise NotImplementedError
#coding=utf8
import os, sys, time, gc
import numpy as np
import torch
from utils.constants import *
from utils.batch import get_minibatch
from utils.example import Example
from utils.bleu import get_bleu_score
from utils.solver.solver_base import Solver

class DualLearningSolver(Solver):
    '''
    For Dual Learning Solver

    Trains the coupled semantic parsing (sp) and question generation (qg) models
    with dual-learning cycles on unlabeled data plus standard supervised batches.
    self.vocab / self.device / self.loss_function are dicts keyed by 'sp' / 'qg'.
    '''
    def __init__(self, *args, **kargs):
        super(DualLearningSolver, self).__init__(*args, **kargs)
        # Best metrics seen so far for each direction (tracked independently).
        self.best_result = {
            "iter_sp": 0, "dev_acc": 0., "test_acc": 0.,
            "iter_qg": 0, "dev_bleu": 0., "test_bleu": 0.
        }

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Evaluate both directions on data_inputs; write predictions to output_path.

        Returns (semantic-parsing accuracy, question-generation corpus BLEU).
        """
        data_index = np.arange(len(data_inputs))
        nsentences = len(data_index)
        domain = Example.domain
        total, candidate_list, references_list = [], [], []
        ########################### Evaluation Phase ############################
        with open(output_path, 'w') as of:
            self.model.eval()
            # Pass 1: semantic parsing (question -> logical form), exact-match accuracy.
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab['sp'], task='semantic_parsing', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                ############################ Forward Model ############################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab['sp'].lf2id, copy_tokens, task='semantic_parsing', beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Flatten the n_best hypotheses of every example into one list.
                predictions = [pred for each in predictions for pred in each]
                predictions = domain.reverse(predictions, self.vocab['sp'].id2lf, oov_list=oov_list)
                accuracy = domain.compare_logical_form(predictions, raw_outputs, pick=True)
                total.extend(accuracy)
                ############################ Write result to file ############################
                for idx in range(len(raw_inputs)):
                    of.write("Utterance: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    for i in range(n_best):
                        of.write("Pred" + str(i) + ": " + ' '.join(predictions[n_best * idx + i]) + '\n')
                    of.write("Correct: " + ("True" if accuracy[idx] == 1 else "False") + '\n\n')

            of.write('=' * 50 + '\n' + '=' * 50 + '\n\n')

            # Pass 2: question generation (logical form -> question), BLEU.
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab['qg'], task='question_generation', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                ########################## Beam Search/Greed Decode #######################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab['qg'].word2id, copy_tokens, task='question_generation', beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Keep only the top hypothesis per example for BLEU.
                predictions = [each[0] for each in predictions]
                predictions = domain.reverse(predictions, self.vocab['qg'].id2word, oov_list=oov_list)
                bleu_scores = domain.compare_question(predictions, raw_outputs)
                candidate_list.extend(predictions)
                references_list.extend([[ref] for ref in raw_outputs])
                ############################# Writing Result to File ###########################
                for idx in range(len(raw_inputs)):
                    of.write("LogicalForm: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    of.write("Pred0: " + ' '.join(predictions[idx]) + '\n')
                    of.write("Bleu: " + str(bleu_scores[idx]) + '\n\n')
            ########################### Calculate accuracy ###########################
            acc = sum(total) / float(len(total))
            avg_bleu = get_bleu_score(candidate_list, references_list)
            of.write('Overall accuracy: %.4f | Overall bleu score: %.4f' % (acc, avg_bleu))
        return acc, avg_bleu

    def train_and_decode(self, labeled_train_dataset, q_unlabeled_train_dataset, lf_unlabeled_train_dataset, dev_dataset, test_dataset,
            batchSize, test_batchSize, cycle='sp+qg', max_epoch=100, beam=5, n_best=1):
        """Dual-learning training loop.

        Per batch: (1) cycle losses starting from sp and/or qg on unlabeled data
        (controlled by `cycle`), (2) supervised losses on labeled data for both
        models; gradients from all parts accumulate before a single optimizer step.
        """
        sp_unlabeled_train_index = np.arange(len(q_unlabeled_train_dataset))
        qg_unlabeled_train_index = np.arange(len(lf_unlabeled_train_dataset))
        labeled_train_index = np.arange(len(labeled_train_dataset))
        # Iterate as long as the largest of the three data sources per epoch.
        nsentences = max([len(q_unlabeled_train_dataset), len(lf_unlabeled_train_dataset), len(labeled_train_dataset)])
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(sp_unlabeled_train_index)
            np.random.shuffle(qg_unlabeled_train_index)
            np.random.shuffle(labeled_train_index)
            losses = { 'sp': [], 'qg': [] }
            self.model.train()
            for j in range(0, nsentences, batchSize):
                self.model.zero_grad()

                ''' Cycle start from Semantic Parsing '''
                if 'sp' in cycle:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, copy_tokens, oov_list, raw_in = get_minibatch(q_unlabeled_train_dataset, self.vocab['sp'], task='unlabeled_semantic_parsing',
                        data_index=sp_unlabeled_train_index, index=j, batch_size=batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                    ######################## Forward Model ##########################
                    sp_loss, qg_loss = self.model(inputs, lens, copy_tokens, oov_list, raw_in, start_from='semantic_parsing')
                    losses['sp'].append(sp_loss.item())
                    losses['qg'].append(qg_loss.item())
                    sp_loss.backward()
                    qg_loss.backward()

                ''' Cycle start from Question Generation '''
                if 'qg' in cycle:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, copy_tokens, oov_list, raw_in = get_minibatch(lf_unlabeled_train_dataset, self.vocab['qg'], task='unlabeled_question_generation',
                        data_index=qg_unlabeled_train_index, index=j, batch_size=batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                    ########################### Forward Model ########################
                    sp_loss, qg_loss = self.model(inputs, lens, copy_tokens, oov_list, raw_in, start_from='question_generation')
                    losses['sp'].append(sp_loss.item())
                    losses['qg'].append(qg_loss.item())
                    sp_loss.backward()
                    qg_loss.backward()

                ''' Supervised Training '''
                if True:
                    ###################### Obtain minibatch data ######################
                    inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                        labeled_train_dataset, self.vocab['sp'], task='semantic_parsing',
                        data_index=labeled_train_index, index=j, batch_size=batchSize, device=self.device['sp'], copy=self.model.sp_model.copy)
                    ############################ Forward Model ############################
                    # Teacher forcing: feed gold prefix, score next-token targets.
                    batch_scores = self.model.sp_model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                    batch_loss = self.loss_function['sp'](batch_scores, dec_outputs[:, 1:], out_lens - 1)
                    losses['sp'].append(batch_loss.item())
                    batch_loss.backward()

                    ###################### Obtain minibatch data ######################
                    inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                        labeled_train_dataset, self.vocab['qg'], task='question_generation',
                        data_index=labeled_train_index, index=j, batch_size=batchSize, device=self.device['qg'], copy=self.model.qg_model.copy)
                    ############################ Forward Model ############################
                    batch_scores = self.model.qg_model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                    batch_loss = self.loss_function['qg'](batch_scores, dec_outputs[:, 1:], out_lens - 1)
                    losses['qg'].append(batch_loss.item())
                    batch_loss.backward()

                # Single optimizer step over all accumulated gradients of this batch.
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()
                gc.collect()
                torch.cuda.empty_cache()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            sp_loss, qg_loss = np.sum(losses['sp'], axis=0), np.sum(losses['qg'], axis=0)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\tLoss(sp loss : %.4f ; qg loss : %.4f)' \
                % (i, time.time() - start_time, sp_loss, qg_loss))

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_acc, dev_bleu = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu : %.4f)' \
                % (i, time.time() - start_time, dev_acc, dev_bleu))
            start_time = time.time()
            test_acc, test_bleu = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tSemantic Parsing (acc : %.4f)\tQuestion Generation (bleu : %.4f)' \
                % (i, time.time() - start_time, test_acc, test_bleu))

            ######################## Pick best result and save #####################
            # The two models are checkpointed independently on their own dev metrics.
            if dev_acc > self.best_result['dev_acc']:
                self.model.save_model(sp_save_dir=os.path.join(self.exp_path, 'sp_model.pkl'))
                self.best_result['iter_sp'] = i
                self.best_result['dev_acc'], self.best_result['test_acc'] = dev_acc, test_acc
                self.logger.info('NEW BEST Semantic Parsing:\tEpoch : %d\tBest Valid (acc : %.4f)\tBest Test (acc : %.4f)' \
                    % (i, dev_acc, test_acc))
            if dev_bleu >= self.best_result['dev_bleu']:
                self.model.save_model(qg_save_dir=os.path.join(self.exp_path, 'qg_model.pkl'))
                self.best_result['iter_qg'] = i
                self.best_result['dev_bleu'], self.best_result['test_bleu'] = dev_bleu, test_bleu
                self.logger.info('NEW BEST Question Generation:\tEpoch : %d\tBest Valid (bleu : %.4f)\tBest Test (bleu : %.4f)' \
                    % (i, dev_bleu, test_bleu))
            gc.collect()
            torch.cuda.empty_cache()

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST Semantic Parsing RESULT: \tEpoch : %d\tBest Valid (acc : %.4f)\tBest Test (acc : %.4f)'
            % (self.best_result['iter_sp'], self.best_result['dev_acc'], self.best_result['test_acc']))
        self.logger.info('FINAL BEST Question Generation RESULT: \tEpoch : %d\tBest Valid (bleu : %.4f)\tBest Test (bleu : %.4f)'
            % (self.best_result['iter_qg'], self.best_result['dev_bleu'], self.best_result['test_bleu']))
        self.model.load_model(os.path.join(self.exp_path, 'sp_model.pkl'), os.path.join(self.exp_path, 'qg_model.pkl'))
# coding=utf8
import os, sys, time, gc
import numpy as np
import torch
from utils.solver.solver_base import Solver
from utils.batch import get_minibatch

class LMSolver(Solver):
    '''
    For traditional RNN-based Language Model

    `side` picks which half of each example is modeled ('question' or the
    logical form side — exact values handled in utils.batch; confirm there).
    '''
    def __init__(self, *args, **kargs):
        self.side = kargs.pop('side', 'question')
        super(LMSolver, self).__init__(*args, **kargs)
        # Lower perplexity is better, hence inf initialization.
        self.best_result = {"losses": [], "iter": 0, "dev_ppl": float('inf'), "test_ppl": float('inf')}

    def decode(self, data_inputs, output_path, test_batchSize):
        """Compute corpus perplexity on data_inputs, writing per-sentence PPL to output_path."""
        data_index = np.arange(len(data_inputs))
        count, eval_loss, length_list = 0, [], []
        ########################### Evaluation Phase ############################
        self.model.eval()
        with open(output_path, 'w') as f:
            for j in range(0, len(data_index), test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, raw_inputs = get_minibatch(data_inputs, self.vocab, task='language_model',
                    data_index=data_index, index=j, batch_size=test_batchSize, device=self.device, side=self.side)
                # lens - 1: number of predicted tokens per sentence (first token is not predicted).
                length_list.extend((lens - 1).tolist())
                ########################## Calculate Sentence PPL #######################
                with torch.no_grad():
                    scores = self.model(inputs, lens) # bsize, seq_len, voc_size
                    # Next-token prediction: targets are the inputs shifted left by one.
                    batch_loss = self.loss_function(scores, inputs[:, 1:]).item()
                    eval_loss.append(batch_loss)
                    norm_log_prob = self.model.sent_logprobability(inputs, lens).cpu().tolist()

                ############################# Writing Result to File ###########################
                for idx in range(len(inputs)):
                    f.write('Utterance: ' + ' '.join(raw_inputs[idx]) + '\n')
                    f.write('NormLogProb: ' + str(norm_log_prob[idx]) + '\n')
                    current_ppl = np.exp(- norm_log_prob[idx])
                    f.write('PPL: ' + str(current_ppl) + '\n\n')

            ########################### Calculate Corpus PPL ###########################
            word_count = np.sum(length_list, axis=0)
            eval_loss = np.sum(eval_loss, axis=0)
            final_ppl = np.exp(eval_loss / word_count)
            f.write('Overall ppl: %.4f' % (final_ppl))
        return final_ppl

    def train_and_decode(self, train_inputs, dev_inputs, test_inputs, batchSize=16, test_batchSize=128, max_epoch=100):
        """Standard LM training loop; evaluates each epoch (after a 10-epoch warmup) and keeps the best-dev checkpoint."""
        train_data_index = np.arange(len(train_inputs))
        nsentences = len(train_data_index)
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(train_data_index)
            losses = []
            self.model.train()
            for j in range(0, nsentences, batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, _ = get_minibatch(train_inputs, self.vocab, task='language_model',
                    data_index=train_data_index, index=j, batch_size=batchSize, device=self.device, side=self.side)
                ############################ Forward Model ############################
                self.optimizer.zero_grad()
                batch_scores = self.model(inputs, lens)
                ############################ Loss Calculation #########################
                batch_loss = self.loss_function(batch_scores, inputs[:, 1:], lens - 1)
                losses.append(batch_loss.item())
                ########################### Backward and Optimize ######################
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss of tgt : %.5f' \
                % (i, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # whether evaluate later after training for some epochs
            if i < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_ppl = self.decode(dev_inputs, os.path.join(self.exp_path, 'valid.iter' + str(i)), test_batchSize)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tppl : %.4f' % (i, time.time() - start_time, dev_ppl))
            start_time = time.time()
            test_ppl = self.decode(test_inputs, os.path.join(self.exp_path, 'test.iter' + str(i)), test_batchSize)
            self.logger.info('Evaluation:\tEpoch : %d\tTime : %.4fs\tppl : %.4f' % (i, time.time() - start_time, test_ppl))

            ######################## Pick best result and save #####################
            if dev_ppl < self.best_result['dev_ppl']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = i
                self.best_result['dev_ppl'], self.best_result['test_ppl'] = dev_ppl, test_ppl
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid ppl : %.4f;\tBest Test ppl : %.4f' % (i, dev_ppl, test_ppl))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (ppl : %.4f)\tBest Test (ppl : %.4f) '
            % (self.best_result['iter'], self.best_result['dev_ppl'], self.best_result['test_ppl']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
#coding=utf8
import time, os, gc
from utils.solver.solver_base import Solver
from utils.example import Example
from utils.batch import get_minibatch
from utils.bleu import get_bleu_score
import numpy as np
import torch

class QGSolver(Solver):
    """Solver for question generation (logical form -> natural language question)."""

    def __init__(self, *args, **kargs):
        super(QGSolver, self).__init__(*args, **kargs)
        self.best_result = { "losses": [], "iter": 0, "dev_bleu": 0., "test_bleu": 0. }

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Generate questions for data_inputs, write them to output_path, return corpus BLEU."""
        data_index= np.arange(len(data_inputs))
        nsentences, candidate_list, references_list = len(data_index), [], []
        domain = Example.domain
        self.model.eval()
        with open(output_path, 'w') as of:
            for j in range(0, nsentences, test_batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, dec_inputs, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab, task='question_generation', data_index=data_index,
                    index=j, batch_size=test_batchSize, device=self.device, copy=self.model.copy)
                ############################ Forward Model ############################
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab.word2id, copy_tokens, beam_size=beam, n_best=n_best)
                predictions = results["predictions"]
                # Only the top hypothesis of each example is scored.
                predictions = [each[0] for each in predictions]
                predictions = domain.reverse(predictions, self.vocab.id2word, oov_list=oov_list)
                bleu_scores = domain.compare_question(predictions, raw_outputs)
                candidate_list.extend(predictions)
                references_list.extend([[ref] for ref in raw_outputs])
                ############################ Write result to file ############################
                for idx in range(len(raw_inputs)):
                    of.write("LogicalForm: " + ' '.join(raw_inputs[idx]) + '\n')
                    of.write("Target: " + ' '.join(raw_outputs[idx]) + '\n')
                    of.write("Pred0: " + ' '.join(predictions[idx]) + '\n')
                    of.write("Bleu: " + str(bleu_scores[idx]) + '\n\n')
            avg_bleu = get_bleu_score(candidate_list, references_list)
            of.write('Overall bleu is %.4f' % (avg_bleu))
        return avg_bleu

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam=5, n_best=1):
        """Supervised training loop; evaluates each epoch (after a 10-epoch warmup) and keeps the best-dev-BLEU checkpoint."""
        train_data_index = np.arange(len(train_dataset))
        nsentences = len(train_data_index)
        for i in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(train_data_index)
            losses = []
            self.model.train()
            for j in range(0, nsentences, batchSize):
                ###################### Obtain minibatch data ######################
                inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                    train_dataset, self.vocab, task='question_generation', data_index=train_data_index,
                    index=j, batch_size=batchSize, device=self.device, copy=self.model.copy)
                ############################ Forward Model ############################
                self.model.zero_grad()
                # Teacher forcing: gold prefix in, next-token targets out.
                batch_scores = self.model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                ############################ Loss Calculation #########################
                batch_loss = self.loss_function(batch_scores, dec_outputs[:, 1:], out_lens - 1)
                losses.append(batch_loss.item())
                ########################### Backward and Optimize ######################
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (i, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss: %.5f' \
                % (i, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # Skip (expensive) decoding during the first epochs.
            if i < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_bleu = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Dev Evaluation:\tEpoch : %d\tTime : %.4fs\tBleu : %.4f' \
                % (i, time.time() - start_time, dev_bleu))
            start_time = time.time()
            test_bleu = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(i)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Test Evaluation:\tEpoch : %d\tTime : %.4fs\tBleu : %.4f' \
                % (i, time.time() - start_time, test_bleu))

            ######################## Pick best result on dev and save #####################
            if dev_bleu >= self.best_result['dev_bleu']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = i
                self.best_result['dev_bleu'], self.best_result['test_bleu'] = dev_bleu, test_bleu
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid Bleu : %.4f;\tBest Test Bleu : %.4f' % (i, dev_bleu, test_bleu))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (Bleu : %.4f)\tBest Test (Bleu : %.4f)'
            % (self.best_result['iter'], self.best_result['dev_bleu'], self.best_result['test_bleu']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
class SPSolver(Solver):
    """Solver for semantic parsing: trains an utterance-to-logical-form model
    and evaluates it with logical-form accuracy."""

    def __init__(self, *args, **kargs):
        super(SPSolver, self).__init__(*args, **kargs)
        # Track per-epoch training losses and the best dev/test accuracy seen so far
        self.best_result = {"losses": [], "iter": 0, "dev_acc": 0., "test_acc": 0.}

    def decode(self, data_inputs, output_path, test_batchSize, beam=5, n_best=1):
        """Beam-search decode ``data_inputs``, write per-example results to
        ``output_path`` and return the overall accuracy."""
        order = np.arange(len(data_inputs))
        total_count = len(order)
        correct_flags = []
        domain = Example.domain
        self.model.eval()
        with open(output_path, 'w') as fout:
            for start in range(0, total_count, test_batchSize):
                # Build one evaluation minibatch
                inputs, lens, dec_inputs, _, _, copy_tokens, oov_list, (raw_inputs, raw_outputs) = get_minibatch(
                    data_inputs, self.vocab, task='semantic_parsing', data_index=order,
                    index=start, batch_size=test_batchSize, device=self.device, copy=self.model.copy)
                with torch.no_grad():
                    results = self.model.decode_batch(inputs, lens, self.vocab.lf2id, copy_tokens, beam_size=beam, n_best=n_best)
                    # Flatten the n_best hypotheses of every example into one list
                    predictions = [pred for each in results["predictions"] for pred in each]
                    predictions = domain.reverse(predictions, self.vocab.id2lf, oov_list=oov_list)
                accuracy = domain.compare_logical_form(predictions, raw_outputs, pick=True)
                correct_flags.extend(accuracy)
                # Log each example: utterance, gold logical form, n_best predictions, verdict
                for k in range(len(raw_inputs)):
                    fout.write("Utterance: " + ' '.join(raw_inputs[k]) + '\n')
                    fout.write("Target: " + ' '.join(raw_outputs[k]) + '\n')
                    # Each example owns a contiguous slice of n_best predictions
                    for rank, pred in enumerate(predictions[n_best * k: n_best * (k + 1)]):
                        fout.write("Pred" + str(rank) + ": " + ' '.join(pred) + '\n')
                    fout.write("Correct: " + ("True" if accuracy[k] == 1 else "False") + '\n\n')
            acc = sum(correct_flags) / float(len(correct_flags))
            fout.write('Overall accuracy is %.4f' % (acc))
        return acc

    def train_and_decode(self, train_dataset, dev_dataset, test_dataset, batchSize=16, test_batchSize=128,
            max_epoch=100, beam=5, n_best=1):
        """Train for ``max_epoch`` epochs; from epoch 10 onward, evaluate on
        dev/test each epoch, checkpoint on the best dev accuracy, and finally
        reload the best checkpoint."""
        order = np.arange(len(train_dataset))
        total_count = len(order)
        for epoch in range(max_epoch):
            ########################### Training Phase ############################
            start_time = time.time()
            np.random.shuffle(order)
            epoch_losses = []
            self.model.train()
            for start in range(0, total_count, batchSize):
                inputs, lens, dec_inputs, dec_outputs, out_lens, copy_tokens, _, _ = get_minibatch(
                    train_dataset, self.vocab, task='semantic_parsing', data_index=order,
                    index=start, batch_size=batchSize, device=self.device, copy=self.model.copy)
                self.model.zero_grad()
                # Teacher forcing: feed the gold prefix, score the shifted targets
                batch_scores = self.model(inputs, lens, dec_inputs[:, :-1], copy_tokens)
                batch_loss = self.loss_function(batch_scores, dec_outputs[:, 1:], out_lens - 1)
                epoch_losses.append(batch_loss.item())
                batch_loss.backward()
                self.model.pad_embedding_grad_zero()
                self.optimizer.step()

            print('[learning] epoch %i >> %3.2f%%' % (epoch, 100), 'completed in %.2f (sec) <<' % (time.time() - start_time))
            epoch_loss = np.sum(epoch_losses, axis=0)
            self.best_result['losses'].append(epoch_loss)
            self.logger.info('Training:\tEpoch : %d\tTime : %.4fs\t Loss: %.5f' \
                % (epoch, time.time() - start_time, epoch_loss))
            gc.collect()
            torch.cuda.empty_cache()

            # Skip the (slow) evaluation during the first ten warm-up epochs
            if epoch < 10:
                continue

            ########################### Evaluation Phase ############################
            start_time = time.time()
            dev_acc = self.decode(dev_dataset, os.path.join(self.exp_path, 'valid.iter' + str(epoch)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Dev Evaluation:\tEpoch : %d\tTime : %.4fs\tAcc : %.4f' \
                % (epoch, time.time() - start_time, dev_acc))
            start_time = time.time()
            test_acc = self.decode(test_dataset, os.path.join(self.exp_path, 'test.iter' + str(epoch)),
                test_batchSize, beam=beam, n_best=n_best)
            self.logger.info('Test Evaluation:\tEpoch : %d\tTime : %.4fs\tAcc : %.4f' \
                % (epoch, time.time() - start_time, test_acc))

            ######################## Pick best result on dev and save #####################
            if dev_acc >= self.best_result['dev_acc']:
                self.model.save_model(os.path.join(self.exp_path, 'model.pkl'))
                self.best_result['iter'] = epoch
                self.best_result['dev_acc'], self.best_result['test_acc'] = dev_acc, test_acc
                self.logger.info('NEW BEST:\tEpoch : %d\tBest Valid Acc : %.4f;\tBest Test Acc : %.4f' % (epoch, dev_acc, test_acc))

        ######################## Reload best model for later usage #####################
        self.logger.info('FINAL BEST RESULT: \tEpoch : %d\tBest Valid (Acc : %.4f)\tBest Test (Acc : %.4f)'
            % (self.best_result['iter'], self.best_result['dev_acc'], self.best_result['test_acc']))
        self.model.load_model(os.path.join(self.exp_path, 'model.pkl'))
4 | """ 5 | import os, sys, argparse 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | from utils.lexicon import Lexicon 8 | from utils.constants import BOS, EOS, PAD, UNK 9 | import operator 10 | 11 | def read_data(path): 12 | ex_list = [] 13 | with open(path, 'r') as infile: 14 | for line in infile: 15 | line = line.strip() 16 | if line == '': 17 | continue 18 | q, lf = line.split('\t') 19 | q = [each.strip() for each in q.strip().split(' ') if each.strip() != ''] 20 | lf = [each.strip() for each in lf.strip().split(' ') if each.strip() != ''] 21 | ex_list.append((q, lf)) 22 | return ex_list 23 | 24 | def save_vocab(idx2word, vocab_path): 25 | with open(vocab_path, 'w') as f: 26 | for idx in range(len(idx2word)): 27 | f.write(idx2word[idx] + '\n') 28 | 29 | def construct_vocab(input_seqs, mwf=1): 30 | ''' 31 | Construct vocabulary given input_seqs 32 | @params: 33 | 1. input_seqs: a list of seqs, e.g. 34 | [ ['what', 'flight'] , ['which', 'flight'] ] 35 | 2. mwf: minimum word frequency 36 | @return: 37 | 1. word2idx(dict) 38 | 2. 
idx2word(dict) 39 | ''' 40 | vocab, word2idx, idx2word = {}, {}, [] 41 | for seq in input_seqs: 42 | if type(seq) in [tuple, list]: 43 | for word in seq: 44 | if word not in vocab: 45 | vocab[word] = 1 46 | else: 47 | vocab[word] += 1 48 | else: 49 | if seq not in vocab: 50 | vocab[seq] = 1 51 | else: 52 | vocab[seq] += 1 53 | 54 | # Discard those special tokens if already exist 55 | if PAD in vocab: del vocab[PAD] 56 | if UNK in vocab: del vocab[UNK] 57 | if BOS in vocab: del vocab[BOS] 58 | if EOS in vocab: del vocab[EOS] 59 | 60 | sorted_words = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True) 61 | sorted_words = [x[0] for x in sorted_words if x[1] >= mwf] 62 | for word in sorted_words: 63 | idx = len(word2idx) 64 | word2idx[word] = idx 65 | idx2word.append(word) 66 | return word2idx, idx2word 67 | 68 | def main(args=sys.argv[1:]): 69 | """ 70 | Construct vocabulary for each dataset 71 | """ 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--dataset', default='all', help='dataset name') 74 | parser.add_argument('--mwf', type=int, default=1, help='minimum word frequency, if less than this int, not included') 75 | opt = parser.parse_args(args) 76 | all_dataset = [opt.dataset] if opt.dataset != 'all' else ['atis', 'geo', 'basketball', 'blocks', 'calendar', 'housing', 'publications', 'recipes', 'restaurants', 'socialnetwork'] 77 | 78 | for dataset in all_dataset: 79 | dirname = os.path.join('data', 'overnight') if dataset != 'atis' and dataset != 'geo' else os.path.join('data', dataset) 80 | file_path = os.path.join(dirname, dataset + '_train.tsv') 81 | word_vocab_path, lf_vocab_path, copy_vocab_path = os.path.join(dirname, dataset + '_vocab.word'), \ 82 | os.path.join(dirname, dataset + '_vocab.lf'), os.path.join(dirname, dataset + '_vocab.copy') 83 | lexicon_words = sorted(list(Lexicon(dataset).seen_words)) 84 | 85 | ex_list = read_data(file_path) 86 | questions, logical_forms = list(zip(*ex_list)) 87 | _, id2word = 
class Vocab():
    """Task-specific vocabulary holder.

    Loads the word-side and logical-form-side vocabularies of a dataset; which
    vocab files are read and whether BOS/EOS are inserted depends on the task.
    """

    def __init__(self, dataset, task='semantic_parsing', copy=False):
        super(Vocab, self).__init__()
        self.dataset = dataset
        # atis/geo live in their own folders; every other domain is under overnight
        dirname = os.path.join('data', 'overnight') if dataset != 'atis' and dataset != 'geo' else os.path.join('data', dataset)
        word_path = os.path.join(dirname, dataset + '_vocab.word')
        lf_path = os.path.join(dirname, dataset + '_vocab.lf')
        copy_path = os.path.join(dirname, dataset + '_vocab.copy')
        if task == 'semantic_parsing':
            # input side (words): no BOS/EOS; output side (lf): decoder needs BOS/EOS
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=False)
            self.lf2id, self.id2lf = self.read_vocab(lf_path, bos_eos=True)
        elif task == 'question_generation':
            # roles are reversed: words are decoded, logical forms are encoded
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=True)
            # with a copy mechanism the lf vocab also includes copyable lexicon entries
            lf_sources = (lf_path, copy_path) if copy else (lf_path,)
            self.lf2id, self.id2lf = self.read_vocab(*lf_sources, bos_eos=False)
        elif task == 'language_model':
            # both sides are generated, so both need BOS/EOS
            self.word2id, self.id2word = self.read_vocab(word_path, bos_eos=True)
            self.lf2id, self.id2lf = self.read_vocab(lf_path, bos_eos=True)
        else:
            raise ValueError('[Error]: unknown task !')

    def read_vocab(self, *args, bos_eos=True, pad=True, unk=True, separator=' : '):
        """Load one or more vocab files into a (word2idx, idx2word) pair.

        Special tokens are inserted first (in PAD, UNK, BOS, EOS order, as
        enabled); lines containing ``separator`` keep only the left-hand side;
        duplicates are skipped.
        """
        word2idx, idx2word = {}, []
        specials = []
        if pad:
            specials.append(PAD)
        if unk:
            specials.append(UNK)
        if bos_eos:
            specials.extend([BOS, EOS])
        for token in specials:
            word2idx[token] = len(word2idx)
            idx2word.append(token)
        for vocab_path in args:
            with open(vocab_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    if separator in line:
                        word, _ = line.split(separator)
                    else:
                        word = line
                    if word not in word2idx:
                        word2idx[word] = len(word2idx)
                        idx2word.append(word)
        return word2idx, idx2word
read_pretrained_vectors(word2vec_file, word2id, device) 32 | for word in pretrained_vectors: 33 | module.weight.data[word2id[word]] = pretrained_vectors[word] 34 | return len(pretrained_vectors)/float(len(word2id)) 35 | --------------------------------------------------------------------------------