22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cbert_aug
2 | This repo is deprecated; please refer to https://github.com/IIEKES/cbert_aug/ for details.
3 |
4 |
5 | This is an implementation of the paper "Conditional BERT Contextual Augmentation" (https://arxiv.org/pdf/1812.06705.pdf).
6 | Our original implementation was two-stage; for convenience, we have rewritten the code.
7 |
8 | The *global.config* file contains the global configuration for BERT and the classifier.
9 | The datasets directory contains the input files for BERT, and the aug_data directory contains the augmented files for the classifier.
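These files are sentence/label TSVs: a header line followed by tab-separated pairs. For example, the first rows of aug_data/TREC/test.tsv (the full file appears later in this dump):

```
sentence	label
How far is it from Denver to Aspen ?	5
What county is Modesto , California in ?	4
```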
10 |
11 | You can run the code in two ways; the second is our default.
12 | - You can load the original BERT and run aug_dataset_wo_ft.py directly:
13 | - python aug_dataset_wo_ft.py
14 | - Or you can fine-tune BERT on each dataset before running aug_dataset.py, and then load the fine-tuned BERT in aug_dataset.py:
15 | 1. python finetune_dataset.py
16 | 2. python aug_dataset.py
17 |
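For intuition, here is a minimal sketch of the augmentation step (compare the masked examples in aug_data/stsa.binary/run.log): some tokens are masked, and the sentence label is fed through BERT's segment (token-type) embedding, so replacements are predicted conditionally on the label. This is only an illustration built on pytorch_pretrained_bert; `augment` and its defaults are invented for the example and are not the repo's exact code.

```python
import random
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

def augment(sentence, label, mask_ratio=0.15):
    # tokenize and remember the original ids
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    # pick positions to rewrite, never the special tokens
    n_mask = max(1, int((len(tokens) - 2) * mask_ratio))
    positions = random.sample(range(1, len(tokens) - 1), n_mask)
    input_ids = list(ids)
    for pos in positions:
        input_ids[pos] = mask_id
    input_tensor = torch.tensor([input_ids])
    # the conditional part: the label plays the role of the segment id
    # (a binary task fits BERT's pretrained type_vocab_size of 2)
    segment_tensor = torch.full_like(input_tensor, label)
    with torch.no_grad():
        scores = model(input_tensor, token_type_ids=segment_tensor)
    # greedy argmax for simplicity; sampling yields more diverse sentences
    for pos in positions:
        ids[pos] = int(scores[0, pos].argmax())
    return ' '.join(tokenizer.convert_ids_to_tokens(ids)[1:-1])
```

Run as-is this matches the aug_dataset_wo_ft.py route described above (pretrained segment embeddings stand in for label embeddings); the fine-tuning route first adapts BERT to each dataset via finetune_dataset.py.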
18 | The hyperparameters of the models and training were selected by a grid search using baseline models without data augmentation, on each task's validation set individually.
19 |
20 | We show the process of augmenting stsa.binary and training the RNN classifier on it in the aug_data/stsa.binary directory.
21 |
22 | The classifier code is from ; thanks to the author.
23 |
--------------------------------------------------------------------------------
/__pycache__/args_of_text_classifier.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/args_of_text_classifier.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/evaluator.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/evaluator.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/nets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/nets.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/train_text_classifier.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/train_text_classifier.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/triggers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/triggers.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/args_of_text_classifier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 |
4 | import argparse
5 | import json
6 |
7 |
8 | def get_basic_arg_parser():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--batchsize', '-b', type=int, default=64,
11 | help='Number of examples in each mini-batch')
12 | parser.add_argument('--epoch', '-e', type=int, default=100,
13 | help='Number of sweeps over the dataset to train')
14 | parser.add_argument('--gpu', '-g', type=int, default=0,
15 | help='GPU ID (negative value indicates CPU)')
16 | parser.add_argument('--out', '-o', default='result',
17 | help='Directory to output the result')
18 | parser.add_argument('--unit', '-u', type=int, default=300,
19 | help='Number of units')
20 | parser.add_argument('--layer', '-l', type=int, default=1,
21 | help='Number of layers of RNN or MLP following CNN')
22 | parser.add_argument('--dropout', '-d', type=float, default=0.5,
23 | help='Dropout rate')
24 | parser.add_argument('--learning-rate', '-lr', type=float, default=1e-4,
25 | help='Learning rate')
26 | parser.add_argument('--dataset', '-data', default='imdb.binary',
27 | choices=['dbpedia', 'imdb.binary', 'imdb.fine',
28 | 'TREC', 'stsa.binary', 'stsa.fine',
29 | 'custrev', 'mpqa', 'rt-polarity', 'subj'],
30 | help='Name of dataset.')
31 | parser.add_argument('--model', '-model', default='cnn',
32 | choices=['cnn', 'rnn', 'bow'],
33 | help='Name of encoder model type.')
34 |
35 | parser.add_argument('--bilm', '-bilm')  # trained bidirectional LM to augment with (unset = disabled)
36 | parser.add_argument('--bilm-unit', '-bilm-u', type=int, default=1024)  # bilm hidden-unit size
37 | parser.add_argument('--bilm-layer', '-bilm-l', type=int, default=1)  # number of bilm layers
38 | parser.add_argument('--bilm-dropout', '-bilm-d', type=float, default=0.)  # bilm dropout rate
39 |
40 | parser.add_argument('--bilm-ratio', '-bilm-r', type=float, default=0.25)  # fraction of words the bilm rewrites
41 | parser.add_argument('--bilm-temp', '-bilm-t', type=float, default=1.)  # softmax temperature when sampling
42 | parser.add_argument('--bilm-mode', '-bilm-m', default='sampling',
43 | choices=['weighted_sum', 'sampling'])  # how LM predictions are applied
44 | parser.add_argument('--bilm-gumbel', action='store_true')  # Gumbel-softmax sampling
45 | parser.add_argument('--bilm-wordwise', action='store_true', default=True)  # predict replacements word by word
46 | parser.add_argument('--bilm-add-original', type=float, default=0.)  # weight for mixing in the original word
47 | parser.add_argument('--bilm-residual', type=float, default=0.,
48 | help='if not 0, (original + context) * THIS')
49 |
50 | parser.add_argument('--resume-vocab')
51 |
52 | parser.add_argument('--validation', default=True)  # note: any string passed on the CLI is also truthy
53 | parser.add_argument('--seed', type=int, default=2018)
54 | parser.add_argument('--save-model', action='store_true')
55 | parser.add_argument('--stop-epoch', type=int, default=20)
56 |
57 | parser.add_argument('--no-label', action='store_true')
58 |
59 | return parser
60 |
61 |
62 | if __name__ == '__main__':
63 | parser = get_basic_arg_parser()
64 | args = parser.parse_args()
65 | print(json.dumps(args.__dict__, indent=2))
66 |
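For reference, the parser can be reused from another script; the snippet below is illustrative (not part of the repo) and only exercises stdlib argparse behavior with an explicit argv:

```python
from args_of_text_classifier import get_basic_arg_parser

parser = get_basic_arg_parser()
# parse an explicit argv instead of sys.argv
args = parser.parse_args(['--dataset', 'stsa.binary', '--model', 'rnn', '-g', '-1'])
assert args.dataset == 'stsa.binary' and args.model == 'rnn' and args.gpu == -1
```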
--------------------------------------------------------------------------------
/aug_data/TREC/test.tsv:
--------------------------------------------------------------------------------
1 | sentence label
2 | How far is it from Denver to Aspen ? 5
3 | What county is Modesto , California in ? 4
4 | Who was Galileo ? 3
5 | What is an atom ? 0
6 | When did Hawaii become a state ? 5
7 | How tall is the Sears Building ? 5
8 | George Bush purchased a small interest in which baseball team ? 3
9 | What is Australia 's national flower ? 1
10 | Why does the moon turn orange ? 0
11 | What is autism ? 0
12 | What city had a world fair in 1900 ? 4
13 | What person 's head is on a dime ? 3
14 | What is the average weight of a Yellow Labrador ? 5
15 | Who was the first man to fly across the Pacific Ocean ? 3
16 | When did Idaho become a state ? 5
17 | What is the life expectancy for crickets ? 5
18 | What metal has the highest melting point ? 1
19 | Who developed the vaccination against polio ? 3
20 | What is epilepsy ? 0
21 | What year did the Titanic sink ? 5
22 | Who was the first American to walk in space ? 3
23 | What is a biosphere ? 0
24 | What river in the US is known as the Big Muddy ? 4
25 | What is bipolar disorder ? 0
26 | What is cholesterol ? 0
27 | Who developed the Macintosh computer ? 3
28 | What is caffeine ? 0
29 | What imaginary line is halfway between the North and South Poles ? 4
30 | Where is John Wayne airport ? 4
31 | What hemisphere is the Philippines in ? 4
32 | What is the average speed of the horses at the Kentucky Derby ? 5
33 | Where are the Rocky Mountains ? 4
34 | What are invertebrates ? 0
35 | What is the temperature at the center of the earth ? 5
36 | When did John F. Kennedy get elected as President ? 5
37 | How old was Elvis Presley when he died ? 5
38 | Where is the Orinoco River ? 4
39 | How far is the service line from the net in tennis ? 5
40 | How much fiber should you have per day ? 5
41 | How many Great Lakes are there ? 5
42 | Material called linen is made from what plant ? 1
43 | What is Teflon ? 0
44 | What is amitriptyline ? 0
45 | What is a shaman ? 0
46 | What is the proper name for a female walrus ? 1
47 | What is a group of turkeys called ? 1
48 | How long did Rip Van Winkle sleep ? 5
49 | What are triglycerides ? 0
50 | How many liters in a gallon ? 5
51 | What is the name of the chocolate company in San Francisco ? 3
52 | What are amphibians ? 0
53 | Who discovered x-rays ? 3
54 | Which comedian 's signature line is `` Can we talk '' ? 3
55 | What is fibromyalgia ? 0
56 | What is done with worn or outdated flags ? 0
57 | What does cc in engines mean ? 0
58 | When did Elvis Presley die ? 5
59 | What is the capital of Yugoslavia ? 4
60 | Where is Milan ? 4
61 | What is the speed hummingbirds fly ? 5
62 | What is the oldest city in the United States ? 4
63 | What was W.C. Fields ' real name ? 3
64 | What river flows between Fargo , North Dakota and Moorhead , Minnesota ? 4
65 | What do bats eat ? 1
66 | What state did the Battle of Bighorn take place in ? 4
67 | Who was Abraham Lincoln ? 3
68 | What do you call a newborn kangaroo ? 1
69 | What are spider veins ? 0
70 | What day and month did John Lennon die ? 5
71 | What strait separates North America from Asia ? 4
72 | What is the population of Seattle ? 5
73 | How much was a ticket for the Titanic ? 5
74 | What is the largest city in the world ? 4
75 | What American composer wrote the music for `` West Side Story '' ? 3
76 | Where is the Mall of the America ? 4
77 | What is the pH scale ? 0
78 | What type of currency is used in Australia ? 1
79 | How tall is the Gateway Arch in St. Louis , MO ? 5
80 | How much does the human adult female brain weigh ? 5
81 | Who was the first governor of Alaska ? 3
82 | What is a prism ? 0
83 | When was the first liver transplant ? 5
84 | Who was elected president of South Africa in 1994 ? 3
85 | What is the population of China ? 5
86 | When was Rosa Parks born ? 5
87 | Why is a ladybug helpful ? 0
88 | What is amoxicillin ? 0
89 | Who was the first female United States Representative ? 3
90 | What are xerophytes ? 0
91 | What country did Ponce de Leon come from ? 4
92 | The U.S. Department of Treasury first issued paper currency for the U.S. during which war ? 1
93 | What is desktop publishing ? 0
94 | What is the temperature of the sun 's surface ? 5
95 | What year did Canada join the United Nations ? 5
96 | What is the oldest university in the US ? 3
97 | Where is Prince Edward Island ? 4
98 | Mercury , what year was it discovered ? 5
99 | What is cryogenics ? 0
100 | What are coral reefs ? 0
101 | What is the longest major league baseball-winning streak ? 1
102 | What is neurology ? 0
103 | Who invented the calculator ? 3
104 | How do you measure earthquakes ? 0
105 | Who is Duke Ellington ? 3
106 | What county is Phoenix , AZ in ? 4
107 | What is a micron ? 0
108 | The sun 's core , what is the temperature ? 5
109 | What is the Ohio state bird ? 1
110 | When were William Shakespeare 's twins born ? 5
111 | What is the highest dam in the U.S. ? 4
112 | What color is a poison arrow frog ? 1
113 | What is acupuncture ? 0
114 | What is the length of the coastline of the state of Alaska ? 5
115 | What is the name of Neil Armstrong 's wife ? 3
116 | What is Hawaii 's state flower ? 1
117 | Who won Ms. American in 1989 ? 3
118 | When did the Hindenberg crash ? 5
119 | What mineral helps prevent osteoporosis ? 1
120 | What was the last year that the Chicago Cubs won the World Series ? 5
121 | Where is Perth ? 4
122 | What year did WWII begin ? 5
123 | What is the diameter of a golf ball ? 5
124 | What is an eclipse ? 0
125 | Who discovered America ? 3
126 | What is the earth 's diameter ? 5
127 | Which president was unmarried ? 3
128 | How wide is the Milky Way galaxy ? 5
129 | During which season do most thunderstorms occur ? 5
130 | What is Wimbledon ? 0
131 | What is the gestation period for a cat ? 5
132 | How far is a nautical mile ? 5
133 | Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ? 3
134 | What does target heart rate mean ? 0
135 | What was the first satellite to go into space ? 1
136 | What is foreclosure ? 0
137 | What is the major fault line near Kentucky ? 1
138 | Where is the Holland Tunnel ? 4
139 | Who wrote the hymn `` Amazing Grace '' ? 3
140 | What position did Willie Davis play in baseball ? 3
141 | What are platelets ? 0
142 | What is severance pay ? 0
143 | What is the name of Roy Roger 's dog ? 1
144 | Where are the National Archives ? 4
145 | What is a baby turkey called ? 1
146 | What is poliomyelitis ? 0
147 | What is the longest bone in the human body ? 1
148 | Who is a German philosopher ? 3
149 | What were Christopher Columbus ' three ships ? 1
150 | What does Phi Beta Kappa mean ? 0
151 | What is nicotine ? 0
152 | What is another name for vitamin B1 ? 1
153 | Who discovered radium ? 3
154 | What are sunspots ? 0
155 | When was Algeria colonized ? 5
156 | What baseball team was the first to make numbers part of their uniform ? 3
157 | What continent is Egypt on ? 4
158 | What is the capital of Mongolia ? 4
159 | What is nanotechnology ? 0
160 | In the late 1700 's British convicts were used to populate which colony ? 4
161 | What state is the geographic center of the lower 48 states ? 4
162 | What is an obtuse angle ? 0
163 | What are polymers ? 0
164 | When is hurricane season in the Caribbean ? 5
165 | Where is the volcano Mauna Loa ? 4
166 | What is another astronomic term for the Northern Lights ? 1
167 | What peninsula is Spain part of ? 4
168 | When was Lyndon B. Johnson born ? 5
169 | What is acetaminophen ? 0
170 | What state has the least amount of rain per year ? 4
171 | Who founded American Red Cross ? 3
172 | What year did the Milwaukee Braves become the Atlanta Braves ? 5
173 | How fast is alcohol absorbed ? 5
174 | When is the summer solstice ? 5
175 | What is supernova ? 0
176 | Where is the Shawnee National Forest ? 4
177 | What U.S. state 's motto is `` Live free or Die '' ? 4
178 | Where is the Lourve ? 4
179 | When was the first stamp issued ? 5
180 | What primary colors do you mix to make orange ? 1
181 | How far is Pluto from the sun ? 5
182 | What body of water are the Canary Islands in ? 4
183 | What is neuropathy ? 0
184 | Where is the Euphrates River ? 4
185 | What is cryptography ? 0
186 | What is natural gas composed of ? 1
187 | Who is the Prime Minister of Canada ? 3
188 | What French ruler was defeated at the battle of Waterloo ? 3
189 | What is leukemia ? 0
190 | Where did Howard Hughes die ? 4
191 | What is the birthstone for June ? 1
192 | What is the sales tax in Minnesota ? 1
193 | What is the distance in miles from the earth to the sun ? 5
194 | What is the average life span for a chicken ? 5
195 | When was the first Wal-Mart store opened ? 5
196 | What is relative humidity ? 0
197 | What city has the zip code of 35824 ? 4
198 | What currency is used in Algeria ? 1
199 | Who invented the hula hoop ? 3
200 | What was the most popular toy in 1957 ? 1
201 | What is pastrami made of ? 1
202 | What is the name of the satellite that the Soviet Union sent into space in 1957 ? 1
203 | What city 's newspaper is called `` The Enquirer '' ? 4
204 | Who invented the slinky ? 3
205 | What are the animals that don 't have backbones called ? 1
206 | What is the melting point of copper ? 5
207 | Where is the volcano Olympus Mons located ? 4
208 | Who was the 23rd president of the United States ? 3
209 | What is the average body temperature ? 5
210 | What does a defibrillator do ? 0
211 | What is the effect of acid rain ? 0
212 | What year did the United States abolish the draft ? 5
213 | How fast is the speed of light ? 5
214 | What province is Montreal in ? 4
215 | What New York City structure is also known as the Twin Towers ? 4
216 | What is fungus ? 0
217 | What is the most frequently spoken language in the Netherlands ? 1
218 | What is sodium chloride ? 0
219 | What are the spots on dominoes called ? 1
220 | How many pounds in a ton ? 5
221 | What is influenza ? 0
222 | What is ozone depletion ? 0
223 | What year was the Mona Lisa painted ? 5
224 | What does `` Sitting Shiva '' mean ? 0
225 | What is the electrical output in Madrid , Spain ? 1
226 | Which mountain range in North America stretches from Maine to Georgia ? 4
227 | What is plastic made of ? 1
228 | What is the population of Nigeria ? 5
229 | What does your spleen do ? 0
230 | Where is the Grand Canyon ? 4
231 | Who invented the telephone ? 3
232 | What year did the U.S. buy Alaska ? 5
233 | What is the name of the leader of Ireland ? 3
234 | What is phenylalanine ? 0
235 | How many gallons of water are there in a cubic foot ? 5
236 | What are the two houses of the Legislative branch ? 1
237 | What is sonar ? 0
238 | In Poland , where do most people live ? 4
239 | What is phosphorus ? 0
240 | What is the location of the Sea of Tranquility ? 4
241 | How fast is sound ? 5
242 | What French province is cognac produced in ? 4
243 | What is Valentine 's Day ? 0
244 | What causes gray hair ? 0
245 | What is hypertension ? 0
246 | What is bandwidth ? 0
247 | What is the longest suspension bridge in the U.S. ? 4
248 | What is a parasite ? 0
249 | What is home equity ? 0
250 | What do meteorologists do ? 0
251 | What is the criterion for being legally blind ? 1
252 | Who is the tallest man in the world ? 3
253 | What are the twin cities ? 4
254 | What did Edward Binney and Howard Smith invent in 1903 ? 1
255 | What is the statue of liberty made of ? 1
256 | What is pilates ? 0
257 | What planet is known as the `` red '' planet ? 4
258 | What is the depth of the Nile river ? 5
259 | What is the colorful Korean traditional dress called ? 1
260 | What is Mardi Gras ? 0
261 | Mexican pesos are worth what in U.S. dollars ? 5
262 | Who was the first African American to play for the Brooklyn Dodgers ? 3
263 | Who was the first Prime Minister of Canada ? 3
264 | How many Admirals are there in the U.S. Navy ? 5
265 | What instrument did Glenn Miller play ? 1
266 | How old was Joan of Arc when she died ? 5
267 | What does the word fortnight mean ? 0
268 | What is dianetics ? 0
269 | What is the capital of Ethiopia ? 4
270 | For how long is an elephant pregnant ? 5
271 | How did Janice Joplin die ? 0
272 | What is the primary language in Iceland ? 1
273 | What is the difference between AM radio stations and FM radio stations ? 0
274 | What is osteoporosis ? 0
275 | Who was the first woman governor in the U.S. ? 3
276 | What is peyote ? 0
277 | What is the esophagus used for ? 0
278 | What is viscosity ? 0
279 | What year did Oklahoma become a state ? 5
280 | What is the abbreviation for Texas ? 2
281 | What is a mirror made out of ? 1
282 | Where on the body is a mortarboard worn ? 4
283 | What was J.F.K. 's wife 's name ? 3
284 | What does I.V. stand for ? 2
285 | What is the chunnel ? 0
286 | Where is Hitler buried ? 4
287 | What are antacids ? 0
288 | What is pulmonary fibrosis ? 0
289 | What are Quaaludes ? 0
290 | What is naproxen ? 0
291 | What is strep throat ? 0
292 | What is the largest city in the U.S. ? 4
293 | What is foot and mouth disease ? 1
294 | What is the life expectancy of a dollar bill ? 5
295 | What do you call a professional map drawer ? 1
296 | What are Aborigines ? 0
297 | What is hybridization ? 0
298 | What color is indigo ? 1
299 | How old do you have to be in order to rent a car in Italy ? 5
300 | What does a barometer measure ? 1
301 | What color is a giraffe 's tongue ? 1
302 | What does USPS stand for ? 2
303 | What year did the NFL go on strike ? 5
304 | What is solar wind ? 0
305 | What date did Neil Armstrong land on the moon ? 5
306 | When was Hiroshima bombed ? 5
307 | Where is the Savannah River ? 4
308 | Who was the first woman killed in the Vietnam War ? 3
309 | What planet has the strongest magnetic field of all the planets ? 4
310 | Who is the governor of Alaska ? 3
311 | What year did Mussolini seize power in Italy ? 5
312 | What is the capital of Persia ? 4
313 | Where is the Eiffel Tower ? 4
314 | How many hearts does an octopus have ? 5
315 | What is pneumonia ? 0
316 | What is the deepest lake in the US ? 4
317 | What is a fuel cell ? 0
318 | Who was the first U.S. president to appear on TV ? 3
319 | Where is the Little League Museum ? 4
320 | What are the two types of twins ? 1
321 | What is the brightest star ? 4
322 | What is diabetes ? 0
323 | When was President Kennedy shot ? 5
324 | What is TMJ ? 2
325 | What color is yak milk ? 1
326 | What date was Dwight D. Eisenhower born ? 5
327 | What does the technical term ISDN mean ? 2
328 | Why is the sun yellow ? 0
329 | What is the conversion rate between dollars and pounds ? 5
330 | When was Abraham Lincoln born ? 5
331 | What is the Milky Way ? 0
332 | What is mold ? 0
333 | What year was Mozart born ? 5
334 | What is a group of frogs called ? 1
335 | What is the name of William Penn 's ship ? 1
336 | What is the melting point of gold ? 5
337 | What is the street address of the White House ? 4
338 | What is semolina ? 0
339 | What fruit is Melba sauce made from ? 1
340 | What is Ursa Major ? 0
341 | What is the percentage of water content in the human body ? 5
342 | How much does water weigh ? 5
343 | What was President Lyndon Johnson 's reform program called ? 1
344 | What is the murder rate in Windsor , Ontario ? 5
345 | Who is the only president to serve 2 non-consecutive terms ? 3
346 | What is the population of Australia ? 5
347 | Who painted the ceiling of the Sistine Chapel ? 3
348 | Name a stimulant . 1
349 | What is the effect of volcanoes on the climate ? 0
350 | What year did the Andy Griffith show begin ? 5
351 | What is acid rain ? 0
352 | What is the date of Mexico 's independence ? 5
353 | What is the location of Lake Champlain ? 4
354 | What is the Illinois state flower ? 1
355 | What is Maryland 's state bird ? 1
356 | What is quicksilver ? 0
357 | Who wrote `` The Divine Comedy '' ? 3
358 | What is the speed of light ? 5
359 | What is the width of a football field ? 5
360 | Why in tennis are zero points called love ? 0
361 | What kind of dog was Toto in the Wizard of Oz ? 1
362 | What is a thyroid ? 0
363 | What does ciao mean ? 0
364 | What is the only artery that carries blue blood from the heart to the lungs ? 1
365 | How often does Old Faithful erupt at Yellowstone National Park ? 5
366 | What is acetic acid ? 0
367 | What is the elevation of St. Louis , MO ? 5
368 | What color does litmus paper turn when it comes into contact with a strong acid ? 1
369 | What are the colors of the German flag ? 1
370 | What is the Moulin Rouge ? 0
371 | What soviet seaport is on the Black Sea ? 4
372 | What is the atomic weight of silver ? 5
373 | What currency do they use in Brazil ? 1
374 | What are pathogens ? 0
375 | What is mad cow disease ? 0
376 | Name a food high in zinc . 1
377 | When did North Carolina enter the union ? 5
378 | Where do apple snails live ? 4
379 | What are ethics ? 0
380 | What does CPR stand for ? 2
381 | What is an annuity ? 0
382 | Who killed John F. Kennedy ? 3
383 | Who was the first vice president of the U.S. ? 3
384 | What birthstone is turquoise ? 1
385 | Who was the first US President to ride in an automobile to his inauguration ? 3
386 | How old was the youngest president of the United States ? 5
387 | When was Ulysses S. Grant born ? 5
388 | What is Muscular Dystrophy ? 0
389 | Who lived in the Neuschwanstein castle ? 3
390 | What is propylene glycol ? 0
391 | What is a panic disorder ? 0
392 | Who invented the instant Polaroid camera ? 3
393 | What is a carcinogen ? 0
394 | What is a baby lion called ? 1
395 | What is the world 's population ? 5
396 | What is nepotism ? 0
397 | What is die-casting ? 0
398 | What is myopia ? 0
399 | What is the sales tax rate in New York ? 5
400 | Developing nations comprise what percentage of the world 's population ? 5
401 | What is the fourth highest mountain in the world ? 4
402 | What is Shakespeare 's nickname ? 3
403 | What is the heaviest naturally occurring element ? 1
404 | When is Father 's Day ? 5
405 | What does the acronym NASA stand for ? 2
406 | How long is the Columbia River in miles ? 5
407 | What city 's newspaper is called `` The Star '' ? 4
408 | What is carbon dioxide ? 0
409 | Where is the Mason/Dixon line ? 4
410 | When was the Boston tea party ? 5
411 | What is metabolism ? 0
412 | Which U.S.A. president appeared on `` Laugh-In '' ? 3
413 | What are cigarettes made of ? 1
414 | What is the capital of Zimbabwe ? 4
415 | What does NASA stand for ? 2
416 | What is the state flower of Michigan ? 1
417 | What are semiconductors ? 0
418 | What is nuclear power ? 0
419 | What is a tsunami ? 0
420 | Who is the congressman from state of Texas on the armed forces committee ? 3
421 | Who was president in 1913 ? 3
422 | When was the first kidney transplant ? 5
423 | What are Canada 's two territories ? 4
424 | What was the name of the plane Lindbergh flew solo across the Atlantic ? 1
425 | What is genocide ? 0
426 | What continent is Argentina on ? 4
427 | What monastery was raided by Vikings in the late eighth century ? 1
428 | What is an earthquake ? 0
429 | Where is the tallest roller coaster located ? 4
430 | What are enzymes ? 0
431 | Who discovered oxygen ? 3
432 | What is bangers and mash ? 0
433 | What is the name given to the Tiger at Louisiana State University ? 1
434 | Where are the British crown jewels kept ? 4
435 | Who was the first person to reach the North Pole ? 3
436 | What is an ulcer ? 0
437 | What is vertigo ? 0
438 | What is the spirometer test ? 0
439 | When is the official first day of summer ? 5
440 | What does the abbreviation SOS mean ? 2
441 | What is the smallest bird in Britain ? 1
442 | Who invented Trivial Pursuit ? 3
443 | What gasses are in the troposphere ? 1
444 | Which country has the most water pollution ? 4
445 | What is the scientific name for elephant ? 1
446 | Who is the actress known for her role in the movie `` Gypsy '' ? 3
447 | What breed of hunting dog did the Beverly Hillbillies own ? 1
448 | What is the rainiest place on Earth ? 4
449 | Who was the first African American to win the Nobel Prize in literature ? 3
450 | When is St. Patrick 's Day ? 5
451 | What was FDR 's dog 's name ? 1
452 | What colors need to be mixed to get the color pink ? 1
453 | What is the most popular sport in Japan ? 1
454 | What is the active ingredient in baking soda ? 1
455 | When was Thomas Jefferson born ? 5
456 | How cold should a refrigerator be ? 5
457 | When was the telephone invented ? 5
458 | What is the most common eye color ? 1
459 | Where was the first golf course in the United States ? 4
460 | What is schizophrenia ? 0
461 | What is angiotensin ? 0
462 | What did Jesse Jackson organize ? 3
463 | What is New York 's state bird ? 1
464 | What is the National Park in Utah ? 4
465 | What is Susan B. Anthony 's birthday ? 5
466 | In which state would you find the Catskill Mountains ? 4
467 | What do you call a word that is spelled the same backwards and forwards ? 1
468 | What are pediatricians ? 0
469 | What chain store is headquartered in Bentonville , Arkansas ? 3
470 | What are solar cells ? 0
471 | What is compounded interest ? 0
472 | What are capers ? 0
473 | What is an antigen ? 0
474 | What currency does Luxembourg use ? 1
475 | What is the population of Venezuela ? 5
476 | What type of polymer is used for bulletproof vests ? 1
477 | What currency does Argentina use ? 1
478 | What is a thermometer ? 0
479 | What Canadian city has the largest population ? 4
480 | What color are crickets ? 1
481 | Which country gave New York the Statue of Liberty ? 4
482 | What was the name of the first U.S. satellite sent into space ? 1
483 | What precious stone is a form of pure carbon ? 1
484 | What kind of gas is in a fluorescent bulb ? 1
485 | What is rheumatoid arthritis ? 0
486 | What river runs through Rowe , Italy ? 4
487 | What is cerebral palsy ? 0
488 | What city is also known as `` The Gateway to the West '' ? 4
489 | How far away is the moon ? 5
490 | What is the source of natural gas ? 1
491 | In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ? 1
492 | What is pectin ? 0
493 | What is bio-diversity ? 0
494 | What 's the easiest way to remove wallpaper ? 1
495 | What year did the Titanic start on its journey ? 5
496 | How much of an apple is water ? 5
497 | Who was the 22nd President of the US ? 3
498 | What is the money they use in Zambia ? 1
499 | How many feet in a mile ? 5
500 | What is the birthstone of October ? 1
501 | What is e-coli ? 0
502 |
--------------------------------------------------------------------------------
/aug_data/mpqa/dev.tsv:
--------------------------------------------------------------------------------
1 | sentence label
2 | unfettered support 1
3 | such a mindset will impair 0
4 | has revealed george bush's talents as a war leader 1
5 | provide support to the us , as it did 1
6 | kill 0
7 | play with 0
8 | embark on trampling upon human rights of civilians 0
9 | in such a complex state 0
10 | pursue 0
11 | will continue to support 1
12 | places strains 0
13 | wish 1
14 | subjected 0
15 | benefits 0
16 | highly respected 1
17 | this agreement 1
18 | under the strong influence 0
19 | preach 1
20 | hardest hit 0
21 | would be easier 1
22 | will be irresponsible 0
23 | conspicuous policy of playing games 0
24 | to agree 1
25 | to ask 1
26 | was very strong about this 1
27 | were even more full of praise 1
28 | can not be taken seriously 0
29 | felt gratified and relieved 1
30 | invited 1
31 | languages without nuances 0
32 | scheme 0
33 | incapacity to put crime under control 0
34 | it is pure rhetoric 0
35 | charged 0
36 | attacks 0
37 | anger 0
38 | vowed 1
39 | ridiculous 0
40 | are ready 1
41 | were accused 0
42 | will of 1
43 | unheeded 0
44 | opposes and rejects 0
45 | outcry 0
46 | join forces 1
47 | ca n't be burying its head in the sand 0
48 | insidious 0
49 | would not accept 0
50 | too costly to meet 0
51 | should not 0
52 | does not reflect 0
53 | invited 1
54 | top priority of 1
55 | only benefits corporate america 0
56 | still 0
57 | support 1
58 | not intent on revitalizing its economies 0
59 | angry anti - us protests erupted 0
60 | hopes 1
61 | m feeling much better 1
62 | branded 0
63 | the danger 0
64 | proposal to reunify 1
65 | nothing seemed more normal than to settle in someone else's territory 0
66 | to blame 0
67 | have been growing less visionary 0
68 | will fight on 0
69 | shortcomings 0
70 | deadly 0
71 | would incur its displeasure 0
72 | like a dangerous virus 0
73 | it is a futile illusion because it is a lie 0
74 | are irritated 0
75 | in a way our government has not been 0
76 | neither good nor bad , just incorrigible 0
77 | destroyed 0
78 | respecting 1
79 | to promote 1
80 | witch - hunting 0
81 | biggest terrorist country 0
82 | legitimate dissent 1
83 | even against vehicles transporting pregnant women in labor and against unarmed citizens 0
84 | niceties 1
85 | his ship has been sailing towards a wrong direction for far too long 0
86 | critics 0
87 | once again turn its back 0
88 | growing unrest 0
89 | defects 0
90 | has no love lost 0
91 | violent protests 0
92 | confined to minute cells 0
93 | immediately 1
94 | corrupt politicians 0
95 | a crime against humanity 0
96 | the rights and privileges that they would automatically get under geneva convention 0
97 | perils of delusion 0
98 | would be better off 1
99 | have been concerned 0
100 | uproar 0
101 | pained him 0
102 | terrorism 0
103 | not only prejudice 0
104 | a brutal occupation 0
105 | meddling 0
106 | falls at a bad time 0
107 | ugliest crimes 0
108 | picking on 0
109 | allegations 0
110 | are being well treated 1
111 | the serious crisis 0
112 | will be a monumental tragedy 0
113 | is proud 1
114 | thus 0
115 | oh , my god 0
116 | will not allow 0
117 | the worst crisis 0
118 | turn its back 0
119 | deepened 0
120 | will complain 0
121 | very hard line 0
122 | opposition 0
123 | would not dilute 1
124 | secret 0
125 | opposition 0
126 | illusory 0
127 | not adequately free and fair 0
128 | should follow 1
129 | impossible 0
130 | an aberration that goes against mankind 0
131 | have preference for 1
132 | all around us 0
133 | very , very large 1
134 | lie 0
135 | were ill - received 0
136 | difficult 0
137 | and then assert the laws of war do not apply 0
138 | from upholding religious and cultural values 1
139 | preferred 1
140 | full support 1
141 | criticizing 0
142 | immediate exploitation 0
143 | ignored 0
144 | has unashamedly seen fit 1
145 | support 1
146 | now a reality 1
147 | gaffe 0
148 | virtually nothing 0
149 | no place in israel can be considered safe 0
150 | to invite 1
151 | prevent such a development 1
152 | rigged 0
153 | logical solution 1
154 | bungled its dealings 0
155 | demerits 0
156 | like the suffering 0
157 | unacceptable 0
158 | supports 1
159 | 100 percent 1
160 | does not appear 0
161 | pretext 0
162 | what begins to happen when we hate our friends 0
163 | denies 0
164 | intimidation 0
165 | children skipping school 0
166 | criticism 0
167 | violence and intimidation 0
168 | blatantly obstructed 0
169 | reactions of repulsion 0
170 | are not happy 0
171 | criticized 0
172 | a growing impression that misuari could pose a security threat 0
173 | left behind 1
174 | they have not succeeded , and will never succeed 1
175 | it's shameful 0
176 | beyond the reach of any legal regime 0
177 | is pessimistic 0
178 | has no right 0
179 | are masterminded 0
180 | regret 0
181 | will weaken soon 0
182 | terrible tragedy 0
183 | there is no alternative to it but conflict , isolation , nationalism , and ultimately war 0
184 | freak show 0
185 | insisted 1
186 | axis of evil 0
187 | come under fire 0
188 | threats expressed by 0
189 | the support 1
190 | wrong 0
191 | has resisted 0
192 | the darkest hour is always before the dawn 1
193 | the entire palestinian people 0
194 | denounced 0
195 | very serious threat 0
196 | only 0
197 | making a spectacle of 0
198 | promote meetings 1
199 | to mould the electoral process in his favour 0
200 | not a single day would pass in peace and with no palestinians shedding blood 0
201 | crises 0
202 | hoping 1
203 | to intimidate women and children 0
204 | disputes 0
205 | extreme right 0
206 | most dangerous 0
207 | may even get better 0
208 | promise 1
209 | has pledged 1
210 | one body with two heads 0
211 | feel at ease 1
212 | turning a blind eye 0
213 | has floundered 0
214 | warns 0
215 | most cogent argument 1
216 | lump 0
217 | deals a blow 0
218 | bad treatment 0
219 | concerned 0
220 | have criticized 0
221 | you can hardly speak of a targeting error 0
222 | certain countries resort to 0
223 | called for 1
224 | violating human rights 0
225 | unusual 0
226 | would undermine 0
227 | a terrorist act 0
228 | threat 0
229 | objectives 1
230 | can be abated 1
231 | muscle - flexing 0
232 | pursued 1
233 | brilliant 1
234 | committed 1
235 | committing themselves 1
236 | support 1
237 | the importance of china 1
238 | only one single 0
239 | hoped 1
240 | closed ranks behind 1
241 | peace 1
242 | would not be a bad idea 1
243 | legitimate 1
244 | closer to that of cowboys than to a civilized mentality 0
245 | such fruitful results 1
246 | double crime 0
247 | perfectly at ease 1
248 | mistake 0
249 | repeatedly accused 0
250 | ceased to be a soldier of the fatherland 0
251 | however 0
252 | apocalyptic savagery 0
253 | a border of peace and good neighbourliness 1
254 | call for 1
255 | hurt 0
256 | declined to endorse 0
257 | countries such as iran , iraq and north korea represent an ``axis of evil 0
258 | more serious 0
259 | am confident 1
260 | picking on 0
261 | whether 0
262 | go out into the streets to defend 1
263 | committed one more mistake 0
264 | the agreement 1
265 | charging 0
266 | other aggressive acts against lebanon 0
267 | enjoying 1
268 | decided 0
269 | hard - line 0
270 | feels 0
271 | do n't want 0
272 | the threats launched 0
273 | rejected 0
274 | breaking fundamental concepts 0
275 | increasingly tyrannical 0
276 | undisguised declaration of war and a rhetoric threatening aggression 0
277 | want 1
278 | hyperbole , 0
279 | patronizing 0
280 | arbitrary arrests 0
281 | jeopardy 0
282 | could not be said to adequately comply 0
283 | with good economic management , israel should have little trouble 1
284 | terrorist allies 0
285 | swift criticism from 0
286 | instead 0
287 | steering the economy into disaster 0
288 | grew so unhappy 0
289 | will not resort 0
290 | has refused 0
291 | may not be feasible 0
292 | danger of being shelved altogether 0
293 | unmanageable 0
294 | favouring independence 1
295 | misrule 0
296 | misery 0
297 | steering the nation to prosperity and progress 1
298 | america's biding 0
299 | compounded the problem 0
300 | designed to benefit mugabe 0
301 | supposed to be 0
302 | appraised 1
303 | stand beside right and justice 1
304 | refuses 0
305 | surged 1
306 | denied 0
307 | fake imposter 0
308 | great evil on open display 0
309 | ideal , sunny clime 1
310 | seems to be determined to expand the scope of the anti - terror war 0
311 | a body blow 0
312 | as full citizens in the same sate 1
313 | lecturing 0
314 | suffered 0
315 | harmed 0
316 | criticized 0
317 | scores of 0
318 | ignore the consequences 0
319 | territorial ambition 0
320 | not 0
321 | no one has the right to wage war against the history of this nation 0
322 | wants 1
323 | hatred 0
324 | what occured in the united states on 11 september 0
325 | oneupmanship 0
326 | labeling 0
327 | showed little - disguised irritation 0
328 | concern 0
329 | even 0
330 | the possibility of a democratic , stable and prosperous 1
331 | strongly criticized and condemned 0
332 | will not admit 0
333 | persuade 1
334 | was more than confident 1
335 | had not heeded 0
336 | cost - effective 1
337 | mistake 0
338 | would adhere to a pluralistic vision 1
339 | had asked 1
340 | replete with 1
341 | shameful 0
342 | will only 0
343 | accidental slight 0
344 | nothing whatsoever 0
345 | could no longer tolerate 0
346 | leeway 1
347 | the doubts 0
348 | aggressions against 0
349 | raving 0
350 | are blamed by 0
351 | so many uncertainties 0
352 | destined to collapse 1
353 | confidence crisis 0
354 | are accusing 0
355 | warned 0
356 | such pessimism 0
357 | anxiety 0
358 | cause a rift 0
359 | no politically prudent 0
360 | held out an olive branch 1
361 | continues to demolish 0
362 | does not endorse 0
363 | failed 1
364 | comprehensive destructive policy 0
365 | sounds clumsy 0
366 | fails to meet the standard of being free and fair 0
367 | willfulness 1
368 | tough policy 0
369 | has not satisfied 0
370 | extremist inclinations 0
371 | unilateral 0
372 | the iranians have not done what the pakistan government has done 0
373 | because 0
374 | boycotted 0
375 | without just compensation 0
376 | desperation of the people 0
377 | damage 0
378 | dominating the world 0
379 | sought 1
380 | fearing 0
381 | that concessions are sufficiently concrete 1
382 | put his nation first 1
383 | expresses the concern 0
384 | can contaminate 0
385 | to put it mildly 0
386 | will do its utmost 1
387 | endorsed 1
388 | will surely be on the president's lips 0
389 | with typical understatement 0
390 | lambasted 0
391 | affirmed 1
392 | vitriol 0
393 | in compliance 0
394 | numerous serious abuses 0
395 | asked 1
396 | confidence 1
397 | played the same tactic again 0
398 | are enthusiastic 1
399 | valued ally and friend 1
400 | so - called 0
401 | are regarding 0
402 | running out of meaningful options 0
403 | can not accept 0
404 | is concerned 0
405 | further criticism 0
406 | of all the unconscionable things 0
407 | longest destructive war 0
408 | plight of 0
409 | can no longer stand shoulder to shoulder 0
410 | it would be madness to hold elections now 0
411 | should be looking up smiling 1
412 | if one goes by the us logic , only the us spy plane is permitted to spy at other people's doorsteps 0
413 | begins to bear fruit 1
414 | rigged 0
415 | serious division 0
416 | enjoys no social base 0
417 | very constructive 1
418 | grateful 1
419 | turned thugs 0
420 | was unconstitutional 0
421 | scares away 0
422 | declining to endorse 0
423 | violate human rights 0
424 | accused 0
425 | negatively 0
426 | due regard 1
427 | simplistic 0
428 | will help renew 1
429 | succumbing 0
430 | peace to prevail 1
431 | despite all of this , however 0
432 | proves this beyond the shadow of a doubt 1
433 | is accused 0
434 | intolerant 0
435 | the visit has achieved positive and successful results 1
436 | wantonly infringing 0
437 | assassin 0
438 | continue to obstruct 0
439 | interests 1
440 | was a strong desire among 1
441 | to denounce 0
442 | left no avenue unexplored 1
443 | endorsed 1
444 | would also ensure a durable peace 1
445 | as a vehicle for increasing personal popularity 1
446 | would have been sending a very bad signal 0
447 | continues to refuse 0
448 | pursuit 0
449 | committing themselves to peace 1
450 | accuses 0
451 | as if they were quarries 0
452 | far worse than those prevailing in camp x - ray 0
453 | inhumanely 0
454 | appropriate 1
455 | poor 0
456 | good education system 1
457 | illegal 0
458 | devastating 0
459 | to create the impression 0
460 | exacerbating 0
461 | faltered as a result of israel's intransigence 0
462 | his career was like a rough sea , with highs and lows and never calm 0
463 | generally approved of 1
464 | deviant 0
465 | without trying to maneuver , place obstacles , or set impossible conditions 0
466 | the fire is raging at home 0
467 | lacks credibility and can not withstand any objective scrutiny 0
468 | endorsed 1
469 | assassinate innocent activists and citizens 0
470 | than they deserve 0
471 | refusal to respect its obligations 0
472 | would not 0
473 | for the first time 1
474 | beautiful historic coincidence 1
475 | hardly elastic enough 0
476 | approve 1
477 | hope 1
478 | was so hard on 0
479 | most serious consequences 0
480 | will have trouble wriggling out of the need to explain and justify 0
481 | not completely reliable 0
482 | democracy exhausted all its generosity 0
483 | relatively calm 1
484 | are extremely concerned 0
485 | bringing an end to terrorism and the taliban 1
486 | reject 0
487 | stand firm 1
488 | new political bogey 0
489 | abstractly recommend 1
490 | was worried 0
491 | hardline 0
492 | crash course 0
493 | recognition 1
494 | repeated denunciations 0
495 | hope 1
496 | carnage 0
497 | threats 0
498 | swapping the silver - visored helmet of a space cadet for the green eyeshade of a consummate bean counter 0
499 | to voice his concern 0
500 | significant 0
501 | can bring security and stability to all the parties without exception 1
502 | would consider 1
503 | be reproached 0
504 | or so it claims 0
505 | harboring serious doubts 0
506 | if it becomes more of a nuisance 0
507 | need to establish a just peace 1
508 | immediate support 1
509 | discrediting 0
510 | picking a quarrel 0
511 | bridle the israeli oppressive activity 1
512 | the euro , our currency 1
513 | edifying photograph 1
514 | wrong judgment 0
515 | preservation of global peace and security 1
516 | sharply questioned 0
517 | denied 0
518 | respect 1
519 | unfeasible 0
520 | has promised 1
521 | fierce demonstrations 0
522 | to thwart 1
523 | disrespect of advice 0
524 | war on 0
525 | as the saying goes , when you pull up the turnip , mud comes with it 0
526 | smiling 1
527 | judgment 0
528 | an axis of evil 0
529 | criticizes 0
530 | costly burden 0
531 | detrimental 0
532 | earned eternal notoriety 0
533 | making liberal use of the but construction 0
534 | plotting 0
535 | financial disaster 0
536 | would be regarded 0
537 | parasitic economies 0
538 | wanted 1
539 | more demagogy than arguments 0
540 | israel's superior ability to punish palestinians 0
541 | flirting with 0
542 | was perceived 0
543 | what is europe 0
544 | the best role model 1
545 | inaccurate and fabricated 0
546 | recognition 1
547 | axis of evil rhetoric 0
548 | lost their illegitimate interests 0
549 | not a man who likes to improvise 0
550 | is criticized 0
551 | can trust 1
552 | foresaw 0
553 | heresy 0
554 | him has not been pretty 0
555 | demonstrations and rallies against 0
556 | axis of evil 0
557 | cold war heritage 0
558 | possible regression 0
559 | will guarantee 0
560 | came out in protest 0
561 | progressive 1
562 | peaceful protests 0
563 | difficulty , of course 0
564 | put on a spectacle 0
565 | dogma 0
566 | devastating what remains 0
567 | is stiffening in its attitude toward 0
568 | stuck for 18 months on ground zero 0
569 | put most of the blame 0
570 | very large 0
571 | once again 0
572 | it is almost impossible 0
573 | has criticised 0
574 | axis of evil 0
575 | humiliation of 0
576 | calm 1
577 | are tired 0
578 | enjoy 1
579 | wanted 1
580 | has blamed 0
581 | unprecedented force 0
582 | provocative 0
583 | far preferable 1
584 | repeated warnings 0
585 | neither free nor fair 0
586 | backing 1
587 | widespread debates against 0
588 | using violence , intimidation , special laws and dirty tricks to fix the two - day election 0
589 | continue to obstruct 0
590 | seeking 1
591 | biased attitude 0
592 | long neglected 0
593 | success of western europe after world war ii laid in its creation of the good neighborly field 1
594 | even risking his own future 0
595 | accused 0
596 | denied 0
597 | desperate tableau 0
598 | could n't wait 1
599 | blessed 1
600 | nuclear club 0
601 | most important 1
602 | wants 0
603 | questioning 0
604 | no one is listening 0
605 | complicates any situation 0
606 | who were expelled from their homeland 0
607 | distortion of reality 0
608 | had the advantage for washington of weakening opec 1
609 | the support 1
610 | may god be satisfied with him 1
611 | continue to rise 0
612 | will not be able lay claim to a worthy place in the civilized world 0
613 | has to be maintained at any cost 0
614 | thugs 0
615 | apprehensions 0
616 | want 1
617 | recommendations 1
618 | one of the few enlightened officials 1
619 | democratic achievements 1
620 | tensions between 0
621 | see 0
622 | infuriating 0
623 | first organized protest 0
624 | feared by 0
625 | ignored 0
626 | believe 0
627 | particular concern is raised 0
628 | main problems are in the economic area 0
629 | damages the credibility 0
630 | declining to comply 0
631 | blamed 0
632 | allegedly threatening 0
633 | dismisses 0
634 | the unconditional support 1
635 | not satisfactory 0
636 | negated 0
637 | berating 0
638 | would remake venezuela to benefit the poor 1
639 | `world judge of human rights' 0
640 | were only to be expected 0
641 | cooperation 1
642 | issuing a letter of objection 0
643 | would support wholeheartedly 1
644 | giving him medium approval 1
645 | at a loss 0
646 | dwindling moral values 0
647 | is self - inflicted 0
648 | if it is successful 0
649 | impose 0
650 | eradication 0
651 | feared 0
652 | scupper any hope 0
653 | backing away 0
654 | held 0
655 | will have no universally - acknowledged 0
656 | did not show the slightest sympathy , still less the least regret 0
657 | advanced a mendacious critique 0
658 | disputes between 0
659 | encouraged 1
660 | another setback for the imf 0
661 | wanted 1
662 | clear priority 1
663 | crimes of war 0
664 | destroyed 0
665 | warned 0
666 | decided to back 1
667 | violates the united nations charter 0
668 | interfere 0
669 | have n't already violated 0
670 | defenceless 0
671 | was effective especially 1
672 | is accusing 0
673 | a man blinded by power 0
674 | backing 1
675 | it's really strange 0
676 | his favourite newfie 0
677 | mercurial strongman 0
678 | that four shots would solve the problem 0
679 | would not accept 0
680 | there is no reason for it to be impossible 1
681 | poor response 0
682 | aspirations 1
683 | to the healthy development of sino - us relations 1
684 | resolute commitment 1
685 | democracy 1
686 | the concern 0
687 | only wants 1
688 | needlessly 0
689 | secretly behind every local misfortune 0
690 | positive 1
691 | is disgusted 0
692 | high degree of difficulty 0
693 | improvement 1
694 | spurned 0
695 | has denounced 0
696 | axis of evil theory 0
697 | extensive support 1
698 | worst 0
699 | its selfishness 0
700 | ambition 1
701 | find no grounds for any support 0
702 | achieving better results 1
703 | no tears will be shed in this corner 0
704 | is a telling example of a new - and perhaps risky - approach 0
705 | will become one of the best elements 1
706 | was also quite naive 0
707 | the criticism 0
708 | desire to work 1
709 | for the sake of peace 1
710 | double standard 0
711 | pretended 0
712 | alarm 0
713 | discontent 0
714 | furthermore 0
715 | massacring thousands of innocent people 0
716 | immense gulf between 0
717 | still wants 1
718 | basically sound 1
719 | repeatedly threatened 0
720 | mendacious 0
721 | beyond reproach 1
722 | but 0
723 | neat stuff 1
724 | purposely play up 0
725 | blatant campaign of intimidation 0
726 | disappointment 0
727 | provoked 0
728 | intends to 0
729 | worried 0
730 | real danger 0
731 | supported 1
732 | support 1
733 | were plotting 0
734 | agreed 1
735 | most realistic 1
736 | violating 0
737 | value sharing 1
738 | to its knees 0
739 | to request 1
740 | get out ! . 0
741 | the axis of evil 0
742 | assassins , assassins , assassins 0
743 | is the appropriate one 1
744 | resistance 1
745 | indulging in blood - shed and their lunaticism 0
746 | desperate 0
747 | to describe 0
748 | can ask 1
749 | know 0
750 | invited 1
751 | to express satisfaction 1
752 | harmonious and close 1
753 | support 1
754 | has been inclined toward 1
755 | incited 0
756 | was like giving away the country to its former colonial masters 0
757 | the destruction 0
758 | already volatile 0
759 | overwhelming evidence 0
760 | imperative for harmonious society 1
761 | nor does it seem proper 0
762 | storming palestinian territories 0
763 | is reflected 1
764 | expressed satisfaction 1
765 | intends 1
766 | ruined 0
767 | made public a blacklist 0
768 | those who seek to attack and destroy law and order and legitimate government 0
769 | unlawful 0
770 | freely 1
771 | reassurances 1
772 | agreeable 1
773 | dont want 0
774 | did not agree 0
775 | to deny 0
776 | perception 0
777 | had particularly harsh words for 0
778 | want 1
779 | the brutality with which this closure was implemented was unheard of 0
780 | friendship 1
781 | axis of evil 0
782 | supported 1
783 | slaughter of more than 0
784 | will be asked 1
785 | just , legitimate rights 1
786 | condemned 0
787 | thanked 1
788 | unstable situation 0
789 | merely an interlude 0
790 | depends on 0
791 | has been efficient enough 1
792 | grand 1
793 | any step backward for democracy 0
794 | quite supportive 1
795 | great leader 1
796 | would help 1
797 | formed a close friendship 1
798 | uphold 1
799 | is feared by 0
800 | used to boast 1
801 | regarded 1
802 | would violate international standards prohibiting cruel , inhuman or degrading treatment 0
803 | convince the americans that the bush administration has not only succeeded in removing a regime of religious madmen 0
804 | has put west asia on the brink of another war 0
805 | has also backed 1
806 | attacks 0
807 | banned 0
808 | dishonoring 0
809 | sent congratulations 1
810 | thought 0
811 | may be disconcerted 0
812 | concern 0
813 | will be inviting 1
814 | have to bash 0
815 | plan 1
816 | guaranteed 1
817 | the mistake is to assume 0
818 | insists 1
819 | sticking to the polluting policies 0
820 | can be difficult 0
821 | can prevent 1
822 | extremely dangerous 0
823 | would beg for surrender 0
824 | protesting 0
825 | proclaimed 1
826 | to applaud everything 1
827 | the most ideal way 1
828 | accused 0
829 | what fate awaits argentina's crisis - stricken economy 0
830 | especially those who are dangerous and suicidal 0
831 | has failed for a decade now 0
832 | had likely deteriorated beyond repair 0
833 | attempts to suppress 0
834 | endorse 1
835 | if there were serious problems 0
836 | designed to achieve an outcome - power at all costs 0
837 | criticism of 0
838 | everything 0
839 | so that we would not become terrorists 1
840 | higher transparency 1
841 | warning 0
842 | does not justify all the means 0
843 | the further 0
844 | artificial obstacles 0
845 | would improve 0
846 | has now made himself heard 1
847 | openly invited 1
848 | argued 0
849 | decisions 0
850 | admittedly 1
851 | still prefer 1
852 | had planned 0
853 | sacrifice himself 0
854 | very pleasant for 1
855 | prevented 0
856 | favorable opinions 1
857 | no one we know to have planned such deeds will escape 0
858 | to express openly dissenting political and religious views 0
859 | confidence 1
860 | no business 0
861 | has provoked concern 0
862 | had aligned against 0
863 | under the pretext of fighting terrorism 0
864 | violation of the palestinian people's human rights 0
865 | understanding and approval 1
866 | concerns 0
867 | universal character of the prophets 1
868 | force himself 0
869 | getting mixed up with crime , drugs , and violence 0
870 | bold and fair 1
871 | like an upholder of justice 1
872 | factually inaccurate 0
873 | criticism 0
874 | was shocked 0
875 | saw 1
876 | had refused to accept 0
877 | would make it possible 1
878 | domino effect 0
879 | palestinian hand that is stretched for peace 0
880 | has refused to bow 0
881 | despite the perils 0
882 | will be instrumental 1
883 | concerns 0
884 | those who are really behind it all , those who are behind this business , use the imf 0
885 | accommodate 1
886 | doubts 0
887 | slammed 0
888 | will seek 1
889 | grieving 0
890 | called for 1
891 | isolated and frustrated 0
892 | in the name of self - righteousness 0
893 | intensify and accelerate 0
894 | keeping a wary eye 0
895 | took to the streets 0
896 | gave free rein 0
897 | to rid 1
898 | wanted 1
899 | betrayal 0
900 | the branding 0
901 | collapse 0
902 | i lost any sense of dignity 0
903 | massive intimidatory show of force that was evident in the deployment of security forces 0
904 | unpopular system 0
905 | regarded 0
906 | has posed serious threats 0
907 | to draw up an action campaign aimed 0
908 | illegally 0
909 | reputation was ruined 0
910 | the opposition 0
911 | cracks are appearing 0
912 | could fray the bilateral goodwill 0
913 | was deemed 0
914 | has renounced 0
915 | the cia would organize coups and assassinations to further us interests 0
916 | the definition of a democratic regime is subjected to double standards 0
917 | strike 0
918 | challenged 0
919 | sympathy 0
920 | rubbed their palms at length 1
921 | shook 0
922 | sided with 1
923 | trying to move the goal posts 0
924 | lose popular support among 0
925 | will only invite worse criticism and rejection 0
926 | axis of evil 0
927 | absolutely not permitted 0
928 | found little to object to 0
929 | justify 1
930 | extension of its characteristic policy of hegemony 0
931 | unrealistic 0
932 | warned against 0
933 | crippled 0
934 | well 1
935 | it is ineffective 0
936 | is now promoting 1
937 | called 0
938 | in spite of the good offices 0
939 | genuinely 1
940 | for more than two years 0
941 | tried to encourage 1
942 | loathsome action 0
943 | reckless 0
944 | and killed 0
945 | likening 0
946 | will now agree 1
947 | exalted 1
948 | would like 1
949 | hoped 1
950 | even more dismal 0
951 | gives new life 1
952 | rejected 0
953 | predatory 0
954 | viciously spoke ill 0
955 | in protest against 0
956 | was remarkable 1
957 |
--------------------------------------------------------------------------------
/aug_data/stsa.binary/run.log:
--------------------------------------------------------------------------------
1 | /home/xgg/anaconda3/bin/python /home/xgg/pros/cbert_aug/aug_dataset_wo_ft.py
2 | Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
3 | /home/xgg/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
4 | from ._conv import register_converters as _register_converters
5 | Namespace(bert_model='bert-base-uncased', data_dir='datasets', do_lower_case=False, learning_rate=5e-05, max_seq_length=64, num_train_epochs=6.0, output_dir='aug_data', seed=42, task_name='stsa.binary', train_batch_size=32, warmup_proportion=0.1)
6 | 03/08/2019 21:02:11 - INFO - pytorch_pretrained_bert.modeling - Model config {
7 | "attention_probs_dropout_prob": 0.1,
8 | "hidden_act": "gelu",
9 | "hidden_dropout_prob": 0.1,
10 | "hidden_size": 768,
11 | "initializer_range": 0.02,
12 | "intermediate_size": 3072,
13 | "max_position_embeddings": 512,
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "type_vocab_size": 2,
17 | "vocab_size": 30522
18 | }
19 |
20 | 03/08/2019 21:02:14 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
21 | 03/08/2019 21:02:15 - INFO - __main__ - *** Example ***
22 | 03/08/2019 21:02:15 - INFO - __main__ - guid: train-1
23 | 03/08/2019 21:02:15 - INFO - __main__ - tokens: [CLS] an un ##int ##ent ##ional ##ly surreal kid ' s picture . . . in which actors in bad bear suits en ##act a sort of inter - species parody of a vh1 behind the music episode . [SEP]
24 | 03/08/2019 21:02:15 - INFO - __main__ - init_ids: 101 2019 4895 18447 4765 19301 2135 16524 4845 1005 1055 3861 1012 1012 1012 1999 2029 5889 1999 2919 4562 11072 4372 18908 1037 4066 1997 6970 1011 2427 12354 1997 1037 26365 2369 1996 2189 2792 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 | 03/08/2019 21:02:15 - INFO - __main__ - input_ids: 101 2019 4895 18447 4765 103 2135 103 103 1005 1055 3861 1012 1012 1012 6970 103 5889 1999 2919 4562 11072 4372 18908 1037 4066 1997 6970 1011 2427 12354 1997 1037 26365 2369 1996 2189 26365 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
26 | 03/08/2019 21:02:15 - INFO - __main__ - input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
27 | 03/08/2019 21:02:15 - INFO - __main__ - segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
28 | 03/08/2019 21:02:15 - INFO - __main__ - masked_lm_labels: -1 -1 -1 -1 -1 19301 -1 16524 4845 -1 -1 -1 -1 -1 -1 1999 2029 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 2792 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
29 | 03/08/2019 21:02:15 - INFO - __main__ - *** Example ***
30 | 03/08/2019 21:02:15 - INFO - __main__ - guid: train-2
31 | 03/08/2019 21:02:15 - INFO - __main__ - tokens: [CLS] the kind of film that leaves you scratching your head in amazement over the fact that so many talented people could participate in such an ill - advised and poorly executed idea . [SEP]
32 | 03/08/2019 21:02:15 - INFO - __main__ - init_ids: 101 1996 2785 1997 2143 2008 3727 2017 20291 2115 2132 1999 21606 2058 1996 2755 2008 2061 2116 10904 2111 2071 5589 1999 2107 2019 5665 1011 9449 1998 9996 6472 2801 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
33 | 03/08/2019 21:02:15 - INFO - __main__ - input_ids: 101 1996 2785 1997 2143 2008 3727 2017 20291 2115 2132 1999 103 103 1996 2755 2008 2061 2116 10904 103 2071 5589 1999 2107 2019 5665 1011 9449 6472 9996 103 2801 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
34 | 03/08/2019 21:02:15 - INFO - __main__ - input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
35 | 03/08/2019 21:02:15 - INFO - __main__ - segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
36 | 03/08/2019 21:02:15 - INFO - __main__ - masked_lm_labels: -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 21606 2058 -1 -1 -1 -1 -1 -1 2111 -1 -1 -1 -1 -1 -1 -1 -1 1998 -1 6472 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
37 | 03/08/2019 21:02:17 - INFO - __main__ - ***** Running training *****
38 | 03/08/2019 21:02:17 - INFO - __main__ - Num examples = 6228
39 | 03/08/2019 21:02:17 - INFO - __main__ - Batch size = 32
40 | 03/08/2019 21:02:17 - INFO - __main__ - Num steps = 1167
41 | constract vocabulary based on frequency
42 | # train data: 6228
43 | # test data: 692
44 | # vocab: 14830
45 | # class: 2
46 | Epoch: 0%| | 0/6 [00:00, ?it/s]before augment best acc:0.800578
47 | avg_loss: 2.7279621171951294
48 | avg_loss: 2.595244951248169
49 | avg_loss: 2.626865839958191
50 | constract vocabulary based on frequency
51 | # train data: 18613
52 | # test data: 692
53 | # vocab: 17639
54 | # class: 2
55 | epoch 0 augment best acc:0.793353
56 | Epoch: 17%|█▋ | 1/6 [04:22<21:51, 262.37s/it]avg_loss: 1.199854292869568
57 | avg_loss: 1.1750281822681428
58 | avg_loss: 1.1697524082660675
59 | constract vocabulary based on frequency
60 | # train data: 18691
61 | # test data: 692
62 | # vocab: 17694
63 | # class: 2
64 | Epoch: 33%|███▎ | 2/6 [09:19<18:38, 279.67s/it]epoch 1 augment best acc:0.807803
65 | avg_loss: 0.4066984289884567
66 | avg_loss: 0.4212127384543419
67 | avg_loss: 0.4228825390338898
68 | constract vocabulary based on frequency
69 | # train data: 18742
70 | # test data: 692
71 | # vocab: 17793
72 | # class: 2
73 | Epoch: 50%|█████ | 3/6 [14:44<14:44, 294.70s/it]epoch 2 augment best acc:0.809249
74 | avg_loss: 0.14582043968141079
75 | avg_loss: 0.1362319701910019
76 | avg_loss: 0.15393256649374962
77 | constract vocabulary based on frequency
78 | # train data: 18690
79 | # test data: 692
80 | # vocab: 17819
81 | # class: 2
82 | Epoch: 67%|██████▋ | 4/6 [20:21<10:10, 305.26s/it]epoch 3 augment best acc:0.806358
83 | avg_loss: 0.06510618031024933
84 | avg_loss: 0.07271562956273556
85 | avg_loss: 0.06607807904481888
86 | constract vocabulary based on frequency
87 | # train data: 18618
88 | # test data: 692
89 | # vocab: 17798
90 | # class: 2
91 | epoch 4 augment best acc:0.802023
92 | Epoch: 83%|████████▎ | 5/6 [25:01<05:00, 300.36s/it]avg_loss: 0.04483885794878006
93 | avg_loss: 0.046800237521529196
94 | avg_loss: 0.046726834438741205
95 | constract vocabulary based on frequency
96 | # train data: 18682
97 | # test data: 692
98 | # vocab: 17758
99 | # class: 2
100 | epoch 5 augment best acc:0.825145
101 | Epoch: 100%|██████████| 6/6 [30:26<00:00, 304.43s/it]
102 |
103 | Process finished with exit code 0
104 |
--------------------------------------------------------------------------------
/aug_dataset.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import csv
6 | import os
7 | import shutil
8 | import logging
9 | import argparse
10 | import random
11 | from tqdm import tqdm, trange
12 | import json
13 |
14 | import numpy as np
15 | import torch
16 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
17 | from torch.utils.data.distributed import DistributedSampler
18 |
19 | from pytorch_pretrained_bert.tokenization import BertTokenizer
20 | from pytorch_pretrained_bert.modeling import BertForMaskedLM
21 | from pytorch_pretrained_bert.optimization import BertAdam
22 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
23 |
24 | import train_text_classifier
25 |
26 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
27 | datefmt='%m/%d/%Y %H:%M:%S',
28 | level=logging.INFO)
29 | logger = logging.getLogger(__name__)
30 |
31 |
32 | class DataProcessor(object):
33 | """Base class for data converters for sequence classification data sets."""
34 |
35 | def get_train_examples(self, data_dir):
36 | """Gets a collection of `InputExample`s for the train set."""
37 | raise NotImplementedError()
38 |
39 | def get_dev_examples(self, data_dir):
40 | """Gets a collection of `InputExample`s for the dev set."""
41 | raise NotImplementedError()
42 |
43 | def get_labels(self):
44 | """Gets the list of labels for this data set."""
45 | raise NotImplementedError()
46 |
47 | @classmethod
48 | def _read_tsv(cls, input_file, quotechar=None):
49 | """Reads a tab separated value file."""
50 | with open(input_file, "r") as f:
51 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
52 | lines = []
53 | for line in reader:
54 | lines.append(line)
55 | return lines
56 |
57 |
58 | class InputExample(object):
59 | """A single training/test example for simple sequence classification."""
60 |
61 | def __init__(self, guid, text_a, label=None):
62 |         """Constructs an InputExample.
63 |
64 | Args:
65 | guid: Unique id for the example.
66 | text_a: string. The untokenized text of the first sequence. For single
67 | sequence tasks, only this sequence must be specified.
68 | """
69 | self.guid = guid
70 | self.text_a = text_a
71 | self.label = label
72 |
73 |
74 | class InputFeatures(object):
75 | """A single set of features of data."""
76 |
77 | def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels):
78 | self.init_ids = init_ids
79 | self.input_ids = input_ids
80 | self.input_mask = input_mask
81 | self.segment_ids = segment_ids
82 | self.masked_lm_labels = masked_lm_labels
83 |
84 | class AugProcessor(DataProcessor):
85 | """Processor for dataset to be augmented."""
86 |
87 | def get_train_examples(self, data_dir):
88 | """See base class."""
89 | return self._create_examples(
90 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
91 |
92 | def get_dev_examples(self, data_dir):
93 | """See base class."""
94 | return self._create_examples(
95 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
96 |
97 | def get_labels(self, name):
98 | """add your dataset here"""
99 | if name in ['stsa.binary', 'mpqa', 'rt-polarity', 'subj']:
100 | return ["0", "1"]
101 | elif name in ['stsa.fine']:
102 | return ["0", "1", "2", "3", "4"]
103 | elif name in ['TREC']:
104 | return ["0", "1", "2", "3", "4", "5"]
105 |
106 | def _create_examples(self, lines, set_type):
107 | """Creates examples for the training and dev sets."""
108 | examples = []
109 | for (i, line) in enumerate(lines):
110 | if i == 0:
111 | continue
112 | guid = "%s-%s" % (set_type, i)
113 | text_a = line[0]
114 | label = line[-1]
115 | examples.append(
116 | InputExample(guid=guid, text_a=text_a, label=label))
117 | return examples
118 |
119 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
120 |     """Loads a data file into a list of `InputFeatures`."""
121 |
122 | label_map = {}
123 | for (i, label) in enumerate(label_list):
124 | label_map[label] = i
125 |
126 | features = []
127 | # ----
128 | dupe_factor = 5
129 | masked_lm_prob = 0.15
130 | max_predictions_per_seq = 20
131 | rng = random.Random(12345)
132 |
133 | for (ex_index, example) in enumerate(examples):
134 | tokens_a = tokenizer.tokenize(example.text_a)
135 | segment_id = label_map[example.label]
136 | # Account for [CLS] and [SEP] with "- 2"
137 | if len(tokens_a) > max_seq_length - 2:
138 | tokens_a = tokens_a[0:(max_seq_length - 2)]
139 |
140 |         # Since we use conditional BERT, we place the label information in segment_ids
141 | tokens = []
142 | segment_ids = []
143 |         # are [CLS] and [SEP] needed here?
144 | tokens.append("[CLS]")
145 | segment_ids.append(segment_id)
146 | for token in tokens_a:
147 | tokens.append(token)
148 | segment_ids.append(segment_id)
149 | tokens.append("[SEP]")
150 | segment_ids.append(segment_id)
151 | masked_lm_labels = [-1] * max_seq_length
152 |
153 | cand_indexes = []
154 | for (i, token) in enumerate(tokens):
155 | if token == "[CLS]" or token == "[SEP]":
156 | continue
157 | cand_indexes.append(i)
158 |
159 | rng.shuffle(cand_indexes)
160 | len_cand = len(cand_indexes)
161 |
162 | output_tokens = list(tokens)
163 |
164 | num_to_predict = min(max_predictions_per_seq,
165 | max(1, int(round(len(tokens) * masked_lm_prob))))
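    |         # e.g. a 40-token sequence gives round(40 * 0.15) = 6 positions to mask, capped at 20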
166 |
167 | masked_lms_pos = []
168 | covered_indexes = set()
169 | for index in cand_indexes:
170 | if len(masked_lms_pos) >= num_to_predict:
171 | break
172 | if index in covered_indexes:
173 | continue
174 | covered_indexes.add(index)
175 |
176 | masked_token = None
177 | # 80% of the time, replace with [MASK]
178 | if rng.random() < 0.8:
179 | masked_token = "[MASK]"
180 | else:
181 | # 10% of the time, keep original
182 | if rng.random() < 0.5:
183 | masked_token = tokens[index]
184 | # 10% of the time, replace with random word
185 | else:
186 | masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]]
187 |
188 | masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0]
189 | output_tokens[index] = masked_token
190 | masked_lms_pos.append(index)
191 |
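    |         # positions left untouched keep label -1 and are ignored by the MLM loss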
192 | init_ids = tokenizer.convert_tokens_to_ids(tokens)
193 | input_ids = tokenizer.convert_tokens_to_ids(output_tokens)
194 |
195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
196 | # tokens are attended to.
197 | input_mask = [1] * len(input_ids)
198 |
199 | # Zero-pad up to the sequence length.
200 | while len(input_ids) < max_seq_length:
201 | init_ids.append(0)
202 | input_ids.append(0)
203 | input_mask.append(0)
204 |             segment_ids.append(0)  # padding positions are masked out, so segment_id 0 is safe
205 |
206 | assert len(init_ids) == max_seq_length
207 | assert len(input_ids) == max_seq_length
208 | assert len(input_mask) == max_seq_length
209 | assert len(segment_ids) == max_seq_length
210 |
211 | if ex_index < 2:
212 | logger.info("*** Example ***")
213 | logger.info("guid: %s" % (example.guid))
214 | logger.info("tokens: %s" % " ".join(
215 | [str(x) for x in tokens]))
216 | logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids]))
217 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
218 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
219 | logger.info(
220 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
221 | logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))
222 |
223 | features.append(
224 | InputFeatures(init_ids=init_ids,
225 | input_ids=input_ids,
226 | input_mask=input_mask,
227 | segment_ids=segment_ids,
228 | masked_lm_labels=masked_lm_labels))
229 | return features
230 |
231 | def rev_wordpiece(tokens):
232 |     """Merge WordPiece '##' pieces back into words, dropping [PAD] and the [CLS]/[SEP] wrappers."""
233 |     if len(tokens) > 1:
234 |         for i in range(len(tokens)-1, 0, -1):
235 |             if tokens[i] == '[PAD]':
236 |                 del tokens[i]
237 |             elif tokens[i].startswith('##'):
238 |                 tokens[i-1] += tokens[i][2:]
239 |                 del tokens[i]
240 |     return " ".join(tokens[1:-1])
241 |
242 | def main():
243 | parser = argparse.ArgumentParser()
244 |
245 | ## Required parameters
246 | parser.add_argument("--data_dir", default="datasets", type=str,
247 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
248 | parser.add_argument("--output_dir", default="aug_data", type=str,
249 | help="The output dir for augmented dataset")
250 | parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
251 | help="The path of pretrained bert model.")
252 |     parser.add_argument("--task_name", default="subj", type=str,
253 | help="The name of the task to train.")
254 | parser.add_argument("--max_seq_length", default=64, type=int,
255 | help="The maximum total input sequence length after WordPiece tokenization. \n"
256 | "Sequences longer than this will be truncated, and sequences shorter \n"
257 | "than this will be padded.")
258 | parser.add_argument("--do_lower_case", default=False, action='store_true',
259 | help="Set this flag if you are using an uncased model.")
260 | parser.add_argument("--train_batch_size", default=32, type=int,
261 | help="Total batch size for training.")
262 | parser.add_argument("--learning_rate", default=5e-5, type=float,
263 | help="The initial learning rate for Adam.")
264 | parser.add_argument("--num_train_epochs", default=9.0, type=float,
265 | help="Total number of training epochs to perform.")
266 | parser.add_argument("--warmup_proportion", default=0.1, type=float,
267 | help="Proportion of training to perform linear learning rate warmup for. "
268 | "E.g., 0.1 = 10%% of training.")
269 | parser.add_argument('--seed', type=int, default=42,
270 | help="random seed for initialization")
271 |
272 | args = parser.parse_args()
273 | with open("global.config", 'rb') as f:
274 | configs_dict = json.load(f)
275 |
276 | args.task_name = configs_dict.get("dataset")
277 | print(args)
278 | run_aug(args, save_every_epoch=False)
279 |
280 |
281 | def run_aug(args, save_every_epoch=False):
282 | processors = {
283 |         # you can add your processor here
284 | "TREC": AugProcessor,
285 | "stsa.fine": AugProcessor,
286 | "stsa.binary": AugProcessor,
287 | "mpqa": AugProcessor,
288 | "rt-polarity": AugProcessor,
289 | "subj": AugProcessor,
290 | }
291 |
292 | task_name = args.task_name
293 | if task_name not in processors:
294 | raise ValueError("Task not found: %s" % (task_name))
295 | args.data_dir = os.path.join(args.data_dir, task_name)
296 | args.output_dir = os.path.join(args.output_dir, task_name)
297 |
298 | random.seed(args.seed)
299 | np.random.seed(args.seed)
300 | torch.manual_seed(args.seed)
301 | os.makedirs(args.output_dir, exist_ok=True)
302 | processor = processors[task_name]()
303 | label_list = processor.get_labels(task_name)
304 |
305 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
306 |
307 | train_examples = None
308 | num_train_steps = None
309 | train_examples = processor.get_train_examples(args.data_dir)
310 | #dev_examples = processor.get_dev_examples(args.data_dir)
311 | #train_examples.extend(dev_examples)
312 | num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
313 |
314 | # Prepare model
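    |     # load the conditional BERT saved by finetune_dataset.py (the epoch-10 checkpoint)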
315 | def load_model(model_name):
316 | weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name)
317 | model = torch.load(weights_path)
318 | return model
319 | cbert_name = "{}/BertForMaskedLM_{}_epoch_10".format(task_name.lower(), task_name.lower())
320 | model = load_model(cbert_name)
321 | model.cuda()
322 |
323 | # Prepare optimizer
324 | param_optimizer = list(model.named_parameters())
325 | no_decay = ['bias', 'gamma', 'beta']
326 | optimizer_grouped_parameters = [
327 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
328 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
329 | ]
330 | t_total = num_train_steps
331 |     optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
332 |                          warmup=args.warmup_proportion, t_total=t_total)
333 |
334 | global_step = 0
335 | train_features = convert_examples_to_features(
336 | train_examples, label_list, args.max_seq_length, tokenizer)
337 | logger.info("***** Running training *****")
338 | logger.info(" Num examples = %d", len(train_examples))
339 | logger.info(" Batch size = %d", args.train_batch_size)
340 | logger.info(" Num steps = %d", num_train_steps)
341 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long)
342 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
343 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
344 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
345 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)
346 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels)
347 | train_sampler = RandomSampler(train_data)
348 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
349 |
350 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
351 | if not os.path.exists(save_model_dir):
352 | os.mkdir(save_model_dir)
353 | MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
354 |
355 | origin_train_path = os.path.join(args.output_dir, "train_origin.tsv")
356 | save_train_path = os.path.join(args.output_dir, "train.tsv")
357 | shutil.copy(origin_train_path, save_train_path)
358 | best_test_acc = train_text_classifier.train("aug_data")
359 | print("before augment best acc:{}".format(best_test_acc))
360 |
361 | for e in trange(int(args.num_train_epochs), desc="Epoch"):
362 | avg_loss = 0.
363 |
364 | for step, batch in enumerate(train_dataloader):
365 | model.train()
366 | batch = tuple(t.cuda() for t in batch)
367 | _, input_ids, input_mask, segment_ids, masked_ids = batch
368 | loss = model(input_ids, segment_ids, input_mask, masked_ids)
369 | loss.backward()
370 | avg_loss += loss.item()
371 | optimizer.step()
372 | model.zero_grad()
373 | if (step + 1) % 50 == 0:
374 | print("avg_loss: {}".format(avg_loss / 50))
375 | avg_loss = 0
376 | torch.cuda.empty_cache()
377 | shutil.copy(origin_train_path, save_train_path)
378 | save_train_file = open(save_train_path, 'a')
379 | tsv_writer = csv.writer(save_train_file, delimiter='\t')
380 | #tsv_writer.writerow(['sentence', 'label'])
381 | for step, batch in enumerate(train_dataloader):
382 | model.eval()
383 | batch = tuple(t.cuda() for t in batch)
384 | init_ids, _, input_mask, segment_ids, _ = batch
385 | input_lens = [sum(mask).item() for mask in input_mask]
386 | #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens])
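    |             # mask max(len//7, 2) randomly chosen positions in each sentence (repeats possible)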
387 | masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens])
388 | for ids, idx in zip(init_ids,masked_idx):
389 | ids[idx] = MASK_id
390 | predictions = model(init_ids, segment_ids, input_mask)
391 | for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids):
392 | #pred = torch.argsort(pred)[:,-e-1][idx]
393 | '''
394 | pred = torch.argsort(preds)[:,-1][idx]
395 | ids[idx] = pred
396 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
397 | new_str = rev_wordpiece(new_str)
398 | tsv_writer.writerow([new_str, seg[0].item()])
399 | '''
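    |                 # take the second-highest-scoring token at each masked position so the
    |                 # output usually differs from the original (top-1 is often the original token)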
400 | pred = torch.argsort(preds)[:, -2][idx]
401 | ids[idx] = pred
402 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
403 | new_str = rev_wordpiece(new_str)
404 | tsv_writer.writerow([new_str, seg[0].item()])
405 | torch.cuda.empty_cache()
406 | predictions = predictions.detach().cpu()
407 |             torch.cuda.empty_cache()
    |         save_train_file.close()  # flush the augmented rows before the classifier re-reads train.tsv
408 |         bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e))
409 | shutil.copy(save_train_path, bak_train_path)
410 | best_test_acc = train_text_classifier.train("aug_data")
411 | print("epoch {} augment best acc:{}".format(e, best_test_acc))
412 | if save_every_epoch:
413 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
414 | save_model_path = os.path.join(save_model_dir, save_model_name)
415 | torch.save(model, save_model_path)
416 | else:
417 | if (e + 1) % 10 == 0:
418 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
419 | save_model_path = os.path.join(save_model_dir, save_model_name)
420 | torch.save(model, save_model_path)
421 |
422 |
423 | if __name__ == "__main__":
424 | main()
425 |
--------------------------------------------------------------------------------
/aug_dataset_wo_ft.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import csv
6 | import os
7 | import shutil
8 | import logging
9 | import argparse
10 | import random
11 | from tqdm import tqdm, trange
12 | import json
13 |
14 | import numpy as np
15 | import torch
16 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
17 | from torch.utils.data.distributed import DistributedSampler
18 |
19 | from pytorch_pretrained_bert.tokenization import BertTokenizer
20 | from pytorch_pretrained_bert.modeling import BertForMaskedLM
21 | from pytorch_pretrained_bert.optimization import BertAdam
22 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
23 |
24 | import train_text_classifier
25 |
26 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
27 | datefmt='%m/%d/%Y %H:%M:%S',
28 | level=logging.INFO)
29 | logger = logging.getLogger(__name__)
30 |
31 |
32 | class DataProcessor(object):
33 | """Base class for data converters for sequence classification data sets."""
34 |
35 | def get_train_examples(self, data_dir):
36 | """Gets a collection of `InputExample`s for the train set."""
37 | raise NotImplementedError()
38 |
39 | def get_dev_examples(self, data_dir):
40 | """Gets a collection of `InputExample`s for the dev set."""
41 | raise NotImplementedError()
42 |
43 | def get_labels(self):
44 | """Gets the list of labels for this data set."""
45 | raise NotImplementedError()
46 |
47 | @classmethod
48 | def _read_tsv(cls, input_file, quotechar=None):
49 | """Reads a tab separated value file."""
50 | with open(input_file, "r") as f:
51 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
52 | lines = []
53 | for line in reader:
54 | lines.append(line)
55 | return lines
56 |
57 |
58 | class InputExample(object):
59 | """A single training/test example for simple sequence classification."""
60 |
61 | def __init__(self, guid, text_a, label=None):
62 |         """Constructs an InputExample.
63 |
64 | Args:
65 | guid: Unique id for the example.
66 | text_a: string. The untokenized text of the first sequence. For single
67 | sequence tasks, only this sequence must be specified.
68 | """
69 | self.guid = guid
70 | self.text_a = text_a
71 | self.label = label
72 |
73 |
74 | class InputFeatures(object):
75 | """A single set of features of data."""
76 |
77 | def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels):
78 | self.init_ids = init_ids
79 | self.input_ids = input_ids
80 | self.input_mask = input_mask
81 | self.segment_ids = segment_ids
82 | self.masked_lm_labels = masked_lm_labels
83 |
84 | class AugProcessor(DataProcessor):
85 | """Processor for dataset to be augmented."""
86 |
87 | def get_train_examples(self, data_dir):
88 | """See base class."""
89 | return self._create_examples(
90 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
91 |
92 | def get_dev_examples(self, data_dir):
93 | """See base class."""
94 | return self._create_examples(
95 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
96 |
97 | def get_labels(self, name):
98 | """add your dataset here"""
99 | if name in ['stsa.binary', 'mpqa', 'rt-polarity', 'subj']:
100 | return ["0", "1"]
101 | elif name in ['stsa.fine']:
102 | return ["0", "1", "2", "3", "4"]
103 | elif name in ['TREC']:
104 | return ["0", "1", "2", "3", "4", "5"]
105 |
106 | def _create_examples(self, lines, set_type):
107 | """Creates examples for the training and dev sets."""
108 | examples = []
109 | for (i, line) in enumerate(lines):
110 | if i == 0:
111 | continue
112 | guid = "%s-%s" % (set_type, i)
113 | text_a = line[0]
114 | label = line[-1]
115 | examples.append(
116 | InputExample(guid=guid, text_a=text_a, label=label))
117 | return examples
118 |
119 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
120 |     """Loads a data file into a list of `InputFeatures`."""
121 |
122 | label_map = {}
123 | for (i, label) in enumerate(label_list):
124 | label_map[label] = i
125 |
126 | features = []
127 | # ----
128 | dupe_factor = 5
129 | masked_lm_prob = 0.15
130 | max_predictions_per_seq = 20
131 | rng = random.Random(12345)
132 |
133 | for (ex_index, example) in enumerate(examples):
134 | tokens_a = tokenizer.tokenize(example.text_a)
135 | segment_id = label_map[example.label]
136 | # Account for [CLS] and [SEP] with "- 2"
137 | if len(tokens_a) > max_seq_length - 2:
138 | tokens_a = tokens_a[0:(max_seq_length - 2)]
139 |
140 |         # Since this is a conditional MLM, the label is needed; we place it in segment_ids
141 | tokens = []
142 | segment_ids = []
143 |         # are [CLS] and [SEP] needed here?
144 | tokens.append("[CLS]")
145 | segment_ids.append(segment_id)
146 | for token in tokens_a:
147 | tokens.append(token)
148 | segment_ids.append(segment_id)
149 | tokens.append("[SEP]")
150 | segment_ids.append(segment_id)
151 | masked_lm_labels = [-1] * max_seq_length
152 |
153 | cand_indexes = []
154 | for (i, token) in enumerate(tokens):
155 | if token == "[CLS]" or token == "[SEP]":
156 | continue
157 | cand_indexes.append(i)
158 |
159 | rng.shuffle(cand_indexes)
160 | len_cand = len(cand_indexes)
161 |
162 | output_tokens = list(tokens)
163 |
164 | num_to_predict = min(max_predictions_per_seq,
165 | max(1, int(round(len(tokens) * masked_lm_prob))))
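    |         # e.g. a 40-token sequence gives round(40 * 0.15) = 6 positions to mask, capped at 20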
166 |
167 | masked_lms_pos = []
168 | covered_indexes = set()
169 | for index in cand_indexes:
170 | if len(masked_lms_pos) >= num_to_predict:
171 | break
172 | if index in covered_indexes:
173 | continue
174 | covered_indexes.add(index)
175 |
176 | masked_token = None
177 | # 80% of the time, replace with [MASK]
178 | if rng.random() < 0.8:
179 | masked_token = "[MASK]"
180 | else:
181 | # 10% of the time, keep original
182 | if rng.random() < 0.5:
183 | masked_token = tokens[index]
184 | # 10% of the time, replace with random word
185 | else:
186 | masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]]
187 |
188 | masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0]
189 | output_tokens[index] = masked_token
190 | masked_lms_pos.append(index)
191 |
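    |         # positions left untouched keep label -1 and are ignored by the MLM loss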
192 | init_ids = tokenizer.convert_tokens_to_ids(tokens)
193 | input_ids = tokenizer.convert_tokens_to_ids(output_tokens)
194 |
195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
196 | # tokens are attended to.
197 | input_mask = [1] * len(input_ids)
198 |
199 | # Zero-pad up to the sequence length.
200 | while len(input_ids) < max_seq_length:
201 | init_ids.append(0)
202 | input_ids.append(0)
203 | input_mask.append(0)
204 |             segment_ids.append(0)  # padding positions are masked out, so segment_id 0 is safe
205 |
206 | assert len(init_ids) == max_seq_length
207 | assert len(input_ids) == max_seq_length
208 | assert len(input_mask) == max_seq_length
209 | assert len(segment_ids) == max_seq_length
210 |
211 | if ex_index < 2:
212 | logger.info("*** Example ***")
213 | logger.info("guid: %s" % (example.guid))
214 | logger.info("tokens: %s" % " ".join(
215 | [str(x) for x in tokens]))
216 | logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids]))
217 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
218 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
219 | logger.info(
220 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
221 | logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))
222 |
223 | features.append(
224 | InputFeatures(init_ids=init_ids,
225 | input_ids=input_ids,
226 | input_mask=input_mask,
227 | segment_ids=segment_ids,
228 | masked_lm_labels=masked_lm_labels))
229 | return features
230 |
231 | def rev_wordpiece(tokens):
232 |     """Merge WordPiece '##' pieces back into words, dropping [PAD] and the [CLS]/[SEP] wrappers."""
233 |     if len(tokens) > 1:
234 |         for i in range(len(tokens)-1, 0, -1):
235 |             if tokens[i] == '[PAD]':
236 |                 del tokens[i]
237 |             elif tokens[i].startswith('##'):
238 |                 tokens[i-1] += tokens[i][2:]
239 |                 del tokens[i]
240 |     return " ".join(tokens[1:-1])
241 |
242 | def main():
243 | parser = argparse.ArgumentParser()
244 |
245 | ## Required parameters
246 | parser.add_argument("--data_dir", default="datasets", type=str,
247 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
248 | parser.add_argument("--output_dir", default="aug_data", type=str,
249 | help="The output dir for augmented dataset")
250 | parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
251 | help="The path of pretrained bert model.")
252 |     parser.add_argument("--task_name", default="subj", type=str,
253 | help="The name of the task to train.")
254 | parser.add_argument("--max_seq_length", default=64, type=int,
255 | help="The maximum total input sequence length after WordPiece tokenization. \n"
256 | "Sequences longer than this will be truncated, and sequences shorter \n"
257 | "than this will be padded.")
258 | parser.add_argument("--do_lower_case", default=False, action='store_true',
259 | help="Set this flag if you are using an uncased model.")
260 | parser.add_argument("--train_batch_size", default=32, type=int,
261 | help="Total batch size for training.")
262 | parser.add_argument("--learning_rate", default=5e-5, type=float,
263 | help="The initial learning rate for Adam.")
264 | parser.add_argument("--num_train_epochs", default=9.0, type=float,
265 | help="Total number of training epochs to perform.")
266 | parser.add_argument("--warmup_proportion", default=0.1, type=float,
267 | help="Proportion of training to perform linear learning rate warmup for. "
268 | "E.g., 0.1 = 10%% of training.")
269 | parser.add_argument('--seed', type=int, default=42,
270 | help="random seed for initialization")
271 |
272 | args = parser.parse_args()
273 | with open("global.config", 'rb') as f:
274 | configs_dict = json.load(f)
275 |
276 | args.task_name = configs_dict.get("dataset")
277 | print(args)
278 | run_aug(args, save_every_epoch=False)
279 |
280 |
281 | def run_aug(args, save_every_epoch=False):
282 | processors = {
283 |         # you can add your processor here
284 | "TREC": AugProcessor,
285 | "stsa.fine": AugProcessor,
286 | "stsa.binary": AugProcessor,
287 | "mpqa": AugProcessor,
288 | "rt-polarity": AugProcessor,
289 | "subj": AugProcessor,
290 | }
291 |
292 | task_name = args.task_name
293 | if task_name not in processors:
294 | raise ValueError("Task not found: %s" % (task_name))
295 | args.data_dir = os.path.join(args.data_dir, task_name)
296 | args.output_dir = os.path.join(args.output_dir, task_name)
297 |
298 | random.seed(args.seed)
299 | np.random.seed(args.seed)
300 | torch.manual_seed(args.seed)
301 | os.makedirs(args.output_dir, exist_ok=True)
302 | processor = processors[task_name]()
303 | label_list = processor.get_labels(task_name)
304 |
305 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
306 |
307 | train_examples = None
308 | num_train_steps = None
309 | train_examples = processor.get_train_examples(args.data_dir)
310 | #dev_examples = processor.get_dev_examples(args.data_dir)
311 | #train_examples.extend(dev_examples)
312 | num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
313 |
314 | # Prepare model
315 | model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
316 |
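    |     # BERT pretrains only 2 token-type rows; conditional BERT reuses token_type_embeddings
    |     # for labels, so tasks with more than 2 classes need a freshly initialized, wider embedding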
317 | if task_name == 'stsa.fine':
318 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
319 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
320 | elif task_name == 'TREC':
321 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768)
322 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
323 |
324 | model.cuda()
325 |
326 | # Prepare optimizer
327 | param_optimizer = list(model.named_parameters())
328 | no_decay = ['bias', 'gamma', 'beta']
329 | optimizer_grouped_parameters = [
330 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
331 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
332 | ]
333 | t_total = num_train_steps
334 |     optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
335 |                          warmup=args.warmup_proportion, t_total=t_total)
336 |
337 | global_step = 0
338 | train_features = convert_examples_to_features(
339 | train_examples, label_list, args.max_seq_length, tokenizer)
340 | logger.info("***** Running training *****")
341 | logger.info(" Num examples = %d", len(train_examples))
342 | logger.info(" Batch size = %d", args.train_batch_size)
343 | logger.info(" Num steps = %d", num_train_steps)
344 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long)
345 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
346 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
347 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
348 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)
349 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels)
350 | train_sampler = RandomSampler(train_data)
351 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
352 |
353 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
354 | if not os.path.exists(save_model_dir):
355 | os.mkdir(save_model_dir)
356 | MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
357 |
358 | origin_train_path = os.path.join(args.output_dir, "train_origin.tsv")
359 | save_train_path = os.path.join(args.output_dir, "train.tsv")
360 | shutil.copy(origin_train_path, save_train_path)
361 | best_test_acc = train_text_classifier.train("aug_data")
362 | print("before augment best acc:{}".format(best_test_acc))
363 |
364 | for e in trange(int(args.num_train_epochs), desc="Epoch"):
365 | avg_loss = 0.
366 |
367 | for step, batch in enumerate(train_dataloader):
368 | model.train()
369 | batch = tuple(t.cuda() for t in batch)
370 | _, input_ids, input_mask, segment_ids, masked_ids = batch
371 | loss = model(input_ids, segment_ids, input_mask, masked_ids)
372 | loss.backward()
373 | avg_loss += loss.item()
374 | optimizer.step()
375 | model.zero_grad()
376 | if (step + 1) % 50 == 0:
377 | print("avg_loss: {}".format(avg_loss / 50))
378 | avg_loss = 0
379 | torch.cuda.empty_cache()
380 | shutil.copy(origin_train_path, save_train_path)
381 | save_train_file = open(save_train_path, 'a')
382 | tsv_writer = csv.writer(save_train_file, delimiter='\t')
383 | #tsv_writer.writerow(['sentence', 'label'])
384 | for step, batch in enumerate(train_dataloader):
385 | model.eval()
386 | batch = tuple(t.cuda() for t in batch)
387 | init_ids, _, input_mask, segment_ids, _ = batch
388 | input_lens = [sum(mask).item() for mask in input_mask]
389 | #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens])
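    |             # mask max(len//7, 2) randomly chosen positions in each sentence (repeats possible)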
390 | masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens])
391 | for ids, idx in zip(init_ids,masked_idx):
392 | ids[idx] = MASK_id
393 | predictions = model(init_ids, segment_ids, input_mask)
394 | for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids):
395 | #pred = torch.argsort(pred)[:,-e-1][idx]
396 | '''
397 | pred = torch.argsort(preds)[:,-1][idx]
398 | ids[idx] = pred
399 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
400 | new_str = rev_wordpiece(new_str)
401 | tsv_writer.writerow([new_str, seg[0].item()])
402 | '''
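    |                 # take the second-highest-scoring token at each masked position so the
    |                 # output usually differs from the original (top-1 is often the original token)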
403 | pred = torch.argsort(preds)[:, -2][idx]
404 | ids[idx] = pred
405 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
406 | new_str = rev_wordpiece(new_str)
407 | tsv_writer.writerow([new_str, seg[0].item()])
408 | torch.cuda.empty_cache()
409 | predictions = predictions.detach().cpu()
410 |             torch.cuda.empty_cache()
    |         save_train_file.close()  # flush the augmented rows before the classifier re-reads train.tsv
411 |         bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e))
412 | shutil.copy(save_train_path, bak_train_path)
413 | best_test_acc = train_text_classifier.train("aug_data")
414 | print("epoch {} augment best acc:{}".format(e, best_test_acc))
415 | if save_every_epoch:
416 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
417 | save_model_path = os.path.join(save_model_dir, save_model_name)
418 | torch.save(model, save_model_path)
419 | else:
420 | if (e + 1) % 10 == 0:
421 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
422 | save_model_path = os.path.join(save_model_dir, save_model_name)
423 | torch.save(model, save_model_path)
424 |
425 |
426 | if __name__ == "__main__":
427 | main()
428 |
--------------------------------------------------------------------------------
/datasets/TREC/test.tsv:
--------------------------------------------------------------------------------
1 | sentence label
2 | How far is it from Denver to Aspen ? 5
3 | What county is Modesto , California in ? 4
4 | Who was Galileo ? 3
5 | What is an atom ? 0
6 | When did Hawaii become a state ? 5
7 | How tall is the Sears Building ? 5
8 | George Bush purchased a small interest in which baseball team ? 3
9 | What is Australia 's national flower ? 1
10 | Why does the moon turn orange ? 0
11 | What is autism ? 0
12 | What city had a world fair in 1900 ? 4
13 | What person 's head is on a dime ? 3
14 | What is the average weight of a Yellow Labrador ? 5
15 | Who was the first man to fly across the Pacific Ocean ? 3
16 | When did Idaho become a state ? 5
17 | What is the life expectancy for crickets ? 5
18 | What metal has the highest melting point ? 1
19 | Who developed the vaccination against polio ? 3
20 | What is epilepsy ? 0
21 | What year did the Titanic sink ? 5
22 | Who was the first American to walk in space ? 3
23 | What is a biosphere ? 0
24 | What river in the US is known as the Big Muddy ? 4
25 | What is bipolar disorder ? 0
26 | What is cholesterol ? 0
27 | Who developed the Macintosh computer ? 3
28 | What is caffeine ? 0
29 | What imaginary line is halfway between the North and South Poles ? 4
30 | Where is John Wayne airport ? 4
31 | What hemisphere is the Philippines in ? 4
32 | What is the average speed of the horses at the Kentucky Derby ? 5
33 | Where are the Rocky Mountains ? 4
34 | What are invertebrates ? 0
35 | What is the temperature at the center of the earth ? 5
36 | When did John F. Kennedy get elected as President ? 5
37 | How old was Elvis Presley when he died ? 5
38 | Where is the Orinoco River ? 4
39 | How far is the service line from the net in tennis ? 5
40 | How much fiber should you have per day ? 5
41 | How many Great Lakes are there ? 5
42 | Material called linen is made from what plant ? 1
43 | What is Teflon ? 0
44 | What is amitriptyline ? 0
45 | What is a shaman ? 0
46 | What is the proper name for a female walrus ? 1
47 | What is a group of turkeys called ? 1
48 | How long did Rip Van Winkle sleep ? 5
49 | What are triglycerides ? 0
50 | How many liters in a gallon ? 5
51 | What is the name of the chocolate company in San Francisco ? 3
52 | What are amphibians ? 0
53 | Who discovered x-rays ? 3
54 | Which comedian 's signature line is `` Can we talk '' ? 3
55 | What is fibromyalgia ? 0
56 | What is done with worn or outdated flags ? 0
57 | What does cc in engines mean ? 0
58 | When did Elvis Presley die ? 5
59 | What is the capital of Yugoslavia ? 4
60 | Where is Milan ? 4
61 | What is the speed hummingbirds fly ? 5
62 | What is the oldest city in the United States ? 4
63 | What was W.C. Fields ' real name ? 3
64 | What river flows between Fargo , North Dakota and Moorhead , Minnesota ? 4
65 | What do bats eat ? 1
66 | What state did the Battle of Bighorn take place in ? 4
67 | Who was Abraham Lincoln ? 3
68 | What do you call a newborn kangaroo ? 1
69 | What are spider veins ? 0
70 | What day and month did John Lennon die ? 5
71 | What strait separates North America from Asia ? 4
72 | What is the population of Seattle ? 5
73 | How much was a ticket for the Titanic ? 5
74 | What is the largest city in the world ? 4
75 | What American composer wrote the music for `` West Side Story '' ? 3
76 | Where is the Mall of the America ? 4
77 | What is the pH scale ? 0
78 | What type of currency is used in Australia ? 1
79 | How tall is the Gateway Arch in St. Louis , MO ? 5
80 | How much does the human adult female brain weigh ? 5
81 | Who was the first governor of Alaska ? 3
82 | What is a prism ? 0
83 | When was the first liver transplant ? 5
84 | Who was elected president of South Africa in 1994 ? 3
85 | What is the population of China ? 5
86 | When was Rosa Parks born ? 5
87 | Why is a ladybug helpful ? 0
88 | What is amoxicillin ? 0
89 | Who was the first female United States Representative ? 3
90 | What are xerophytes ? 0
91 | What country did Ponce de Leon come from ? 4
92 | The U.S. Department of Treasury first issued paper currency for the U.S. during which war ? 1
93 | What is desktop publishing ? 0
94 | What is the temperature of the sun 's surface ? 5
95 | What year did Canada join the United Nations ? 5
96 | What is the oldest university in the US ? 3
97 | Where is Prince Edward Island ? 4
98 | Mercury , what year was it discovered ? 5
99 | What is cryogenics ? 0
100 | What are coral reefs ? 0
101 | What is the longest major league baseball-winning streak ? 1
102 | What is neurology ? 0
103 | Who invented the calculator ? 3
104 | How do you measure earthquakes ? 0
105 | Who is Duke Ellington ? 3
106 | What county is Phoenix , AZ in ? 4
107 | What is a micron ? 0
108 | The sun 's core , what is the temperature ? 5
109 | What is the Ohio state bird ? 1
110 | When were William Shakespeare 's twins born ? 5
111 | What is the highest dam in the U.S. ? 4
112 | What color is a poison arrow frog ? 1
113 | What is acupuncture ? 0
114 | What is the length of the coastline of the state of Alaska ? 5
115 | What is the name of Neil Armstrong 's wife ? 3
116 | What is Hawaii 's state flower ? 1
117 | Who won Ms. American in 1989 ? 3
118 | When did the Hindenberg crash ? 5
119 | What mineral helps prevent osteoporosis ? 1
120 | What was the last year that the Chicago Cubs won the World Series ? 5
121 | Where is Perth ? 4
122 | What year did WWII begin ? 5
123 | What is the diameter of a golf ball ? 5
124 | What is an eclipse ? 0
125 | Who discovered America ? 3
126 | What is the earth 's diameter ? 5
127 | Which president was unmarried ? 3
128 | How wide is the Milky Way galaxy ? 5
129 | During which season do most thunderstorms occur ? 5
130 | What is Wimbledon ? 0
131 | What is the gestation period for a cat ? 5
132 | How far is a nautical mile ? 5
133 | Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ? 3
134 | What does target heart rate mean ? 0
135 | What was the first satellite to go into space ? 1
136 | What is foreclosure ? 0
137 | What is the major fault line near Kentucky ? 1
138 | Where is the Holland Tunnel ? 4
139 | Who wrote the hymn `` Amazing Grace '' ? 3
140 | What position did Willie Davis play in baseball ? 3
141 | What are platelets ? 0
142 | What is severance pay ? 0
143 | What is the name of Roy Roger 's dog ? 1
144 | Where are the National Archives ? 4
145 | What is a baby turkey called ? 1
146 | What is poliomyelitis ? 0
147 | What is the longest bone in the human body ? 1
148 | Who is a German philosopher ? 3
149 | What were Christopher Columbus ' three ships ? 1
150 | What does Phi Beta Kappa mean ? 0
151 | What is nicotine ? 0
152 | What is another name for vitamin B1 ? 1
153 | Who discovered radium ? 3
154 | What are sunspots ? 0
155 | When was Algeria colonized ? 5
156 | What baseball team was the first to make numbers part of their uniform ? 3
157 | What continent is Egypt on ? 4
158 | What is the capital of Mongolia ? 4
159 | What is nanotechnology ? 0
160 | In the late 1700 's British convicts were used to populate which colony ? 4
161 | What state is the geographic center of the lower 48 states ? 4
162 | What is an obtuse angle ? 0
163 | What are polymers ? 0
164 | When is hurricane season in the Caribbean ? 5
165 | Where is the volcano Mauna Loa ? 4
166 | What is another astronomic term for the Northern Lights ? 1
167 | What peninsula is Spain part of ? 4
168 | When was Lyndon B. Johnson born ? 5
169 | What is acetaminophen ? 0
170 | What state has the least amount of rain per year ? 4
171 | Who founded American Red Cross ? 3
172 | What year did the Milwaukee Braves become the Atlanta Braves ? 5
173 | How fast is alcohol absorbed ? 5
174 | When is the summer solstice ? 5
175 | What is supernova ? 0
176 | Where is the Shawnee National Forest ? 4
177 | What U.S. state 's motto is `` Live free or Die '' ? 4
178 | Where is the Lourve ? 4
179 | When was the first stamp issued ? 5
180 | What primary colors do you mix to make orange ? 1
181 | How far is Pluto from the sun ? 5
182 | What body of water are the Canary Islands in ? 4
183 | What is neuropathy ? 0
184 | Where is the Euphrates River ? 4
185 | What is cryptography ? 0
186 | What is natural gas composed of ? 1
187 | Who is the Prime Minister of Canada ? 3
188 | What French ruler was defeated at the battle of Waterloo ? 3
189 | What is leukemia ? 0
190 | Where did Howard Hughes die ? 4
191 | What is the birthstone for June ? 1
192 | What is the sales tax in Minnesota ? 1
193 | What is the distance in miles from the earth to the sun ? 5
194 | What is the average life span for a chicken ? 5
195 | When was the first Wal-Mart store opened ? 5
196 | What is relative humidity ? 0
197 | What city has the zip code of 35824 ? 4
198 | What currency is used in Algeria ? 1
199 | Who invented the hula hoop ? 3
200 | What was the most popular toy in 1957 ? 1
201 | What is pastrami made of ? 1
202 | What is the name of the satellite that the Soviet Union sent into space in 1957 ? 1
203 | What city 's newspaper is called `` The Enquirer '' ? 4
204 | Who invented the slinky ? 3
205 | What are the animals that don 't have backbones called ? 1
206 | What is the melting point of copper ? 5
207 | Where is the volcano Olympus Mons located ? 4
208 | Who was the 23rd president of the United States ? 3
209 | What is the average body temperature ? 5
210 | What does a defibrillator do ? 0
211 | What is the effect of acid rain ? 0
212 | What year did the United States abolish the draft ? 5
213 | How fast is the speed of light ? 5
214 | What province is Montreal in ? 4
215 | What New York City structure is also known as the Twin Towers ? 4
216 | What is fungus ? 0
217 | What is the most frequently spoken language in the Netherlands ? 1
218 | What is sodium chloride ? 0
219 | What are the spots on dominoes called ? 1
220 | How many pounds in a ton ? 5
221 | What is influenza ? 0
222 | What is ozone depletion ? 0
223 | What year was the Mona Lisa painted ? 5
224 | What does `` Sitting Shiva '' mean ? 0
225 | What is the electrical output in Madrid , Spain ? 1
226 | Which mountain range in North America stretches from Maine to Georgia ? 4
227 | What is plastic made of ? 1
228 | What is the population of Nigeria ? 5
229 | What does your spleen do ? 0
230 | Where is the Grand Canyon ? 4
231 | Who invented the telephone ? 3
232 | What year did the U.S. buy Alaska ? 5
233 | What is the name of the leader of Ireland ? 3
234 | What is phenylalanine ? 0
235 | How many gallons of water are there in a cubic foot ? 5
236 | What are the two houses of the Legislative branch ? 1
237 | What is sonar ? 0
238 | In Poland , where do most people live ? 4
239 | What is phosphorus ? 0
240 | What is the location of the Sea of Tranquility ? 4
241 | How fast is sound ? 5
242 | What French province is cognac produced in ? 4
243 | What is Valentine 's Day ? 0
244 | What causes gray hair ? 0
245 | What is hypertension ? 0
246 | What is bandwidth ? 0
247 | What is the longest suspension bridge in the U.S. ? 4
248 | What is a parasite ? 0
249 | What is home equity ? 0
250 | What do meteorologists do ? 0
251 | What is the criterion for being legally blind ? 1
252 | Who is the tallest man in the world ? 3
253 | What are the twin cities ? 4
254 | What did Edward Binney and Howard Smith invent in 1903 ? 1
255 | What is the statue of liberty made of ? 1
256 | What is pilates ? 0
257 | What planet is known as the `` red '' planet ? 4
258 | What is the depth of the Nile river ? 5
259 | What is the colorful Korean traditional dress called ? 1
260 | What is Mardi Gras ? 0
261 | Mexican pesos are worth what in U.S. dollars ? 5
262 | Who was the first African American to play for the Brooklyn Dodgers ? 3
263 | Who was the first Prime Minister of Canada ? 3
264 | How many Admirals are there in the U.S. Navy ? 5
265 | What instrument did Glenn Miller play ? 1
266 | How old was Joan of Arc when she died ? 5
267 | What does the word fortnight mean ? 0
268 | What is dianetics ? 0
269 | What is the capital of Ethiopia ? 4
270 | For how long is an elephant pregnant ? 5
271 | How did Janice Joplin die ? 0
272 | What is the primary language in Iceland ? 1
273 | What is the difference between AM radio stations and FM radio stations ? 0
274 | What is osteoporosis ? 0
275 | Who was the first woman governor in the U.S. ? 3
276 | What is peyote ? 0
277 | What is the esophagus used for ? 0
278 | What is viscosity ? 0
279 | What year did Oklahoma become a state ? 5
280 | What is the abbreviation for Texas ? 2
281 | What is a mirror made out of ? 1
282 | Where on the body is a mortarboard worn ? 4
283 | What was J.F.K. 's wife 's name ? 3
284 | What does I.V. stand for ? 2
285 | What is the chunnel ? 0
286 | Where is Hitler buried ? 4
287 | What are antacids ? 0
288 | What is pulmonary fibrosis ? 0
289 | What are Quaaludes ? 0
290 | What is naproxen ? 0
291 | What is strep throat ? 0
292 | What is the largest city in the U.S. ? 4
293 | What is foot and mouth disease ? 1
294 | What is the life expectancy of a dollar bill ? 5
295 | What do you call a professional map drawer ? 1
296 | What are Aborigines ? 0
297 | What is hybridization ? 0
298 | What color is indigo ? 1
299 | How old do you have to be in order to rent a car in Italy ? 5
300 | What does a barometer measure ? 1
301 | What color is a giraffe 's tongue ? 1
302 | What does USPS stand for ? 2
303 | What year did the NFL go on strike ? 5
304 | What is solar wind ? 0
305 | What date did Neil Armstrong land on the moon ? 5
306 | When was Hiroshima bombed ? 5
307 | Where is the Savannah River ? 4
308 | Who was the first woman killed in the Vietnam War ? 3
309 | What planet has the strongest magnetic field of all the planets ? 4
310 | Who is the governor of Alaska ? 3
311 | What year did Mussolini seize power in Italy ? 5
312 | What is the capital of Persia ? 4
313 | Where is the Eiffel Tower ? 4
314 | How many hearts does an octopus have ? 5
315 | What is pneumonia ? 0
316 | What is the deepest lake in the US ? 4
317 | What is a fuel cell ? 0
318 | Who was the first U.S. president to appear on TV ? 3
319 | Where is the Little League Museum ? 4
320 | What are the two types of twins ? 1
321 | What is the brightest star ? 4
322 | What is diabetes ? 0
323 | When was President Kennedy shot ? 5
324 | What is TMJ ? 2
325 | What color is yak milk ? 1
326 | What date was Dwight D. Eisenhower born ? 5
327 | What does the technical term ISDN mean ? 2
328 | Why is the sun yellow ? 0
329 | What is the conversion rate between dollars and pounds ? 5
330 | When was Abraham Lincoln born ? 5
331 | What is the Milky Way ? 0
332 | What is mold ? 0
333 | What year was Mozart born ? 5
334 | What is a group of frogs called ? 1
335 | What is the name of William Penn 's ship ? 1
336 | What is the melting point of gold ? 5
337 | What is the street address of the White House ? 4
338 | What is semolina ? 0
339 | What fruit is Melba sauce made from ? 1
340 | What is Ursa Major ? 0
341 | What is the percentage of water content in the human body ? 5
342 | How much does water weigh ? 5
343 | What was President Lyndon Johnson 's reform program called ? 1
344 | What is the murder rate in Windsor , Ontario ? 5
345 | Who is the only president to serve 2 non-consecutive terms ? 3
346 | What is the population of Australia ? 5
347 | Who painted the ceiling of the Sistine Chapel ? 3
348 | Name a stimulant . 1
349 | What is the effect of volcanoes on the climate ? 0
350 | What year did the Andy Griffith show begin ? 5
351 | What is acid rain ? 0
352 | What is the date of Mexico 's independence ? 5
353 | What is the location of Lake Champlain ? 4
354 | What is the Illinois state flower ? 1
355 | What is Maryland 's state bird ? 1
356 | What is quicksilver ? 0
357 | Who wrote `` The Divine Comedy '' ? 3
358 | What is the speed of light ? 5
359 | What is the width of a football field ? 5
360 | Why in tennis are zero points called love ? 0
361 | What kind of dog was Toto in the Wizard of Oz ? 1
362 | What is a thyroid ? 0
363 | What does ciao mean ? 0
364 | What is the only artery that carries blue blood from the heart to the lungs ? 1
365 | How often does Old Faithful erupt at Yellowstone National Park ? 5
366 | What is acetic acid ? 0
367 | What is the elevation of St. Louis , MO ? 5
368 | What color does litmus paper turn when it comes into contact with a strong acid ? 1
369 | What are the colors of the German flag ? 1
370 | What is the Moulin Rouge ? 0
371 | What soviet seaport is on the Black Sea ? 4
372 | What is the atomic weight of silver ? 5
373 | What currency do they use in Brazil ? 1
374 | What are pathogens ? 0
375 | What is mad cow disease ? 0
376 | Name a food high in zinc . 1
377 | When did North Carolina enter the union ? 5
378 | Where do apple snails live ? 4
379 | What are ethics ? 0
380 | What does CPR stand for ? 2
381 | What is an annuity ? 0
382 | Who killed John F. Kennedy ? 3
383 | Who was the first vice president of the U.S. ? 3
384 | What birthstone is turquoise ? 1
385 | Who was the first US President to ride in an automobile to his inauguration ? 3
386 | How old was the youngest president of the United States ? 5
387 | When was Ulysses S. Grant born ? 5
388 | What is Muscular Dystrophy ? 0
389 | Who lived in the Neuschwanstein castle ? 3
390 | What is propylene glycol ? 0
391 | What is a panic disorder ? 0
392 | Who invented the instant Polaroid camera ? 3
393 | What is a carcinogen ? 0
394 | What is a baby lion called ? 1
395 | What is the world 's population ? 5
396 | What is nepotism ? 0
397 | What is die-casting ? 0
398 | What is myopia ? 0
399 | What is the sales tax rate in New York ? 5
400 | Developing nations comprise what percentage of the world 's population ? 5
401 | What is the fourth highest mountain in the world ? 4
402 | What is Shakespeare 's nickname ? 3
403 | What is the heaviest naturally occurring element ? 1
404 | When is Father 's Day ? 5
405 | What does the acronym NASA stand for ? 2
406 | How long is the Columbia River in miles ? 5
407 | What city 's newspaper is called `` The Star '' ? 4
408 | What is carbon dioxide ? 0
409 | Where is the Mason/Dixon line ? 4
410 | When was the Boston tea party ? 5
411 | What is metabolism ? 0
412 | Which U.S.A. president appeared on `` Laugh-In '' ? 3
413 | What are cigarettes made of ? 1
414 | What is the capital of Zimbabwe ? 4
415 | What does NASA stand for ? 2
416 | What is the state flower of Michigan ? 1
417 | What are semiconductors ? 0
418 | What is nuclear power ? 0
419 | What is a tsunami ? 0
420 | Who is the congressman from state of Texas on the armed forces committee ? 3
421 | Who was president in 1913 ? 3
422 | When was the first kidney transplant ? 5
423 | What are Canada 's two territories ? 4
424 | What was the name of the plane Lindbergh flew solo across the Atlantic ? 1
425 | What is genocide ? 0
426 | What continent is Argentina on ? 4
427 | What monastery was raided by Vikings in the late eighth century ? 1
428 | What is an earthquake ? 0
429 | Where is the tallest roller coaster located ? 4
430 | What are enzymes ? 0
431 | Who discovered oxygen ? 3
432 | What is bangers and mash ? 0
433 | What is the name given to the Tiger at Louisiana State University ? 1
434 | Where are the British crown jewels kept ? 4
435 | Who was the first person to reach the North Pole ? 3
436 | What is an ulcer ? 0
437 | What is vertigo ? 0
438 | What is the spirometer test ? 0
439 | When is the official first day of summer ? 5
440 | What does the abbreviation SOS mean ? 2
441 | What is the smallest bird in Britain ? 1
442 | Who invented Trivial Pursuit ? 3
443 | What gasses are in the troposphere ? 1
444 | Which country has the most water pollution ? 4
445 | What is the scientific name for elephant ? 1
446 | Who is the actress known for her role in the movie `` Gypsy '' ? 3
447 | What breed of hunting dog did the Beverly Hillbillies own ? 1
448 | What is the rainiest place on Earth ? 4
449 | Who was the first African American to win the Nobel Prize in literature ? 3
450 | When is St. Patrick 's Day ? 5
451 | What was FDR 's dog 's name ? 1
452 | What colors need to be mixed to get the color pink ? 1
453 | What is the most popular sport in Japan ? 1
454 | What is the active ingredient in baking soda ? 1
455 | When was Thomas Jefferson born ? 5
456 | How cold should a refrigerator be ? 5
457 | When was the telephone invented ? 5
458 | What is the most common eye color ? 1
459 | Where was the first golf course in the United States ? 4
460 | What is schizophrenia ? 0
461 | What is angiotensin ? 0
462 | What did Jesse Jackson organize ? 3
463 | What is New York 's state bird ? 1
464 | What is the National Park in Utah ? 4
465 | What is Susan B. Anthony 's birthday ? 5
466 | In which state would you find the Catskill Mountains ? 4
467 | What do you call a word that is spelled the same backwards and forwards ? 1
468 | What are pediatricians ? 0
469 | What chain store is headquartered in Bentonville , Arkansas ? 3
470 | What are solar cells ? 0
471 | What is compounded interest ? 0
472 | What are capers ? 0
473 | What is an antigen ? 0
474 | What currency does Luxembourg use ? 1
475 | What is the population of Venezuela ? 5
476 | What type of polymer is used for bulletproof vests ? 1
477 | What currency does Argentina use ? 1
478 | What is a thermometer ? 0
479 | What Canadian city has the largest population ? 4
480 | What color are crickets ? 1
481 | Which country gave New York the Statue of Liberty ? 4
482 | What was the name of the first U.S. satellite sent into space ? 1
483 | What precious stone is a form of pure carbon ? 1
484 | What kind of gas is in a fluorescent bulb ? 1
485 | What is rheumatoid arthritis ? 0
486 | What river runs through Rowe , Italy ? 4
487 | What is cerebral palsy ? 0
488 | What city is also known as `` The Gateway to the West '' ? 4
489 | How far away is the moon ? 5
490 | What is the source of natural gas ? 1
491 | In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ? 1
492 | What is pectin ? 0
493 | What is bio-diversity ? 0
494 | What 's the easiest way to remove wallpaper ? 1
495 | What year did the Titanic start on its journey ? 5
496 | How much of an apple is water ? 5
497 | Who was the 22nd President of the US ? 3
498 | What is the money they use in Zambia ? 1
499 | How many feet in a mile ? 5
500 | What is the birthstone of October ? 1
501 | What is e-coli ? 0
502 |
--------------------------------------------------------------------------------
/datasets/mpqa/dev.tsv:
--------------------------------------------------------------------------------
1 | sentence label
2 | unfettered support 1
3 | such a mindset will impair 0
4 | has revealed george bush's talents as a war leader 1
5 | provide support to the us , as it did 1
6 | kill 0
7 | play with 0
8 | embark on trampling upon human rights of civilians 0
9 | in such a complex state 0
10 | pursue 0
11 | will continue to support 1
12 | places strains 0
13 | wish 1
14 | subjected 0
15 | benefits 0
16 | highly respected 1
17 | this agreement 1
18 | under the strong influence 0
19 | preach 1
20 | hardest hit 0
21 | would be easier 1
22 | will be irresponsible 0
23 | conspicuous policy of playing games 0
24 | to agree 1
25 | to ask 1
26 | was very strong about this 1
27 | were even more full of praise 1
28 | can not be taken seriously 0
29 | felt gratified and relieved 1
30 | invited 1
31 | languages without nuances 0
32 | scheme 0
33 | incapacity to put crime under control 0
34 | it is pure rhetoric 0
35 | charged 0
36 | attacks 0
37 | anger 0
38 | vowed 1
39 | ridiculous 0
40 | are ready 1
41 | were accused 0
42 | will of 1
43 | unheeded 0
44 | opposes and rejects 0
45 | outcry 0
46 | join forces 1
47 | ca n't be burying its head in the sand 0
48 | insidious 0
49 | would not accept 0
50 | too costly to meet 0
51 | should not 0
52 | does not reflect 0
53 | invited 1
54 | top priority of 1
55 | only benefits corporate america 0
56 | still 0
57 | support 1
58 | not intent on revitalizing its economies 0
59 | angry anti - us protests erupted 0
60 | hopes 1
61 | m feeling much better 1
62 | branded 0
63 | the danger 0
64 | proposal to reunify 1
65 | nothing seemed more normal than to settle in someone else's territory 0
66 | to blame 0
67 | have been growing less visionary 0
68 | will fight on 0
69 | shortcomings 0
70 | deadly 0
71 | would incur its displeasure 0
72 | like a dangerous virus 0
73 | it is a futile illusion because it is a lie 0
74 | are irritated 0
75 | in a way our government has not been 0
76 | neither good nor bad , just incorrigible 0
77 | destroyed 0
78 | respecting 1
79 | to promote 1
80 | witch - hunting 0
81 | biggest terrorist country 0
82 | legitimate dissent 1
83 | even against vehicles transporting pregnant women in labor and against unarmed citizens 0
84 | niceties 1
85 | his ship has been sailing towards a wrong direction for far too long 0
86 | critics 0
87 | once again turn its back 0
88 | growing unrest 0
89 | defects 0
90 | has no love lost 0
91 | violent protests 0
92 | confined to minute cells 0
93 | immediately 1
94 | corrupt politicians 0
95 | a crime against humanity 0
96 | the rights and privileges that they would automatically get under geneva convention 0
97 | perils of delusion 0
98 | would be better off 1
99 | have been concerned 0
100 | uproar 0
101 | pained him 0
102 | terrorism 0
103 | not only prejudice 0
104 | a brutal occupation 0
105 | meddling 0
106 | falls at a bad time 0
107 | ugliest crimes 0
108 | picking on 0
109 | allegations 0
110 | are being well treated 1
111 | the serious crisis 0
112 | will be a monumental tragedy 0
113 | is proud 1
114 | thus 0
115 | oh , my god 0
116 | will not allow 0
117 | the worst crisis 0
118 | turn its back 0
119 | deepened 0
120 | will complain 0
121 | very hard line 0
122 | opposition 0
123 | would not dilute 1
124 | secret 0
125 | opposition 0
126 | illusory 0
127 | not adequately free and fair 0
128 | should follow 1
129 | impossible 0
130 | an aberration that goes against mankind 0
131 | have preference for 1
132 | all around us 0
133 | very , very large 1
134 | lie 0
135 | were ill - received 0
136 | difficult 0
137 | and then assert the laws of war do not apply 0
138 | from upholding religious and cultural values 1
139 | preferred 1
140 | full support 1
141 | criticizing 0
142 | immediate exploitation 0
143 | ignored 0
144 | has unashamedly seen fit 1
145 | support 1
146 | now a reality 1
147 | gaffe 0
148 | virtually nothing 0
149 | no place in israel can be considered safe 0
150 | to invite 1
151 | prevent such a development 1
152 | rigged 0
153 | logical solution 1
154 | bungled its dealings 0
155 | demerits 0
156 | like the suffering 0
157 | unacceptable 0
158 | supports 1
159 | 100 percent 1
160 | does not appear 0
161 | pretext 0
162 | what begins to happen when we hate our friends 0
163 | denies 0
164 | intimidation 0
165 | children skipping school 0
166 | criticism 0
167 | violence and intimidation 0
168 | blatantly obstructed 0
169 | reactions of repulsion 0
170 | are not happy 0
171 | criticized 0
172 | a growing impression that misuari could pose a security threat 0
173 | left behind 1
174 | they have not succeeded , and will never succeed 1
175 | it's shameful 0
176 | beyond the reach of any legal regime 0
177 | is pessimistic 0
178 | has no right 0
179 | are masterminded 0
180 | regret 0
181 | will weaken soon 0
182 | terrible tragedy 0
183 | there is no alternative to it but conflict , isolation , nationalism , and ultimately war 0
184 | freak show 0
185 | insisted 1
186 | axis of evil 0
187 | come under fire 0
188 | threats expressed by 0
189 | the support 1
190 | wrong 0
191 | has resisted 0
192 | the darkest hour is always before the dawn 1
193 | the entire palestinian people 0
194 | denounced 0
195 | very serious threat 0
196 | only 0
197 | making a spectacle of 0
198 | promote meetings 1
199 | to mould the electoral process in his favour 0
200 | not a single day would pass in peace and with no palestinians shedding blood 0
201 | crises 0
202 | hoping 1
203 | to intimidate women and children 0
204 | disputes 0
205 | extreme right 0
206 | most dangerous 0
207 | may even get better 0
208 | promise 1
209 | has pledged 1
210 | one body with two heads 0
211 | feel at ease 1
212 | turning a blind eye 0
213 | has floundered 0
214 | warns 0
215 | most cogent argument 1
216 | lump 0
217 | deals a blow 0
218 | bad treatment 0
219 | concerned 0
220 | have criticized 0
221 | you can hardly speak of a targeting error 0
222 | certain countries resort to 0
223 | called for 1
224 | violating human rights 0
225 | unusual 0
226 | would undermine 0
227 | a terrorist act 0
228 | threat 0
229 | objectives 1
230 | can be abated 1
231 | muscle - flexing 0
232 | pursued 1
233 | brilliant 1
234 | committed 1
235 | committing themselves 1
236 | support 1
237 | the importance of china 1
238 | only one single 0
239 | hoped 1
240 | closed ranks behind 1
241 | peace 1
242 | would not be a bad idea 1
243 | legitimate 1
244 | closer to that of cowboys than to a civilized mentality 0
245 | such fruitful results 1
246 | double crime 0
247 | perfectly at ease 1
248 | mistake 0
249 | repeatedly accused 0
250 | ceased to be a soldier of the fatherland 0
251 | however 0
252 | apocalyptic savagery 0
253 | a border of peace and good neighbourliness 1
254 | call for 1
255 | hurt 0
256 | declined to endorse 0
257 | countries such as iran , iraq and north korea represent an ``axis of evil 0
258 | more serious 0
259 | am confident 1
260 | picking on 0
261 | whether 0
262 | go out into the streets to defend 1
263 | committed one more mistake 0
264 | the agreement 1
265 | charging 0
266 | other aggressive acts against lebanon 0
267 | enjoying 1
268 | decided 0
269 | hard - line 0
270 | feels 0
271 | do n't want 0
272 | the threats launched 0
273 | rejected 0
274 | breaking fundamental concepts 0
275 | increasingly tyrannical 0
276 | undisguised declaration of war and a rhetoric threatening aggression 0
277 | want 1
278 | hyperbole , 0
279 | patronizing 0
280 | arbitrary arrests 0
281 | jeopardy 0
282 | could not be said to adequately comply 0
283 | with good economic management , israel should have little trouble 1
284 | terrorist allies 0
285 | swift criticism from 0
286 | instead 0
287 | steering the economy into disaster 0
288 | grew so unhappy 0
289 | will not resort 0
290 | has refused 0
291 | may not be feasible 0
292 | danger of being shelved altogether 0
293 | unmanageable 0
294 | favouring independence 1
295 | misrule 0
296 | misery 0
297 | steering the nation to prosperity and progress 1
298 | america's biding 0
299 | compounded the problem 0
300 | designed to benefit mugabe 0
301 | supposed to be 0
302 | appraised 1
303 | stand beside right and justice 1
304 | refuses 0
305 | surged 1
306 | denied 0
307 | fake imposter 0
308 | great evil on open display 0
309 | ideal , sunny clime 1
310 | seems to be determined to expand the scope of the anti - terror war 0
311 | a body blow 0
312 | as full citizens in the same sate 1
313 | lecturing 0
314 | suffered 0
315 | harmed 0
316 | criticized 0
317 | scores of 0
318 | ignore the consequences 0
319 | territorial ambition 0
320 | not 0
321 | no one has the right to wage war against the history of this nation 0
322 | wants 1
323 | hatred 0
324 | what occured in the united states on 11 september 0
325 | oneupmanship 0
326 | labeling 0
327 | showed little - disguised irritation 0
328 | concern 0
329 | even 0
330 | the possibility of a democratic , stable and prosperous 1
331 | strongly criticized and condemned 0
332 | will not admit 0
333 | persuade 1
334 | was more than confident 1
335 | had not heeded 0
336 | cost - effective 1
337 | mistake 0
338 | would adhere to a pluralistic vision 1
339 | had asked 1
340 | replete with 1
341 | shameful 0
342 | will only 0
343 | accidental slight 0
344 | nothing whatsoever 0
345 | could no longer tolerate 0
346 | leeway 1
347 | the doubts 0
348 | aggressions against 0
349 | raving 0
350 | are blamed by 0
351 | so many uncertainties 0
352 | destined to collapse 1
353 | confidence crisis 0
354 | are accusing 0
355 | warned 0
356 | such pessimism 0
357 | anxiety 0
358 | cause a rift 0
359 | no politically prudent 0
360 | held out an olive branch 1
361 | continues to demolish 0
362 | does not endorse 0
363 | failed 1
364 | comprehensive destructive policy 0
365 | sounds clumsy 0
366 | fails to meet the standard of being free and fair 0
367 | willfulness 1
368 | tough policy 0
369 | has not satisfied 0
370 | extremist inclinations 0
371 | unilateral 0
372 | the iranians have not done what the pakistan government has done 0
373 | because 0
374 | boycotted 0
375 | without just compensation 0
376 | desperation of the people 0
377 | damage 0
378 | dominating the world 0
379 | sought 1
380 | fearing 0
381 | that concessions are sufficiently concrete 1
382 | put his nation first 1
383 | expresses the concern 0
384 | can contaminate 0
385 | to put it mildly 0
386 | will do its utmost 1
387 | endorsed 1
388 | will surely be on the president's lips 0
389 | with typical understatement 0
390 | lambasted 0
391 | affirmed 1
392 | vitriol 0
393 | in compliance 0
394 | numerous serious abuses 0
395 | asked 1
396 | confidence 1
397 | played the same tactic again 0
398 | are enthusiastic 1
399 | valued ally and friend 1
400 | so - called 0
401 | are regarding 0
402 | running out of meaningful options 0
403 | can not accept 0
404 | is concerned 0
405 | further criticism 0
406 | of all the unconscionable things 0
407 | longest destructive war 0
408 | plight of 0
409 | can no longer stand shoulder to shoulder 0
410 | it would be madness to hold elections now 0
411 | should be looking up smiling 1
412 | if one goes by the us logic , only the us spy plane is permitted to spy at other people's doorsteps 0
413 | begins to bear fruit 1
414 | rigged 0
415 | serious division 0
416 | enjoys no social base 0
417 | very constructive 1
418 | grateful 1
419 | turned thugs 0
420 | was unconstitutional 0
421 | scares away 0
422 | declining to endorse 0
423 | violate human rights 0
424 | accused 0
425 | negatively 0
426 | due regard 1
427 | simplistic 0
428 | will help renew 1
429 | succumbing 0
430 | peace to prevail 1
431 | despite all of this , however 0
432 | proves this beyond the shadow of a doubt 1
433 | is accused 0
434 | intolerant 0
435 | the visit has achieved positive and successful results 1
436 | wantonly infringing 0
437 | assassin 0
438 | continue to obstruct 0
439 | interests 1
440 | was a strong desire among 1
441 | to denounce 0
442 | left no avenue unexplored 1
443 | endorsed 1
444 | would also ensure a durable peace 1
445 | as a vehicle for increasing personal popularity 1
446 | would have been sending a very bad signal 0
447 | continues to refuse 0
448 | pursuit 0
449 | committing themselves to peace 1
450 | accuses 0
451 | as if they were quarries 0
452 | far worse than those prevailing in camp x - ray 0
453 | inhumanely 0
454 | appropriate 1
455 | poor 0
456 | good education system 1
457 | illegal 0
458 | devastating 0
459 | to create the impression 0
460 | exacerbating 0
461 | faltered as a result of israel's intransigence 0
462 | his career was like a rough sea , with highs and lows and never calm 0
463 | generally approved of 1
464 | deviant 0
465 | without trying to maneuver , place obstacles , or set impossible conditions 0
466 | the fire is raging at home 0
467 | lacks credibility and can not withstand any objective scrutiny 0
468 | endorsed 1
469 | assassinate innocent activists and citizens 0
470 | than they deserve 0
471 | refusal to respect its obligations 0
472 | would not 0
473 | for the first time 1
474 | beautiful historic coincidence 1
475 | hardly elastic enough 0
476 | approve 1
477 | hope 1
478 | was so hard on 0
479 | most serious consequences 0
480 | will have trouble wriggling out of the need to explain and justify 0
481 | not completely reliable 0
482 | democracy exhausted all its generosity 0
483 | relatively calm 1
484 | are extremely concerned 0
485 | bringing an end to terrorism and the taliban 1
486 | reject 0
487 | stand firm 1
488 | new political bogey 0
489 | abstractly recommend 1
490 | was worried 0
491 | hardline 0
492 | crash course 0
493 | recognition 1
494 | repeated denunciations 0
495 | hope 1
496 | carnage 0
497 | threats 0
498 | swapping the silver - visored helmet of a space cadet for the green eyeshade of a consummate bean counter 0
499 | to voice his concern 0
500 | significant 0
501 | can bring security and stability to all the parties without exception 1
502 | would consider 1
503 | be reproached 0
504 | or so it claims 0
505 | harboring serious doubts 0
506 | if it becomes more of a nuisance 0
507 | need to establish a just peace 1
508 | immediate support 1
509 | discrediting 0
510 | picking a quarrel 0
511 | bridle the israeli oppressive activity 1
512 | the euro , our currency 1
513 | edifying photograph 1
514 | wrong judgment 0
515 | preservation of global peace and security 1
516 | sharply questioned 0
517 | denied 0
518 | respect 1
519 | unfeasible 0
520 | has promised 1
521 | fierce demonstrations 0
522 | to thwart 1
523 | disrespect of advice 0
524 | war on 0
525 | as the saying goes , when you pull up the turnip , mud comes with it 0
526 | smiling 1
527 | judgment 0
528 | an axis of evil 0
529 | criticizes 0
530 | costly burden 0
531 | detrimental 0
532 | earned eternal notoriety 0
533 | making liberal use of the but construction 0
534 | plotting 0
535 | financial disaster 0
536 | would be regarded 0
537 | parasitic economies 0
538 | wanted 1
539 | more demagogy than arguments 0
540 | israel's superior ability to punish palestinians 0
541 | flirting with 0
542 | was perceived 0
543 | what is europe 0
544 | the best role model 1
545 | inaccurate and fabricated 0
546 | recognition 1
547 | axis of evil rhetoric 0
548 | lost their illegitimate interests 0
549 | not a man who likes to improvise 0
550 | is criticized 0
551 | can trust 1
552 | foresaw 0
553 | heresy 0
554 | him has not been pretty 0
555 | demonstrations and rallies against 0
556 | axis of evil 0
557 | cold war heritage 0
558 | possible regression 0
559 | will guarantee 0
560 | came out in protest 0
561 | progressive 1
562 | peaceful protests 0
563 | difficulty , of course 0
564 | put on a spectacle 0
565 | dogma 0
566 | devastating what remains 0
567 | is stiffening in its attitude toward 0
568 | stuck for 18 months on ground zero 0
569 | put most of the blame 0
570 | very large 0
571 | once again 0
572 | it is almost impossible 0
573 | has criticised 0
574 | axis of evil 0
575 | humiliation of 0
576 | calm 1
577 | are tired 0
578 | enjoy 1
579 | wanted 1
580 | has blamed 0
581 | unprecedented force 0
582 | provocative 0
583 | far preferable 1
584 | repeated warnings 0
585 | neither free nor fair 0
586 | backing 1
587 | widespread debates against 0
588 | using violence , intimidation , special laws and dirty tricks to fix the two - day election 0
589 | continue to obstruct 0
590 | seeking 1
591 | biased attitude 0
592 | long neglected 0
593 | success of western europe after world war ii laid in its creation of the good neighborly field 1
594 | even risking his own future 0
595 | accused 0
596 | denied 0
597 | desperate tableau 0
598 | could n't wait 1
599 | blessed 1
600 | nuclear club 0
601 | most important 1
602 | wants 0
603 | questioning 0
604 | no one is listening 0
605 | complicates any situation 0
606 | who were expelled from their homeland 0
607 | distortion of reality 0
608 | had the advantage for washington of weakening opec 1
609 | the support 1
610 | may god be satisfied with him 1
611 | continue to rise 0
612 | will not be able lay claim to a worthy place in the civilized world 0
613 | has to be maintained at any cost 0
614 | thugs 0
615 | apprehensions 0
616 | want 1
617 | recommendations 1
618 | one of the few enlightened officials 1
619 | democratic achievements 1
620 | tensions between 0
621 | see 0
622 | infuriating 0
623 | first organized protest 0
624 | feared by 0
625 | ignored 0
626 | believe 0
627 | particular concern is raised 0
628 | main problems are in the economic area 0
629 | damages the credibility 0
630 | declining to comply 0
631 | blamed 0
632 | allegedly threatening 0
633 | dismisses 0
634 | the unconditional support 1
635 | not satisfactory 0
636 | negated 0
637 | berating 0
638 | would remake venezuela to benefit the poor 1
639 | `world judge of human rights' 0
640 | were only to be expected 0
641 | cooperation 1
642 | issuing a letter of objection 0
643 | would support wholeheartedly 1
644 | giving him medium approval 1
645 | at a loss 0
646 | dwindling moral values 0
647 | is self - inflicted 0
648 | if it is successful 0
649 | impose 0
650 | eradication 0
651 | feared 0
652 | scupper any hope 0
653 | backing away 0
654 | held 0
655 | will have no universally - acknowledged 0
656 | did not show the slightest sympathy , still less the least regret 0
657 | advanced a mendacious critique 0
658 | disputes between 0
659 | encouraged 1
660 | another setback for the imf 0
661 | wanted 1
662 | clear priority 1
663 | crimes of war 0
664 | destroyed 0
665 | warned 0
666 | decided to back 1
667 | violates the united nations charter 0
668 | interfere 0
669 | have n't already violated 0
670 | defenceless 0
671 | was effective especially 1
672 | is accusing 0
673 | a man blinded by power 0
674 | backing 1
675 | it's really strange 0
676 | his favourite newfie 0
677 | mercurial strongman 0
678 | that four shots would solve the problem 0
679 | would not accept 0
680 | there is no reason for it to be impossible 1
681 | poor response 0
682 | aspirations 1
683 | to the healthy development of sino - us relations 1
684 | resolute commitment 1
685 | democracy 1
686 | the concern 0
687 | only wants 1
688 | needlessly 0
689 | secretly behind every local misfortune 0
690 | positive 1
691 | is disgusted 0
692 | high degree of difficulty 0
693 | improvement 1
694 | spurned 0
695 | has denounced 0
696 | axis of evil theory 0
697 | extensive support 1
698 | worst 0
699 | its selfishness 0
700 | ambition 1
701 | find no grounds for any support 0
702 | achieving better results 1
703 | no tears will be shed in this corner 0
704 | is a telling example of a new - and perhaps risky - approach 0
705 | will become one of the best elements 1
706 | was also quite naive 0
707 | the criticism 0
708 | desire to work 1
709 | for the sake of peace 1
710 | double standard 0
711 | pretended 0
712 | alarm 0
713 | discontent 0
714 | furthermore 0
715 | massacring thousands of innocent people 0
716 | immense gulf between 0
717 | still wants 1
718 | basically sound 1
719 | repeatedly threatened 0
720 | mendacious 0
721 | beyond reproach 1
722 | but 0
723 | neat stuff 1
724 | purposely play up 0
725 | blatant campaign of intimidation 0
726 | disappointment 0
727 | provoked 0
728 | intends to 0
729 | worried 0
730 | real danger 0
731 | supported 1
732 | support 1
733 | were plotting 0
734 | agreed 1
735 | most realistic 1
736 | violating 0
737 | value sharing 1
738 | to its knees 0
739 | to request 1
740 | get out ! . 0
741 | the axis of evil 0
742 | assassins , assassins , assassins 0
743 | is the appropriate one 1
744 | resistance 1
745 | indulging in blood - shed and their lunaticism 0
746 | desperate 0
747 | to describe 0
748 | can ask 1
749 | know 0
750 | invited 1
751 | to express satisfaction 1
752 | harmonious and close 1
753 | support 1
754 | has been inclined toward 1
755 | incited 0
756 | was like giving away the country to its former colonial masters 0
757 | the destruction 0
758 | already volatile 0
759 | overwhelming evidence 0
760 | imperative for harmonious society 1
761 | nor does it seem proper 0
762 | storming palestinian territories 0
763 | is reflected 1
764 | expressed satisfaction 1
765 | intends 1
766 | ruined 0
767 | made public a blacklist 0
768 | those who seek to attack and destroy law and order and legitimate government 0
769 | unlawful 0
770 | freely 1
771 | reassurances 1
772 | agreeable 1
773 | dont want 0
774 | did not agree 0
775 | to deny 0
776 | perception 0
777 | had particularly harsh words for 0
778 | want 1
779 | the brutality with which this closure was implemented was unheard of 0
780 | friendship 1
781 | axis of evil 0
782 | supported 1
783 | slaughter of more than 0
784 | will be asked 1
785 | just , legitimate rights 1
786 | condemned 0
787 | thanked 1
788 | unstable situation 0
789 | merely an interlude 0
790 | depends on 0
791 | has been efficient enough 1
792 | grand 1
793 | any step backward for democracy 0
794 | quite supportive 1
795 | great leader 1
796 | would help 1
797 | formed a close friendship 1
798 | uphold 1
799 | is feared by 0
800 | used to boast 1
801 | regarded 1
802 | would violate international standards prohibiting cruel , inhuman or degrading treatment 0
803 | convince the americans that the bush administration has not only succeeded in removing a regime of religious madmen 0
804 | has put west asia on the brink of another war 0
805 | has also backed 1
806 | attacks 0
807 | banned 0
808 | dishonoring 0
809 | sent congratulations 1
810 | thought 0
811 | may be disconcerted 0
812 | concern 0
813 | will be inviting 1
814 | have to bash 0
815 | plan 1
816 | guaranteed 1
817 | the mistake is to assume 0
818 | insists 1
819 | sticking to the polluting policies 0
820 | can be difficult 0
821 | can prevent 1
822 | extremely dangerous 0
823 | would beg for surrender 0
824 | protesting 0
825 | proclaimed 1
826 | to applaud everything 1
827 | the most ideal way 1
828 | accused 0
829 | what fate awaits argentina's crisis - stricken economy 0
830 | especially those who are dangerous and suicidal 0
831 | has failed for a decade now 0
832 | had likely deteriorated beyond repair 0
833 | attempts to suppress 0
834 | endorse 1
835 | if there were serious problems 0
836 | designed to achieve an outcome - power at all costs 0
837 | criticism of 0
838 | everything 0
839 | so that we would not become terrorists 1
840 | higher transparency 1
841 | warning 0
842 | does not justify all the means 0
843 | the further 0
844 | artificial obstacles 0
845 | would improve 0
846 | has now made himself heard 1
847 | openly invited 1
848 | argued 0
849 | decisions 0
850 | admittedly 1
851 | still prefer 1
852 | had planned 0
853 | sacrifice himself 0
854 | very pleasant for 1
855 | prevented 0
856 | favorable opinions 1
857 | no one we know to have planned such deeds will escape 0
858 | to express openly dissenting political and religious views 0
859 | confidence 1
860 | no business 0
861 | has provoked concern 0
862 | had aligned against 0
863 | under the pretext of fighting terrorism 0
864 | violation of the palestinian people's human rights 0
865 | understanding and approval 1
866 | concerns 0
867 | universal character of the prophets 1
868 | force himself 0
869 | getting mixed up with crime , drugs , and violence 0
870 | bold and fair 1
871 | like an upholder of justice 1
872 | factually inaccurate 0
873 | criticism 0
874 | was shocked 0
875 | saw 1
876 | had refused to accept 0
877 | would make it possible 1
878 | domino effect 0
879 | palestinian hand that is stretched for peace 0
880 | has refused to bow 0
881 | despite the perils 0
882 | will be instrumental 1
883 | concerns 0
884 | those who are really behind it all , those who are behind this business , use the imf 0
885 | accommodate 1
886 | doubts 0
887 | slammed 0
888 | will seek 1
889 | grieving 0
890 | called for 1
891 | isolated and frustrated 0
892 | in the name of self - righteousness 0
893 | intensify and accelerate 0
894 | keeping a wary eye 0
895 | took to the streets 0
896 | gave free rein 0
897 | to rid 1
898 | wanted 1
899 | betrayal 0
900 | the branding 0
901 | collapse 0
902 | i lost any sense of dignity 0
903 | massive intimidatory show of force that was evident in the deployment of security forces 0
904 | unpopular system 0
905 | regarded 0
906 | has posed serious threats 0
907 | to draw up an action campaign aimed 0
908 | illegally 0
909 | reputation was ruined 0
910 | the opposition 0
911 | cracks are appearing 0
912 | could fray the bilateral goodwill 0
913 | was deemed 0
914 | has renounced 0
915 | the cia would organize coups and assassinations to further us interests 0
916 | the definition of a democratic regime is subjected to double standards 0
917 | strike 0
918 | challenged 0
919 | sympathy 0
920 | rubbed their palms at length 1
921 | shook 0
922 | sided with 1
923 | trying to move the goal posts 0
924 | lose popular support among 0
925 | will only invite worse criticism and rejection 0
926 | axis of evil 0
927 | absolutely not permitted 0
928 | found little to object to 0
929 | justify 1
930 | extension of its characteristic policy of hegemony 0
931 | unrealistic 0
932 | warned against 0
933 | crippled 0
934 | well 1
935 | it is ineffective 0
936 | is now promoting 1
937 | called 0
938 | in spite of the good offices 0
939 | genuinely 1
940 | for more than two years 0
941 | tried to encourage 1
942 | loathsome action 0
943 | reckless 0
944 | and killed 0
945 | likening 0
946 | will now agree 1
947 | exalted 1
948 | would like 1
949 | hoped 1
950 | even more dismal 0
951 | gives new life 1
952 | rejected 0
953 | predatory 0
954 | viciously spoke ill 0
955 | in protest against 0
956 | was remarkable 1
957 |
--------------------------------------------------------------------------------
/evaluator.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import copy
3 |
4 | import six
5 |
6 | import chainer
7 | from chainer import configuration
8 | from chainer.dataset import convert
9 | from chainer.dataset import iterator as iterator_module
10 | from chainer import function
11 | from chainer import link
12 | from chainer import reporter as reporter_module
13 | from chainer.training import extension
14 |
15 |
16 | class MicroEvaluator(chainer.training.extensions.Evaluator):
17 |     """Evaluator whose evaluate() returns per-example (micro-averaged) means of the reported metrics."""
18 | def evaluate(self):
19 | iterator = self._iterators['main']
20 | eval_func = self.eval_func or self._targets['main']
21 |
22 | if self.eval_hook:
23 | self.eval_hook(self)
24 |
25 | if hasattr(iterator, 'reset'):
26 | iterator.reset()
27 | it = iterator
28 | else:
29 | it = copy.copy(iterator)
30 |
31 | # summary = reporter_module.DictSummary()
32 | summary = collections.defaultdict(list)
33 |
34 | for batch in it:
35 | observation = {}
36 | with reporter_module.report_scope(observation):
37 | in_arrays = self.converter(batch, self.device)
38 | with function.no_backprop_mode():
39 | if isinstance(in_arrays, tuple):
40 | eval_func(*in_arrays)
41 | elif isinstance(in_arrays, dict):
42 | eval_func(**in_arrays)
43 | else:
44 | eval_func(in_arrays)
45 | n_data = len(batch)
46 | summary['n'].append(n_data)
47 | # summary.add(observation)
48 | for k, v in observation.items():
49 | summary[k].append(v)
50 |
51 | mean = dict()
52 | ns = summary['n']
53 | del summary['n']
54 | for k, vs in summary.items():
55 | mean[k] = sum(v * n for v, n in zip(vs, ns)) / sum(ns)
56 | return mean
57 | # return summary.compute_mean()
58 |
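59 | # Minimal usage sketch (illustrative only; `test_iter`, `model`, `trainer` and
60 | # the device id are assumed names, not part of this repo). MicroEvaluator
61 | # drops in wherever the stock chainer Evaluator would:
62 | #
63 | #     evaluator = MicroEvaluator(test_iter, model, device=0)
64 | #     trainer.extend(evaluator)
65 | #
66 | # Unlike DictSummary-based averaging, evaluate() above weights each observed
67 | # metric by its batch size, so the returned values are per-example (micro)
68 | # means even when the final batch is smaller than the rest.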
--------------------------------------------------------------------------------
/finetune_dataset.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import csv
6 | import os
7 | import shutil
8 | import logging
9 | import argparse
10 | import random
11 | from tqdm import tqdm, trange
12 | import json
13 |
14 | import numpy as np
15 | import torch
16 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
17 | from torch.utils.data.distributed import DistributedSampler
18 |
19 | from pytorch_pretrained_bert.tokenization import BertTokenizer
20 | from pytorch_pretrained_bert.modeling import BertForMaskedLM
21 | from pytorch_pretrained_bert.optimization import BertAdam
22 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
23 |
24 | import train_text_classifier
25 |
26 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
27 | datefmt='%m/%d/%Y %H:%M:%S',
28 | level=logging.INFO)
29 | logger = logging.getLogger(__name__)
30 |
31 |
32 | class DataProcessor(object):
33 | """Base class for data converters for sequence classification data sets."""
34 |
35 | def get_train_examples(self, data_dir):
36 | """Gets a collection of `InputExample`s for the train set."""
37 | raise NotImplementedError()
38 |
39 | def get_dev_examples(self, data_dir):
40 | """Gets a collection of `InputExample`s for the dev set."""
41 | raise NotImplementedError()
42 |
43 | def get_labels(self):
44 | """Gets the list of labels for this data set."""
45 | raise NotImplementedError()
46 |
47 | @classmethod
48 | def _read_tsv(cls, input_file, quotechar=None):
49 | """Reads a tab separated value file."""
50 | with open(input_file, "r") as f:
51 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
52 | lines = []
53 | for line in reader:
54 | lines.append(line)
55 | return lines
56 |
57 |
58 | class InputExample(object):
59 | """A single training/test example for simple sequence classification."""
60 |
61 | def __init__(self, guid, text_a, label=None):
62 | """Constructs a InputExample.
63 |
64 | Args:
65 | guid: Unique id for the example.
66 | text_a: string. The untokenized text of the first sequence. For single
67 | sequence tasks, only this sequence must be specified.
68 | """
69 | self.guid = guid
70 | self.text_a = text_a
71 | self.label = label
72 |
73 |
74 | class InputFeatures(object):
75 | """A single set of features of data."""
76 |
77 | def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels):
78 | self.init_ids = init_ids
79 | self.input_ids = input_ids
80 | self.input_mask = input_mask
81 | self.segment_ids = segment_ids
82 | self.masked_lm_labels = masked_lm_labels
83 |
84 | class AugProcessor(DataProcessor):
85 | """Processor for dataset to be augmented."""
86 |
87 | def get_train_examples(self, data_dir):
88 | """See base class."""
89 | return self._create_examples(
90 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
91 |
92 | def get_dev_examples(self, data_dir):
93 | """See base class."""
94 | return self._create_examples(
95 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
96 |
97 | def get_labels(self, name):
98 | """add your dataset here"""
99 | if name in ['stsa.binary', 'mpqa', 'rt-polarity', 'subj']:
100 | return ["0", "1"]
101 | elif name in ['stsa.fine']:
102 | return ["0", "1", "2", "3", "4"]
103 | elif name in ['TREC']:
104 | return ["0", "1", "2", "3", "4", "5"]
105 |
106 | def _create_examples(self, lines, set_type):
107 | """Creates examples for the training and dev sets."""
108 | examples = []
109 | for (i, line) in enumerate(lines):
110 | if i == 0:
111 | continue
112 | guid = "%s-%s" % (set_type, i)
113 | text_a = line[0]
114 | label = line[-1]
115 | examples.append(
116 | InputExample(guid=guid, text_a=text_a, label=label))
117 | return examples
118 |
119 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
120 | """Loads a data file into a list of `InputBatch`s."""
121 |
122 | label_map = {}
123 | for (i, label) in enumerate(label_list):
124 | label_map[label] = i
125 |
126 | features = []
127 | # ----
128 | dupe_factor = 5
129 | masked_lm_prob = 0.15
130 | max_predictions_per_seq = 20
131 | rng = random.Random(12345)
132 |
133 | for (ex_index, example) in enumerate(examples):
134 | tokens_a = tokenizer.tokenize(example.text_a)
135 | segment_id = label_map[example.label]
136 | # Account for [CLS] and [SEP] with "- 2"
137 | if len(tokens_a) > max_seq_length - 2:
138 | tokens_a = tokens_a[0:(max_seq_length - 2)]
139 |
140 | # Because we use conditional BERT, we place the label information in segment_ids
141 | tokens = []
142 | segment_ids = []
143 | # are [CLS] and [SEP] needed?
144 | tokens.append("[CLS]")
145 | segment_ids.append(segment_id)
146 | for token in tokens_a:
147 | tokens.append(token)
148 | segment_ids.append(segment_id)
149 | tokens.append("[SEP]")
150 | segment_ids.append(segment_id)
151 | masked_lm_labels = [-1] * max_seq_length
152 |
153 | cand_indexes = []
154 | for (i, token) in enumerate(tokens):
155 | if token == "[CLS]" or token == "[SEP]":
156 | continue
157 | cand_indexes.append(i)
158 |
159 | rng.shuffle(cand_indexes)
160 | len_cand = len(cand_indexes)
161 |
162 | output_tokens = list(tokens)
163 |
164 | num_to_predict = min(max_predictions_per_seq,
165 | max(1, int(round(len(tokens) * masked_lm_prob))))
166 |
167 | masked_lms_pos = []
168 | covered_indexes = set()
169 | for index in cand_indexes:
170 | if len(masked_lms_pos) >= num_to_predict:
171 | break
172 | if index in covered_indexes:
173 | continue
174 | covered_indexes.add(index)
175 |
176 | masked_token = None
177 | # 80% of the time, replace with [MASK]
178 | if rng.random() < 0.8:
179 | masked_token = "[MASK]"
180 | else:
181 | # 10% of the time, keep original
182 | if rng.random() < 0.5:
183 | masked_token = tokens[index]
184 | # 10% of the time, replace with a random token drawn from this sequence
185 | else:
186 | masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]]
187 |
188 | masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0]
189 | output_tokens[index] = masked_token
190 | masked_lms_pos.append(index)
191 |
192 | init_ids = tokenizer.convert_tokens_to_ids(tokens)
193 | input_ids = tokenizer.convert_tokens_to_ids(output_tokens)
194 |
195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
196 | # tokens are attended to.
197 | input_mask = [1] * len(input_ids)
198 |
199 | # Zero-pad up to the sequence length.
200 | while len(input_ids) < max_seq_length:
201 | init_ids.append(0)
202 | input_ids.append(0)
203 | input_mask.append(0)
204 | segment_ids.append(0)  # pad the label segment ids with 0
205 |
206 | assert len(init_ids) == max_seq_length
207 | assert len(input_ids) == max_seq_length
208 | assert len(input_mask) == max_seq_length
209 | assert len(segment_ids) == max_seq_length
210 |
211 | if ex_index < 2:
212 | logger.info("*** Example ***")
213 | logger.info("guid: %s" % (example.guid))
214 | logger.info("tokens: %s" % " ".join(
215 | [str(x) for x in tokens]))
216 | logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids]))
217 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
218 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
219 | logger.info(
220 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
221 | logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))
222 |
223 | features.append(
224 | InputFeatures(init_ids=init_ids,
225 | input_ids=input_ids,
226 | input_mask=input_mask,
227 | segment_ids=segment_ids,
228 | masked_lm_labels=masked_lm_labels))
229 | return features
230 |
231 | def rev_wordpiece(tokens):
232 |     """Join WordPiece tokens back into text, merging '##' pieces and dropping '[PAD]'."""
233 |     if len(tokens) > 1:
234 |         for i in range(len(tokens)-1, 0, -1):
235 |             if tokens[i] == '[PAD]':
236 |                 del tokens[i]
237 |             elif len(tokens[i]) > 1 and tokens[i][0] == '#' and tokens[i][1] == '#':
238 |                 tokens[i-1] += tokens[i][2:]
239 |                 del tokens[i]
240 |     return " ".join(tokens[1:-1])
241 |
242 | def main():
243 | parser = argparse.ArgumentParser()
244 |
245 | ## Required parameters
246 | parser.add_argument("--data_dir", default="datasets", type=str,
247 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
248 | parser.add_argument("--output_dir", default="aug_data", type=str,
249 | help="The output dir for augmented dataset")
250 | parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
251 | help="The path of pretrained bert model.")
252 | parser.add_argument("--task_name",default="subj",type=str,
253 | help="The name of the task to train.")
254 | parser.add_argument("--max_seq_length", default=64, type=int,
255 | help="The maximum total input sequence length after WordPiece tokenization. \n"
256 | "Sequences longer than this will be truncated, and sequences shorter \n"
257 | "than this will be padded.")
258 | parser.add_argument("--do_lower_case", default=False, action='store_true',
259 | help="Set this flag if you are using an uncased model.")
260 | parser.add_argument("--train_batch_size", default=32, type=int,
261 | help="Total batch size for training.")
262 | parser.add_argument("--learning_rate", default=5e-5, type=float,
263 | help="The initial learning rate for Adam.")
264 | parser.add_argument("--num_train_epochs", default=10.0, type=float,
265 | help="Total number of training epochs to perform.")
266 | parser.add_argument("--warmup_proportion", default=0.1, type=float,
267 | help="Proportion of training to perform linear learning rate warmup for. "
268 | "E.g., 0.1 = 10%% of training.")
269 | parser.add_argument('--seed', type=int, default=42,
270 | help="random seed for initialization")
271 |
272 | args = parser.parse_args()
273 | with open("global.config", 'rb') as f:
274 | configs_dict = json.load(f)
275 |
276 | args.task_name = configs_dict.get("dataset")
277 | print(args)
278 | run_aug(args, save_every_epoch=False)
279 |
280 |
281 | def run_aug(args, save_every_epoch=False):
282 | processors = {
283 | # you can add your own processor here
284 | "TREC": AugProcessor,
285 | "stsa.fine": AugProcessor,
286 | "stsa.binary": AugProcessor,
287 | "mpqa": AugProcessor,
288 | "rt-polarity": AugProcessor,
289 | "subj": AugProcessor,
290 | }
291 |
292 | task_name = args.task_name
293 | if task_name not in processors:
294 | raise ValueError("Task not found: %s" % (task_name))
295 | args.data_dir = os.path.join(args.data_dir, task_name)
296 | args.output_dir = os.path.join(args.output_dir, task_name)
297 |
298 | random.seed(args.seed)
299 | np.random.seed(args.seed)
300 | torch.manual_seed(args.seed)
301 | os.makedirs(args.output_dir, exist_ok=True)
302 | processor = processors[task_name]()
303 | label_list = processor.get_labels(task_name)
304 |
305 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
306 |
307 | train_examples = None
308 | num_train_steps = None
309 | train_examples = processor.get_train_examples(args.data_dir)
310 | #dev_examples = processor.get_dev_examples(args.data_dir)
311 | #train_examples.extend(dev_examples)
312 | num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
313 |
314 | # Prepare model
315 | model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
316 |
317 | if task_name == 'stsa.fine':
318 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
319 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
320 | elif task_name == 'TREC':
321 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768)
322 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
323 |
324 | model.cuda()
325 |
326 | # Prepare optimizer
327 | param_optimizer = list(model.named_parameters())
328 | no_decay = ['bias', 'gamma', 'beta']
329 | optimizer_grouped_parameters = [
330 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
331 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
332 | ]
333 | t_total = num_train_steps
334 | optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
335 | warmup=args.warmup_proportion, t_total=t_total)
336 |
337 | global_step = 0
338 | train_features = convert_examples_to_features(
339 | train_examples, label_list, args.max_seq_length, tokenizer)
340 | logger.info("***** Running training *****")
341 | logger.info(" Num examples = %d", len(train_examples))
342 | logger.info(" Batch size = %d", args.train_batch_size)
343 | logger.info(" Num steps = %d", num_train_steps)
344 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long)
345 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
346 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
347 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
348 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long)
349 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels)
350 | train_sampler = RandomSampler(train_data)
351 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
352 |
353 | model.train()
354 |
355 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
356 | if not os.path.exists(save_model_dir):
357 | os.mkdir(save_model_dir)
358 | for e in trange(int(args.num_train_epochs), desc="Epoch"):
359 | avg_loss = 0.
360 |
361 | for step, batch in enumerate(train_dataloader):
362 | batch = tuple(t.cuda() for t in batch)
363 | _, input_ids, input_mask, segment_ids, masked_ids = batch
364 | loss = model(input_ids, segment_ids, input_mask, masked_ids)
365 | loss.backward()
366 | avg_loss += loss.item()
367 | optimizer.step()
368 | model.zero_grad()
369 | if (step + 1) % 50 == 0:
370 | print("avg_loss: {}".format(avg_loss / 50))
371 | avg_loss = 0
372 | if save_every_epoch:
373 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
374 | save_model_path = os.path.join(save_model_dir, save_model_name)
375 | torch.save(model, save_model_path)
376 | else:
377 | if (e + 1) % 10 == 0:
378 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1)
379 | save_model_path = os.path.join(save_model_dir, save_model_name)
380 | torch.save(model, save_model_path)
381 |
382 |
383 | if __name__ == "__main__":
384 | main()
385 |
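386 | # How the label conditioning above works (a summary of this file's code, with
387 | # a toy example; the sentence is hypothetical): convert_examples_to_features
388 | # fills every real position of segment_ids with the example's label id instead
389 | # of the usual all-zero segment ids, e.g. for label "1" and max_seq_length 8:
390 | #
391 | #     tokens:       [CLS] a  great movie [SEP]
392 | #     segment_ids:   1    1  1     1     1    0 0 0   (zero-padded)
393 | #
394 | # BERT's token_type_embeddings then behaves as a label embedding. Binary tasks
395 | # reuse the pretrained 2-row token-type table, which is why run_aug resizes it
396 | # to 5 rows for stsa.fine and 6 for TREC before fine-tuning the masked LM.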
--------------------------------------------------------------------------------
/global.config:
--------------------------------------------------------------------------------
1 | {
2 | "dataset":"stsa.binary",
3 | "eval_model":"cnn"
4 | }
5 |
--------------------------------------------------------------------------------
/nets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Sample script of recurrent neural network language model.
3 |
4 | This code is ported from the following implementation written in Torch.
5 | https://github.com/tomsercu/lstm
6 |
7 | """
8 | from __future__ import division
9 | from __future__ import print_function
10 | import argparse
11 | import json
12 | import warnings
13 |
14 | import numpy as np
15 |
16 | import chainer
17 | from chainer import cuda
18 | import chainer.functions as F
19 | import chainer.links as L
20 | from chainer import training
21 | from chainer.training import extensions
22 | from chainer import reporter
23 |
24 | embed_init = chainer.initializers.Uniform(.25)
25 |
26 |
27 | def embed_seq_batch(embed, seq_batch, dropout=0., context=None):
28 | x_len = [len(seq) for seq in seq_batch]
29 | x_section = np.cumsum(x_len[:-1])
30 | ex = embed(F.concat(seq_batch, axis=0))
31 | ex = F.dropout(ex, dropout)
32 | if context is not None:
33 | ids = [embed.xp.full((l, ), i).astype('i')
34 | for i, l in enumerate(x_len)]
35 | ids = embed.xp.concatenate(ids, axis=0)
36 | cx = F.embed_id(ids, context)
37 | ex = F.concat([ex, cx], axis=1)
38 | exs = F.split_axis(ex, x_section, 0)
39 | return exs
40 |
41 |
42 | class NormalOutputLayer(L.Linear):
43 |
44 | def __init__(self, *args, **kwargs):
45 | super(NormalOutputLayer, self).__init__(*args, **kwargs)
46 |
47 | def output_and_loss(self, h, t, reduce='mean'):
48 | logit = self(h)
49 | return F.softmax_cross_entropy(
50 | logit, t, normalize=False, reduce=reduce)
51 |
52 | def output(self, h, t=None):
53 | return self(h)
54 |
55 |
56 | class MLP(chainer.Chain):
57 | def __init__(self, n_hidden, in_units, hidden_units, out_units, dropout=0.):
58 | super(MLP, self).__init__()
59 | with self.init_scope():
60 | self.l1 = L.Linear(in_units, hidden_units)
61 | self.lo = L.Linear(hidden_units, out_units)
62 | for i in range(2, n_hidden + 2):
63 | setattr(self, 'l{}'.format(i),
64 | L.Linear(hidden_units, hidden_units))
65 | self.n_hidden = n_hidden
66 | self.dropout = dropout
67 |
68 | def __call__(self, x, label=None):
69 | x = self.l1(x)
70 | for i in range(2, self.n_hidden + 2):
71 | x = F.relu(x)
72 | x = F.dropout(x, self.dropout)
73 | x = getattr(self, 'l{}'.format(i))(x)
74 | x = F.relu(x)
75 | x = F.dropout(x, self.dropout)
76 | x = self.lo(x)
77 | x = F.relu(x)
78 | if hasattr(self, 'l1_label') and label is not None:
79 | x += self.l1_label(label)
80 | return x
81 |
82 |
83 | class BiLanguageModel(chainer.Chain):
84 |
85 | def __init__(self, n_vocab, n_units, n_layers=2, dropout=0.5):
86 | super(BiLanguageModel, self).__init__()
87 | with self.init_scope():
88 | self.embed = L.EmbedID(n_vocab, n_units)
89 | RNN = L.NStepLSTM
90 | self.encoder_fw = RNN(n_layers, n_units, n_units, dropout)
91 | self.encoder_bw = RNN(n_layers, n_units, n_units, dropout)
92 | self.output = NormalOutputLayer(n_units, n_vocab)
93 | self.mlp = MLP(1, n_units * 2, n_units, n_units, dropout)
94 | self.dropout = dropout
95 | self.n_units = n_units
96 | self.n_layers = n_layers
97 |
98 | def add_label_condition_nets(self, n_labels, label_units):
99 | with self.init_scope():
100 | self.mlp.add_link(
101 | 'l1_label',
102 | L.Linear(None, self.mlp.l1.b.size, nobias=True,
103 | initialW=chainer.initializers.Uniform(0.4)))
104 | self.n_labels = n_labels
105 |
106 | def encode(self, seq_batch, labels=None):
107 | seq_batch_wo_2bos = [seq[2::] for seq in seq_batch]
108 | revseq_batch_wo_2bos = [seq[::-1] for seq in seq_batch_wo_2bos]
109 | seq_batch_wo_2eos = [seq[:-2] for seq in seq_batch]
110 | bwe_seq_batch = self.embed_seq_batch(revseq_batch_wo_2bos)
111 | fwe_seq_batch = self.embed_seq_batch(seq_batch_wo_2eos)
112 | bwt_out_batch = self.encode_seq_batch(
113 | bwe_seq_batch, self.encoder_bw)[-1]
114 | fwt_out_batch = self.encode_seq_batch(
115 | fwe_seq_batch, self.encoder_fw)[-1]
116 | revbwt_concat = F.concat(
117 | [b[::-1] for b in bwt_out_batch], axis=0)
118 | fwt_concat = F.concat(fwt_out_batch, axis=0)
119 | t_out_concat = F.concat([fwt_concat, revbwt_concat], axis=1)
120 | t_out_concat = F.dropout(t_out_concat, self.dropout)
121 | if hasattr(self.mlp, 'l1_label') and labels is not None:
122 | labels = [[labels[i]] * f.shape[0]
123 | for i, f in enumerate(fwt_out_batch)]
124 | labels = self.xp.concatenate(sum(labels, []), axis=0)
125 | label_concat = self.xp.zeros(
126 | (t_out_concat.shape[0], self.n_labels)).astype('f')
127 | label_concat[self.xp.arange(len(labels)), labels] = 1.
128 | t_out_concat = self.mlp(t_out_concat, label_concat)
129 | else:
130 | t_out_concat = self.mlp(t_out_concat)
131 | return t_out_concat
132 |
133 | def embed_seq_batch(self, x_seq_batch, context=None):
134 | e_seq_batch = embed_seq_batch(
135 | self.embed, x_seq_batch,
136 | dropout=self.dropout,
137 | context=context)
138 | return e_seq_batch
139 |
140 | def encode_seq_batch(self, e_seq_batch, encoder):
141 | hs, cs, y_seq_batch = encoder(None, None, e_seq_batch)
142 | return hs, cs, y_seq_batch
143 |
144 | def calculate_loss(self, input_chain, **args):
145 | seq_batch = sum(input_chain, [])
146 | t_out_concat = self.encode(seq_batch)
147 | seq_batch_mid = [seq[1:-1] for seq in seq_batch]
148 | seq_mid_concat = F.concat(seq_batch_mid, axis=0)
149 | n_tok = sum(len(s) for s in seq_batch_mid)
150 | loss = self.output_and_loss_from_concat(
151 | t_out_concat, seq_mid_concat,
152 | normalize=n_tok)
153 | reporter.report({'perp': self.xp.exp(loss.data)}, self)
154 | return loss
155 |
156 | def output_and_loss_from_concat(self, y, t, normalize=None):
157 | y = F.dropout(y, ratio=self.dropout)
158 | loss = self.output.output_and_loss(y, t)
159 | if normalize is not None:
160 | loss *= 1. * t.shape[0] / normalize
161 | else:
162 | loss *= t.shape[0]
163 | return loss
164 |
165 | def calculate_loss_with_labels(self, seq_batch_with_labels):
166 | seq_batch, labels = seq_batch_with_labels
167 | t_out_concat = self.encode(seq_batch, labels=labels)
168 | seq_batch_mid = [seq[1:-1] for seq in seq_batch]
169 | seq_mid_concat = F.concat(seq_batch_mid, axis=0)
170 | n_tok = sum(len(s) for s in seq_batch_mid)
171 | loss = self.output_and_loss_from_concat(
172 | t_out_concat, seq_mid_concat,
173 | normalize=n_tok)
174 | reporter.report({'perp': self.xp.exp(loss.data)}, self)
175 | return loss
176 |
177 | def predict(self, xs, labels=None):
178 | with chainer.using_config('train', False), chainer.no_backprop_mode():
179 | t_out_concat = self.encode(xs, labels=labels)
180 | prob_concat = F.softmax(self.output.output(t_out_concat)).data
181 | x_len = [len(x) for x in xs]
182 | x_section = np.cumsum(x_len[:-1])
183 | ps = np.split(cuda.to_cpu(prob_concat), x_section, 0)
184 | return ps
185 |
186 | def predict_embed(self,
187 | xs, embedW,
188 | labels=None,
189 | dropout=0.,
190 | mode='sampling',
191 | temp=1.,
192 | word_lower_bound=0.,
193 | gold_lower_bound=0.,
194 | gumbel=True,
195 | residual=0.,
196 | wordwise=True,
197 | add_original=0.,
198 | augment_ratio=0.25):
199 | x_len = [len(x) for x in xs]
200 | with chainer.using_config('train', False), chainer.no_backprop_mode():
201 | t_out_concat = self.encode(xs, labels=labels)
202 | prob_concat = self.output.output(t_out_concat).data
203 |             prob_concat /= temp  # temperature; with the Gumbel noise below, argmax samples from the softmax
204 | prob_concat += self.xp.random.gumbel(
205 | size=prob_concat.shape).astype('f')
206 | prob_concat = F.softmax(prob_concat).data
207 |
208 | out_concat = F.embed_id(
209 | self.xp.argmax(prob_concat, axis=1).astype(np.int32), embedW)
210 |
211 | # insert eos
212 | eos = embedW[0][None]
213 | new_out = []
214 | count = 0
215 |         for x in xs:
216 |             new_out.append(eos)
217 |             new_out.append(out_concat[count:count + len(x) - 2])  # len(x) - 2 predicted tokens per sequence
218 |             new_out.append(eos)
219 |             count += len(x) - 2
220 | out_concat = F.concat(new_out, axis=0)
221 |
222 | def embed_func(x): return F.embed_id(x, embedW, ignore_label=-1)
223 | raw_concat = F.concat(
224 | sequence_embed(embed_func, xs, self.dropout), axis=0)
225 | b, u = raw_concat.shape
226 |
227 | mask = self.xp.broadcast_to(
228 | (self.xp.random.rand(b, 1) < augment_ratio),
229 | raw_concat.shape)
230 |             out_concat = F.where(mask, out_concat, raw_concat)  # per token: predicted with prob. augment_ratio, else original
231 |
232 | x_len = [len(x) for x in xs]
233 | x_section = np.cumsum(x_len[:-1])
234 | out_concat = F.dropout(out_concat, dropout)
235 | exs = F.split_axis(out_concat, x_section, 0)
236 | return exs
237 |
238 |
239 | def sequence_embed(embed, xs, dropout=0.):
240 | """Efficient embedding function for variable-length sequences
241 |
242 |     This output is equivalent to
243 |     "return [F.dropout(embed(x), ratio=dropout) for x in xs]".
244 |     However, the embedding and dropout are applied in one shot, which is faster.
245 |
246 | Args:
247 | embed (callable): A :func:`~chainer.functions.embed_id` function
248 | or :class:`~chainer.links.EmbedID` link.
249 | xs (list of :class:`~chainer.Variable` or :class:`numpy.ndarray` or \
250 | :class:`cupy.ndarray`): i-th element in the list is an input variable,
251 | which is a :math:`(L_i, )`-shaped int array.
252 | dropout (float): Dropout ratio.
253 |
254 | Returns:
255 | list of ~chainer.Variable: Output variables. i-th element in the
256 | list is an output variable, which is a :math:`(L_i, N)`-shaped
257 | float array. :math:`(N)` is the number of dimensions of word embedding.
258 |
259 | """
260 | x_len = [len(x) for x in xs]
261 | x_section = np.cumsum(x_len[:-1])
262 | ex = embed(F.concat(xs, axis=0))
263 | ex = F.dropout(ex, ratio=dropout)
264 | exs = F.split_axis(ex, x_section, 0)
265 | return exs
266 |
267 |
268 | def block_embed(embed, x, dropout=0.):
269 | """Embedding function followed by convolution
270 |
271 | Args:
272 | embed (callable): A :func:`~chainer.functions.embed_id` function
273 | or :class:`~chainer.links.EmbedID` link.
274 | x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
275 | :class:`cupy.ndarray`): Input variable, which
276 | is a :math:`(B, L)`-shaped int array. Its first dimension
277 | :math:`(B)` is assumed to be the *minibatch dimension*.
278 | The second dimension :math:`(L)` is the length of padded
279 | sentences.
280 | dropout (float): Dropout ratio.
281 |
282 | Returns:
283 | ~chainer.Variable: Output variable. A float array with shape
284 | of :math:`(B, N, L, 1)`. :math:`(N)` is the number of dimensions
285 | of word embedding.
286 |
287 | """
288 | e = embed(x)
289 | e = F.dropout(e, ratio=dropout)
290 | e = F.transpose(e, (0, 2, 1))
291 | e = e[:, :, :, None]
292 | return e
293 |
294 |
295 | class PredictiveEmbed(chainer.Chain):
296 | def __init__(self, n_vocab, n_units, bilm,
297 | dropout=0., initialW=embed_init):
298 | super(PredictiveEmbed, self).__init__()
299 | with self.init_scope():
300 | self.embed = L.EmbedID(n_vocab, n_units, ignore_label=-1,
301 | initialW=initialW)
302 | self.bilm = bilm
303 | self.n_vocab = n_vocab
304 | self.n_units = n_units
305 | self.dropout = dropout
306 |
307 | def __call__(self, x):
308 | return self.embed(x)
309 |
310 | def setup(self,
311 | mode='weighted_sum',
312 | temp=1.,
313 | word_lower_bound=0.,
314 | gold_lower_bound=0.,
315 | gumbel=True,
316 | residual=0.,
317 | wordwise=True,
318 | add_original=1.,
319 | augment_ratio=0.5,
320 | ignore_unk=-1):
321 | self.config = {
322 | 'dropout': self.dropout,
323 | 'mode': mode,
324 | 'temp': temp,
325 | 'word_lower_bound': 0.,
326 | 'gold_lower_bound': 0.,
327 | 'gumbel': gumbel,
328 | 'residual': residual,
329 | 'wordwise': wordwise,
330 | 'add_original': add_original,
331 | 'augment_ratio': augment_ratio
332 | }
333 | if ignore_unk >= 0:
334 | self.bilm.output.b.data[ignore_unk] = -1e5
335 |
336 | def embed_xs(self, xs, batch='concat'):
337 | if batch == 'concat':
338 | x_block = chainer.dataset.convert.concat_examples(xs, padding=-1)
339 | ex_block = block_embed(self.embed, x_block, self.dropout)
340 | return ex_block
341 | elif batch == 'list':
342 | exs = sequence_embed(self.embed, xs, self.dropout)
343 | return exs
344 | else:
345 | raise NotImplementedError
346 |
347 | def embed_xs_with_prediction(self, xs, labels=None, batch='concat'):
348 | predicted_exs = self.bilm.predict_embed(
349 | xs, self.embed.W,
350 | labels=labels,
351 | dropout=self.config['dropout'],
352 | mode=self.config['mode'],
353 | temp=self.config['temp'],
354 | word_lower_bound=self.config['word_lower_bound'],
355 | gold_lower_bound=self.config['gold_lower_bound'],
356 | gumbel=self.config['gumbel'],
357 | residual=self.config['residual'],
358 | wordwise=self.config['wordwise'],
359 | add_original=self.config['add_original'],
360 | augment_ratio=self.config['augment_ratio'])
361 | if batch == 'concat':
362 | predicted_ex_block = F.pad_sequence(predicted_exs, padding=0.)
363 | predicted_ex_block = F.transpose(
364 | predicted_ex_block, (0, 2, 1))[:, :, :, None]
365 | return predicted_ex_block
366 | elif batch == 'list':
367 | return predicted_exs
368 | else:
369 | raise NotImplementedError
370 |
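
For orientation, here is a minimal sketch of how these pieces are meant to be combined (mirroring `train_text_classifier.py` later in this dump): a pretrained conditional BiLM is wrapped in `PredictiveEmbed`, which then stands in for a classifier's plain embedding layer and replaces roughly `augment_ratio` of the input tokens with words sampled from the BiLM. The checkpoint path, vocabulary, and hyperparameter values below are hypothetical.

```python
import chainer
from nets import BiLanguageModel, PredictiveEmbed  # this file

vocab = {'<eos>': 0, '<unk>': 1}           # toy vocab; a real one comes from the dataset
n_units, n_layers, dropout = 300, 1, 0.5   # hypothetical hyperparameters

# Load a BiLM trained elsewhere (checkpoint name is hypothetical).
bilm = BiLanguageModel(len(vocab), n_units, n_layers, dropout)
chainer.serializers.load_npz('bilm_best.npz', bilm)

# Wrap it so an encoder can call embed_xs_with_prediction(...) during training.
embed = PredictiveEmbed(len(vocab), n_units, bilm, dropout)
embed.setup(temp=1., augment_ratio=0.25, ignore_unk=vocab['<unk>'])
```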
--------------------------------------------------------------------------------
/text_classification/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__init__.py
--------------------------------------------------------------------------------
/text_classification/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/text_classification/__pycache__/nets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/nets.cpython-36.pyc
--------------------------------------------------------------------------------
/text_classification/__pycache__/nlp_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/nlp_utils.cpython-36.pyc
--------------------------------------------------------------------------------
/text_classification/__pycache__/text_datasets.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/text_datasets.cpython-36.pyc
--------------------------------------------------------------------------------
/text_classification/nets.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | import chainer
4 | import chainer.functions as F
5 | import chainer.links as L
6 | from chainer import reporter
7 |
8 | embed_init = chainer.initializers.Uniform(.25)
9 |
10 |
11 | def sequence_embed(embed, xs, dropout=0.):
12 | """Efficient embedding function for variable-length sequences
13 |
14 |     This output is equivalent to
15 |     "return [F.dropout(embed(x), ratio=dropout) for x in xs]".
16 |     However, the embedding and dropout are applied in one shot, which is faster.
17 |
18 | Args:
19 | embed (callable): A :func:`~chainer.functions.embed_id` function
20 | or :class:`~chainer.links.EmbedID` link.
21 | xs (list of :class:`~chainer.Variable` or :class:`numpy.ndarray` or \
22 | :class:`cupy.ndarray`): i-th element in the list is an input variable,
23 | which is a :math:`(L_i, )`-shaped int array.
24 | dropout (float): Dropout ratio.
25 |
26 | Returns:
27 | list of ~chainer.Variable: Output variables. i-th element in the
28 | list is an output variable, which is a :math:`(L_i, N)`-shaped
29 | float array. :math:`(N)` is the number of dimensions of word embedding.
30 |
31 | """
32 | x_len = [len(x) for x in xs]
33 | x_section = numpy.cumsum(x_len[:-1])
34 | ex = embed(F.concat(xs, axis=0))
35 | ex = F.dropout(ex, ratio=dropout)
36 | exs = F.split_axis(ex, x_section, 0)
37 | return exs
38 |
39 |
40 | def block_embed(embed, x, dropout=0.):
41 | """Embedding function followed by convolution
42 |
43 | Args:
44 | embed (callable): A :func:`~chainer.functions.embed_id` function
45 | or :class:`~chainer.links.EmbedID` link.
46 | x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
47 | :class:`cupy.ndarray`): Input variable, which
48 | is a :math:`(B, L)`-shaped int array. Its first dimension
49 | :math:`(B)` is assumed to be the *minibatch dimension*.
50 | The second dimension :math:`(L)` is the length of padded
51 | sentences.
52 | dropout (float): Dropout ratio.
53 |
54 | Returns:
55 | ~chainer.Variable: Output variable. A float array with shape
56 | of :math:`(B, N, L, 1)`. :math:`(N)` is the number of dimensions
57 | of word embedding.
58 |
59 | """
60 | e = embed(x)
61 | e = F.dropout(e, ratio=dropout)
62 | e = F.transpose(e, (0, 2, 1))
63 | e = e[:, :, :, None]
64 | return e
65 |
66 |
67 | class TextClassifier(chainer.Chain):
68 |
69 | """A classifier using a given encoder.
70 |
71 | This chain encodes a sentence and classifies it into classes.
72 |
73 | Args:
74 | encoder (Link): A callable encoder, which extracts a feature.
75 | Input is a list of variables whose shapes are
76 | "(sentence_length, )".
77 | Output is a variable whose shape is "(batchsize, n_units)".
78 | n_class (int): The number of classes to be predicted.
79 |
80 | """
81 |
82 | def __init__(self, encoder, n_class, dropout=0.1):
83 | super(TextClassifier, self).__init__()
84 | with self.init_scope():
85 | self.encoder = encoder
86 | self.output = L.Linear(encoder.out_units, n_class)
87 | self.dropout = dropout
88 |
89 | def __call__(self, xs, ys=None):
90 | if ys is None:
91 | xs, ys = xs
92 | concat_outputs = self.predict(xs, ys=ys)
93 | concat_truths = F.concat(ys, axis=0)
94 |
95 | loss = F.softmax_cross_entropy(concat_outputs, concat_truths)
96 | accuracy = F.accuracy(concat_outputs, concat_truths)
97 | reporter.report({'loss': loss.data}, self)
98 | reporter.report({'accuracy': accuracy.data}, self)
99 | return loss
100 |
101 | def predict(self, xs, ys=None, softmax=False, argmax=False):
102 | concat_encodings = F.dropout(self.encoder(xs, labels=ys),
103 | ratio=self.dropout)
104 | concat_outputs = self.output(concat_encodings)
105 | if softmax:
106 | return F.softmax(concat_outputs).data
107 | elif argmax:
108 | return self.xp.argmax(concat_outputs.data, axis=1)
109 | else:
110 | return concat_outputs
111 |
112 |
113 | class RNNEncoder(chainer.Chain):
114 |
115 | """A LSTM-RNN Encoder with Word Embedding.
116 |
117 | This model encodes a sentence sequentially using LSTM.
118 |
119 | Args:
120 | n_layers (int): The number of LSTM layers.
121 | n_vocab (int): The size of vocabulary.
122 | n_units (int): The number of units of a LSTM layer and word embedding.
123 | dropout (float): The dropout ratio.
124 |
125 | """
126 |
127 | def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
128 | super(RNNEncoder, self).__init__(
129 | embed=L.EmbedID(n_vocab, n_units,
130 | initialW=embed_init),
131 | encoder=L.NStepLSTM(n_layers, n_units, n_units, dropout),
132 | )
133 | self.n_layers = n_layers
134 | self.out_units = n_units
135 | self.dropout = dropout
136 | self.use_predict_embed = False
137 |
138 | def __call__(self, xs, labels=None):
139 |         if self.use_predict_embed and chainer.config.train:
140 |             exs = self.embed.embed_xs_with_prediction(xs, labels=labels, batch='list')
141 |         else:
142 |             exs = sequence_embed(self.embed, xs, self.dropout)
143 | last_h, last_c, ys = self.encoder(None, None, exs)
144 | assert(last_h.shape == (self.n_layers, len(xs), self.out_units))
145 | concat_outputs = last_h[-1]
146 | return concat_outputs
147 |
148 |
149 | class CNNEncoder(chainer.Chain):
150 |
151 | """A CNN encoder with word embedding.
152 |
153 | This model encodes a sentence as a set of n-gram chunks
154 | using convolutional filters.
155 | Following the convolution, max-pooling is applied over time.
156 | Finally, the output is fed into a multilayer perceptron.
157 |
158 | Args:
159 | n_layers (int): The number of layers of MLP.
160 | n_vocab (int): The size of vocabulary.
161 | n_units (int): The number of units of MLP and word embedding.
162 | dropout (float): The dropout ratio.
163 |
164 | """
165 |
166 | def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
167 | out_units = n_units // 3
168 | super(CNNEncoder, self).__init__(
169 | embed=L.EmbedID(n_vocab, n_units, ignore_label=-1,
170 | initialW=embed_init),
171 | cnn_w3=L.Convolution2D(
172 | n_units, out_units, ksize=(3, 1), stride=1, pad=(2, 0),
173 | nobias=True),
174 | cnn_w4=L.Convolution2D(
175 | n_units, out_units, ksize=(4, 1), stride=1, pad=(3, 0),
176 | nobias=True),
177 | cnn_w5=L.Convolution2D(
178 | n_units, out_units, ksize=(5, 1), stride=1, pad=(4, 0),
179 | nobias=True),
180 | mlp=MLP(n_layers, out_units * 3, dropout)
181 | )
182 | self.out_units = out_units * 3
183 | self.dropout = dropout
184 | self.use_predict_embed = False
185 |
186 | def __call__(self, xs, labels=None):
187 |         if self.use_predict_embed and chainer.config.train:
188 |             ex_block = self.embed.embed_xs_with_prediction(xs, labels=labels, batch='concat')
189 |         else:
190 |             x_block = chainer.dataset.convert.concat_examples(xs, padding=-1)
191 |             ex_block = block_embed(self.embed, x_block, self.dropout)
192 | h_w3 = F.max(self.cnn_w3(ex_block), axis=2)
193 | h_w4 = F.max(self.cnn_w4(ex_block), axis=2)
194 | h_w5 = F.max(self.cnn_w5(ex_block), axis=2)
195 | h = F.concat([h_w3, h_w4, h_w5], axis=1)
196 | h = F.relu(h)
197 | h = F.dropout(h, ratio=self.dropout)
198 | h = self.mlp(h)
199 | return h
200 |
201 |
202 | class MLP(chainer.ChainList):
203 |
204 | """A multilayer perceptron.
205 |
206 | Args:
207 | n_vocab (int): The size of vocabulary.
208 | n_units (int): The number of units in a hidden or output layer.
209 | dropout (float): The dropout ratio.
210 |
211 | """
212 |
213 | def __init__(self, n_layers, n_units, dropout=0.1):
214 | super(MLP, self).__init__()
215 | for i in range(n_layers):
216 | self.add_link(L.Linear(None, n_units))
217 | self.dropout = dropout
218 | self.out_units = n_units
219 |
220 | def __call__(self, x):
221 | for i, link in enumerate(self.children()):
222 | x = F.dropout(x, ratio=self.dropout)
223 | x = F.relu(link(x))
224 | return x
225 |
226 |
227 | class BOWEncoder(chainer.Chain):
228 |
229 | """A BoW encoder with word embedding.
230 |
231 | This model encodes a sentence as just a set of words by averaging.
232 |
233 | Args:
234 | n_vocab (int): The size of vocabulary.
235 | n_units (int): The number of units of word embedding.
236 | dropout (float): The dropout ratio.
237 |
238 | """
239 |
240 | def __init__(self, n_vocab, n_units, dropout=0.1):
241 | super(BOWEncoder, self).__init__(
242 | embed=L.EmbedID(n_vocab, n_units, ignore_label=-1,
243 | initialW=embed_init),
244 | )
245 | self.out_units = n_units
246 | self.dropout = dropout
247 |
248 | def __call__(self, xs):
249 | x_block = chainer.dataset.convert.concat_examples(xs, padding=-1)
250 | ex_block = block_embed(self.embed, x_block)
251 | x_len = self.xp.array([len(x) for x in xs], 'i')[:, None, None]
252 | h = F.sum(ex_block, axis=2) / x_len
253 | return h
254 |
255 |
256 | class BOWMLPEncoder(chainer.Chain):
257 |
258 | """A BOW encoder with word embedding and MLP.
259 |
260 | This model encodes a sentence as just a set of words by averaging.
261 | Additionally, its output is fed into a multilayer perceptron.
262 |
263 | Args:
264 | n_layers (int): The number of layers of MLP.
265 | n_vocab (int): The size of vocabulary.
266 | n_units (int): The number of units of MLP and word embedding.
267 | dropout (float): The dropout ratio.
268 |
269 | """
270 |
271 | def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
272 | super(BOWMLPEncoder, self).__init__(
273 | bow_encoder=BOWEncoder(n_vocab, n_units, dropout),
274 | mlp_encoder=MLP(n_layers, n_units, dropout)
275 | )
276 | self.out_units = n_units
277 |
278 | def __call__(self, xs):
279 | h = self.bow_encoder(xs)
280 | h = self.mlp_encoder(h)
281 | return h
282 |
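
A quick smoke test for these encoders (a sketch; the word ids and sizes are toy values):

```python
import numpy as np
from text_classification.nets import RNNEncoder, TextClassifier

# Two toy "sentences" of word ids, each wrapped with <eos> (id 0) at both ends.
xs = [np.array([0, 5, 7, 3, 0], 'i'), np.array([0, 9, 2, 0], 'i')]
ys = [np.array([1], 'i'), np.array([0], 'i')]

encoder = RNNEncoder(n_layers=1, n_vocab=10, n_units=8, dropout=0.1)
model = TextClassifier(encoder, n_class=2)

loss = model(xs, ys)                     # reports loss/accuracy, returns loss
probs = model.predict(xs, softmax=True)  # (batchsize, n_class) probabilities
print(float(loss.data), probs.shape)
```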
--------------------------------------------------------------------------------
/text_classification/nlp_utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import io
3 |
4 | import numpy
5 |
6 | import chainer
7 | from chainer import cuda
8 |
9 |
10 | def split_text(text, char_based=False):
11 | if char_based:
12 | return list(text)
13 | else:
14 | return text.split()
15 |
16 |
17 | def normalize_text(text):
18 | return text.strip().lower()
19 |
20 |
21 | def make_vocab(dataset, max_vocab_size=50000, min_freq=1):
22 | counts = collections.defaultdict(int)
23 | for tokens, _ in dataset:
24 | for token in tokens:
25 | counts[token] += 1
26 |
27 |     vocab = {'<eos>': 0, '<unk>': 1}
28 | for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
29 | if len(vocab) >= max_vocab_size or c < min_freq:
30 | break
31 | vocab[w] = len(vocab)
32 | return vocab
33 |
34 |
35 | def read_vocab_list(path, max_vocab_size=20000):
36 |     vocab = {'<eos>': 0, '<unk>': 1}
37 | with io.open(path, encoding='utf-8', errors='ignore') as f:
38 | for l in f:
39 | w = l.strip()
40 | if w not in vocab and w:
41 | vocab[w] = len(vocab)
42 | if len(vocab) >= max_vocab_size:
43 | break
44 | return vocab
45 |
46 |
47 | def make_array(tokens, vocab, add_eos=True, add_bos=True):
48 |     unk_id = vocab['<unk>']
49 |     eos_id = vocab['<eos>']
50 | ids = [vocab.get(token, unk_id) for token in tokens]
51 | if add_eos:
52 | ids.append(eos_id)
53 | if add_bos:
54 | ids = [eos_id] + ids
55 | return numpy.array(ids, 'i')
56 |
57 |
58 | def transform_to_array(dataset, vocab, with_label=True):
59 | if with_label:
60 | return [(make_array(tokens, vocab), numpy.array([cls], 'i'))
61 | for tokens, cls in dataset]
62 | else:
63 | return [make_array(tokens, vocab)
64 | for tokens in dataset]
65 |
66 |
67 | def convert_seq(batch, device=None, with_label=True):
68 | def to_device_batch(batch):
69 | if device is None:
70 | return batch
71 | elif device < 0:
72 | return [chainer.dataset.to_device(device, x) for x in batch]
73 | else:
74 | xp = cuda.cupy.get_array_module(*batch)
75 | concat = xp.concatenate(batch, axis=0)
76 | sections = numpy.cumsum([len(x) for x in batch[:-1]], dtype='i')
77 | concat_dev = chainer.dataset.to_device(device, concat)
78 | batch_dev = cuda.cupy.split(concat_dev, sections)
79 | return batch_dev
80 |
81 | if with_label:
82 | return [to_device_batch([x for x, _ in batch]),
83 | to_device_batch([y for _, y in batch])]
84 | # return {'xs': to_device_batch([x for x, _ in batch]),
85 | # 'ys': to_device_batch([y for _, y in batch])}
86 | else:
87 | return to_device_batch([x for x in batch])
88 |
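
A small round trip through these helpers, as a sketch with toy data:

```python
from text_classification.nlp_utils import (
    convert_seq, make_vocab, transform_to_array)

dataset = [(['a', 'good', 'movie'], 1), (['a', 'bad', 'movie'], 0)]
vocab = make_vocab(dataset)                  # word -> id, with '<eos>'=0, '<unk>'=1
arrays = transform_to_array(dataset, vocab)  # [(id array with bos/eos, label array), ...]
xs, ys = convert_seq(arrays, device=None)    # lists ready for TextClassifier
print(len(vocab), arrays[0][0], ys[0])
```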
--------------------------------------------------------------------------------
/text_classification/text_datasets.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import glob
3 | import io
4 | import os
5 | import shutil
6 | import tarfile
7 | import tempfile
8 |
9 | import numpy
10 |
11 | import chainer
12 |
13 | from .nlp_utils import make_vocab
14 | from .nlp_utils import normalize_text
15 | from .nlp_utils import split_text
16 | from .nlp_utils import transform_to_array
17 | import json
18 | URL_DBPEDIA = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz' # NOQA
19 | URL_IMDB = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
20 | URL_OTHER_BASE = 'https://raw.githubusercontent.com/harvardnlp/sent-conv-torch/master/data/' # NOQA
21 |
22 |
23 | def download_dbpedia():
24 | path = chainer.dataset.cached_download(URL_DBPEDIA)
25 | tf = tarfile.open(path, 'r')
26 | return tf
27 |
28 |
29 | def read_dbpedia(tf, split, shrink=1, char_based=False):
30 | dataset = []
31 |     f = io.TextIOWrapper(tf.extractfile('dbpedia_csv/{}.csv'.format(split)), encoding='utf-8')  # csv needs text, not bytes
32 | for i, (label, title, text) in enumerate(csv.reader(f)):
33 | if i % shrink != 0:
34 | continue
35 | label = int(label) - 1 # Index begins from 1
36 | tokens = split_text(normalize_text(text), char_based)
37 | dataset.append((tokens, label))
38 | return dataset
39 |
40 |
41 | def get_dbpedia(vocab=None, shrink=1, char_based=False):
42 | tf = download_dbpedia()
43 |
44 | print('read dbpedia')
45 | train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
46 | test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)
47 |
48 | if vocab is None:
49 |         print('construct vocabulary based on frequency')
50 | vocab = make_vocab(train)
51 |
52 | train = transform_to_array(train, vocab)
53 | test = transform_to_array(test, vocab)
54 |
55 | return train, test, vocab
56 |
57 |
58 | def download_imdb():
59 | path = chainer.dataset.cached_download(URL_IMDB)
60 | tf = tarfile.open(path, 'r')
61 | # To read many files fast, tarfile is untared
62 | path = tempfile.mkdtemp()
63 | tf.extractall(path)
64 | return path
65 |
66 |
67 | def read_imdb(path, split,
68 | shrink=1, fine_grained=False, char_based=False):
69 | fg_label_dict = {'1': 0, '2': 0, '3': 1, '4': 1,
70 | '7': 2, '8': 2, '9': 3, '10': 3}
71 |
72 | def read_and_label(posneg, label):
73 | dataset = []
74 | target = os.path.join(path, 'aclImdb', split, posneg, '*')
75 | for i, f_path in enumerate(glob.glob(target)):
76 | if i % shrink != 0:
77 | continue
78 | with io.open(f_path, encoding='utf-8', errors='ignore') as f:
79 | text = f.read().strip()
80 | tokens = split_text(normalize_text(text), char_based)
81 | if fine_grained:
82 | # extract from f_path. e.g. /pos/200_8.txt -> 8
83 | label = fg_label_dict[f_path.split('_')[-1][:-4]]
84 | dataset.append((tokens, label))
85 | else:
86 | dataset.append((tokens, label))
87 | return dataset
88 |
89 | pos_dataset = read_and_label('pos', 0)
90 | neg_dataset = read_and_label('neg', 1)
91 | return pos_dataset + neg_dataset
92 |
93 |
94 | def get_imdb(vocab=None, shrink=1, fine_grained=False,
95 | char_based=False):
96 | tmp_path = download_imdb()
97 |
98 | print('read imdb')
99 | train = read_imdb(tmp_path, 'train',
100 | shrink=shrink, fine_grained=fine_grained,
101 | char_based=char_based)
102 | test = read_imdb(tmp_path, 'test',
103 | shrink=shrink, fine_grained=fine_grained,
104 | char_based=char_based)
105 |
106 | shutil.rmtree(tmp_path)
107 |
108 | if vocab is None:
109 |         print('construct vocabulary based on frequency')
110 | vocab = make_vocab(train)
111 |
112 | train = transform_to_array(train, vocab)
113 | test = transform_to_array(test, vocab)
114 |
115 | return train, test, vocab
116 |
117 |
118 | def download_other_dataset(name):
119 | if name in ['custrev', 'mpqa', 'rt-polarity', 'subj']:
120 | files = [name + '.all']
121 | elif name == 'TREC':
122 | files = [name + suff for suff in ['.train.all', '.test.all']]
123 | else:
124 | files = [name + suff for suff in ['.train', '.test']]
125 | file_paths = []
126 | for f_name in files:
127 | url = os.path.join(URL_OTHER_BASE, f_name)
128 | path = chainer.dataset.cached_download(url)
129 | file_paths.append(path)
130 | return file_paths
131 |
132 |
133 | def read_other_dataset(path, shrink=1, char_based=False):
134 | dataset = []
135 | with io.open(path, encoding='utf-8', errors='ignore') as f:
136 | for i, l in enumerate(f):
137 |             if i % shrink != 0 or len(l.strip()) < 3:
138 | continue
139 | label, text = l.strip().split(None, 1)
140 | label = int(label)
141 | tokens = split_text(normalize_text(text), char_based)
142 | dataset.append((tokens, label))
143 | return dataset
144 |
145 |
146 | def _read_tsv(input_file, quotechar=None):
147 | """Reads a tab separated value file."""
148 | with open(input_file, "r") as f:
149 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
150 | lines = []
151 | for line in reader:
152 | lines.append((split_text(normalize_text(line[0])), line[1]))
153 |     return lines[1:]  # drop the header row
154 |
155 | def read_text_dataset(name, vocab=None, dir="datasets"):
156 | assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
157 | 'custrev', 'mpqa', 'rt-polarity', 'subj'])
158 |     train_path = os.path.join(dir, name, "train.tsv")  # train may come from aug_data
159 |     eval_path = os.path.join("datasets", name, "dev.tsv")  # dev/test are always read
160 |     test_path = os.path.join("datasets", name, "test.tsv")  # from the original datasets
161 |
162 | train = _read_tsv(train_path)
163 | eval = _read_tsv(eval_path)
164 | test = _read_tsv(test_path)
165 |
166 | if vocab is None:
167 |         print('construct vocabulary based on frequency')
168 | all_data = []
169 | all_data.extend(train)
170 | all_data.extend(eval)
171 | vocab = make_vocab(all_data)
172 |
173 | train = transform_to_array(train, vocab)
174 | eval = transform_to_array(eval, vocab)
175 | test = transform_to_array(test, vocab)
176 |
177 | return train, eval, test, vocab
178 |
179 | def get_other_text_dataset(name, vocab=None, shrink=1,
180 | char_based=False, seed=777):
181 | assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
182 | 'custrev', 'mpqa', 'rt-polarity', 'subj'])
183 | datasets = download_other_dataset(name)
184 | train = read_other_dataset(
185 | datasets[0], shrink=shrink, char_based=char_based)
186 | if len(datasets) == 2:
187 | test = read_other_dataset(
188 | datasets[1], shrink=shrink, char_based=char_based)
189 | else:
190 | numpy.random.seed(seed)
191 | alldata = numpy.random.permutation(train)
192 | train = alldata[:-len(alldata) // 10]
193 | test = alldata[-len(alldata) // 10:]
194 |
195 | if vocab is None:
196 |         print('construct vocabulary based on frequency')
197 | vocab = make_vocab(train)
198 |
199 | train = transform_to_array(train, vocab)
200 | test = transform_to_array(test, vocab)
201 |
202 | return train, test, vocab
203 |
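
`read_text_dataset` expects `train.tsv`, `dev.tsv`, and `test.tsv` under a per-dataset directory, and `_read_tsv` assumes a header row (discarded by `lines[1:]`), the sentence in the first column, and the label in the second, tab-separated. A toy `train.tsv` it would accept (the header text and sentences are made up):

```
sentence	label
a good movie	1
a bad movie	0
```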
--------------------------------------------------------------------------------
/train_text_classifier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import print_function
4 |
5 | import argparse
6 | import datetime
7 | import json
8 | import os
9 | import numpy
10 |
11 | import chainer
12 | from chainer import training
13 | from chainer.training import extensions
14 | import cupy
15 | import nets as bilm_nets
16 |
17 | from text_classification import nets as class_nets
18 | from text_classification.nlp_utils import convert_seq
19 | from text_classification import text_datasets
20 |
21 | from evaluator import MicroEvaluator
22 |
23 | import args_of_text_classifier
24 | from utils import UnkDropout, Outer
25 |
26 | #cupy.cuda.set_allocator(None)
27 | #cupy.cuda.set_pinned_memory_allocator(None)
28 |
29 | # default local parameters -- no need to change
30 | parser = args_of_text_classifier.get_basic_arg_parser()
31 | args = parser.parse_args()
32 |
33 | # load global parameters
34 | with open("global.config", 'rb') as f:
35 | configs_dict = json.load(f)
36 |
37 | args.dataset = configs_dict.get("dataset")
38 | args.model = configs_dict.get("eval_model")
39 |
40 | def main():
41 |
42 | print(json.dumps(args.__dict__, indent=2))
43 | train(dir="aug_data", print_log=True)
44 |
45 |
46 | def train(dir="datasets", print_log=False):
47 | chainer.CHAINER_SEED = args.seed
48 | numpy.random.seed(args.seed)
49 |
50 | vocab = None
51 |
52 | # Load a dataset
53 | if args.dataset == 'dbpedia':
54 | train, test, vocab = text_datasets.get_dbpedia(
55 | vocab=vocab)
56 | elif args.dataset.startswith('imdb.'):
57 | train, test, vocab = text_datasets.get_imdb(
58 | fine_grained=args.dataset.endswith('.fine'),
59 | vocab=vocab)
60 | elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
61 | 'custrev', 'mpqa', 'rt-polarity', 'subj']:
62 | train, test, real_test, vocab = text_datasets.read_text_dataset(
63 | args.dataset, vocab=None, dir=dir)
64 | #train, test, vocab = text_datasets.get_other_text_dataset(
65 | # args.dataset, vocab=vocab)
66 | #if args.validation:
67 | # real_test = test
68 | # dataset_pairs = chainer.datasets.get_cross_validation_datasets_random(
69 | # train, 10, seed=777)
70 | # train, test = dataset_pairs[0]
71 |
72 | print('# train data: {}'.format(len(train)))
73 | print('# test data: {}'.format(len(test)))
74 | print('# vocab: {}'.format(len(vocab)))
75 | n_class = len(set([int(d[1]) for d in train]))
76 | print('# class: {}'.format(n_class))
77 |
78 | chainer.CHAINER_SEED = args.seed
79 | numpy.random.seed(args.seed)
80 |     train = UnkDropout(train, vocab['<unk>'], 0.01)
81 | train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
82 | test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
83 | repeat=False, shuffle=False)
84 |
85 | # Setup a model
86 | chainer.CHAINER_SEED = args.seed
87 | numpy.random.seed(args.seed)
88 | if args.model == 'rnn':
89 | Encoder = class_nets.RNNEncoder
90 | elif args.model == 'cnn':
91 | Encoder = class_nets.CNNEncoder
92 | elif args.model == 'bow':
93 | Encoder = class_nets.BOWMLPEncoder
94 | encoder = Encoder(n_layers=args.layer, n_vocab=len(vocab),
95 | n_units=args.unit, dropout=args.dropout)
96 | model = class_nets.TextClassifier(encoder, n_class)
97 |
98 | if args.bilm:
99 | bilm = bilm_nets.BiLanguageModel(
100 | len(vocab), args.bilm_unit, args.bilm_layer, args.bilm_dropout)
101 | n_labels = len(set([int(v[1]) for v in test]))
102 | print('# labels =', n_labels)
103 | if not args.no_label:
104 | print('add label')
105 | bilm.add_label_condition_nets(n_labels, args.bilm_unit)
106 | else:
107 | print('not using label')
108 | chainer.serializers.load_npz(args.bilm, bilm)
109 | with model.encoder.init_scope():
110 | initialW = numpy.array(model.encoder.embed.W.data)
111 | del model.encoder.embed
112 | model.encoder.embed = bilm_nets.PredictiveEmbed(
113 | len(vocab), args.unit, bilm, args.dropout,
114 | initialW=initialW)
115 | model.encoder.use_predict_embed = True
116 |
117 | model.encoder.embed.setup(
118 | mode=args.bilm_mode,
119 | temp=args.bilm_temp,
120 | word_lower_bound=0.,
121 | gold_lower_bound=0.,
122 | gumbel=args.bilm_gumbel,
123 | residual=args.bilm_residual,
124 | wordwise=args.bilm_wordwise,
125 | add_original=args.bilm_add_original,
126 | augment_ratio=args.bilm_ratio,
127 |             ignore_unk=vocab['<unk>'])
128 |
129 | if args.gpu >= 0:
130 | # Make a specified GPU current
131 | chainer.cuda.get_device_from_id(args.gpu).use()
132 | model.to_gpu() # Copy the model to the GPU
133 | model.xp.random.seed(args.seed)
134 | chainer.CHAINER_SEED = args.seed
135 | numpy.random.seed(args.seed)
136 |
137 | # Setup an optimizer
138 | optimizer = chainer.optimizers.Adam(args.learning_rate)
139 | optimizer.setup(model)
140 |
141 | # Set up a trainer
142 | updater = training.StandardUpdater(
143 | train_iter, optimizer,
144 | converter=convert_seq, device=args.gpu)
145 |
146 | from triggers import FailMaxValueTrigger
147 | stop_trigger = FailMaxValueTrigger(
148 | key='validation/main/accuracy', trigger=(1, 'epoch'),
149 | n_times=args.stop_epoch, max_trigger=args.epoch)
150 | trainer = training.Trainer(
151 | updater, stop_trigger, out=args.out)
152 |
153 | # Evaluate the model with the test dataset for each epoch
154 | # VALIDATION SET
155 | trainer.extend(MicroEvaluator(
156 | test_iter, model,
157 | converter=convert_seq, device=args.gpu))
158 |
159 | if args.validation:
160 | real_test_iter = chainer.iterators.SerialIterator(
161 | real_test, args.batchsize,
162 | repeat=False, shuffle=False)
163 | eval_on_real_test = MicroEvaluator(
164 | real_test_iter, model,
165 | converter=convert_seq, device=args.gpu)
166 | eval_on_real_test.default_name = 'test'
167 | trainer.extend(eval_on_real_test)
168 |
169 | # Take a best snapshot
170 | record_trigger = training.triggers.MaxValueTrigger(
171 | 'validation/main/accuracy', (1, 'epoch'))
172 | if args.save_model:
173 | trainer.extend(extensions.snapshot_object(
174 | model, 'best_model.npz'),
175 | trigger=record_trigger)
176 |
177 | # Write a log of evaluation statistics for each epoch
178 | out = Outer()
179 | trainer.extend(extensions.LogReport())
180 | if print_log:
181 | trainer.extend(extensions.PrintReport(
182 | ['epoch', 'main/loss', 'validation/main/loss',
183 | 'main/accuracy', 'validation/main/accuracy',
184 | 'test/main/loss', 'test/main/accuracy',
185 | # 'elapsed_time']))
186 | 'elapsed_time']), trigger=record_trigger)
187 | else:
188 | trainer.extend(extensions.PrintReport(
189 | ['main/accuracy', 'validation/main/accuracy',
190 | 'test/main/accuracy'], out=out), trigger=record_trigger)
191 |
192 | # Print a progress bar to stdout
193 | #trainer.extend(extensions.ProgressBar())
194 |
195 | # Run the training
196 | trainer.run()
197 |
198 |     # free all unused memory blocks "cached" in the memory pool
199 | mempool = cupy.get_default_memory_pool()
200 | mempool.free_all_blocks()
201 | #print("val_acc:{}, test_acc:{}\n", out[-2], out[-1])
202 | return float(out[-1])
203 |
204 | if __name__ == '__main__':
205 | main()
206 |
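
The script reads only the `dataset` and `eval_model` keys from `global.config` (a JSON file; per the README it also carries the BERT-side configuration, which is omitted here). A minimal example consistent with this script, with illustrative values:

```json
{
  "dataset": "stsa.binary",
  "eval_model": "rnn"
}
```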
--------------------------------------------------------------------------------
/triggers.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | import chainer
4 | from chainer import cuda
5 | from chainer import function
6 | from chainer import reporter
7 | from chainer.training import util
8 | from chainer import utils
9 | from chainer.utils import type_check
10 | from chainer.training import triggers
11 |
12 |
13 | class FailBestValueTrigger(triggers.IntervalTrigger):
14 |
15 | """Trigger invoked when specific value fails to become best.
16 |
17 | Args:
18 | key (str): Key of value.
19 | compare (function): Compare function which takes current best value and
20 | new value and returns whether new value is better than current
21 | best.
22 | trigger: Trigger that decides the comparison interval between current
23 | best value and new value. This must be a tuple in the form of
24 |             ``<int>, 'epoch'`` or ``<int>, 'iteration'`` which is passed to
25 | :class:`~chainer.training.triggers.IntervalTrigger`.
26 |
27 | """
28 |
29 | def __init__(self, key, compare, trigger=(1, 'epoch'),
30 |                  n_times=5, max_trigger=None, print_trigger=False):
31 | self.period = max_trigger
32 | self.unit = 'epoch'
33 |
34 | self._key = key
35 | self._best_value = None
36 | self._interval_trigger = util.get_trigger(trigger)
37 | self._init_summary()
38 | self._compare = compare
39 |         self._print_trigger = print_trigger
40 |
41 | self._n_times = n_times
42 | self._n_fails = 0
43 |
44 | self._max_trigger = max_trigger
45 | self._n_triggers = 0
46 |
47 | def __call__(self, trainer):
48 | """Decides whether the extension should be called on this iteration.
49 |
50 | Args:
51 | trainer (~chainer.training.Trainer): Trainer object that this
52 | trigger is associated with. The ``observation`` of this trainer
53 | is used to determine if the trigger should fire.
54 |
55 | Returns:
56 | bool: ``True`` if the corresponding extension should be invoked in
57 | this iteration.
58 |
59 | """
60 |
61 | observation = trainer.observation
62 | summary = self._summary
63 | key = self._key
64 | if key in observation:
65 | summary.add({key: observation[key]})
66 |
67 | if not self._interval_trigger(trainer):
68 | return False
69 |
70 | stats = summary.compute_mean()
71 | value = float(stats[key]) # copy to CPU
72 | self._init_summary()
73 |
74 | self._n_triggers += 1
75 |         if self._n_triggers == 1:  # ignore the very first interval
76 | return False
77 | if self._max_trigger is not None \
78 | and self._n_triggers >= self._max_trigger:
79 | return True
80 |
81 | if self._best_value is None or self._compare(self._best_value, value):
82 | self._best_value = value
83 | self._n_fails = 0
84 | return False
85 | self._n_fails += 1
86 | if self._n_fails >= self._n_times:
87 | return True
88 | else:
89 | return False
90 |
91 | def _init_summary(self):
92 | self._summary = reporter.DictSummary()
93 |
94 |
95 | class FailMaxValueTrigger(FailBestValueTrigger):
96 | def __init__(self, key, trigger=(1, 'epoch'), n_times=5, max_trigger=None):
97 | super(FailMaxValueTrigger, self).__init__(
98 | key, lambda max_value, new_value: new_value > max_value,
99 | trigger, n_times, max_trigger)
100 |
101 |
102 | class FailMinValueTrigger(FailBestValueTrigger):
103 | def __init__(self, key, trigger=(1, 'epoch'), n_times=5, max_trigger=None):
104 | super(FailMinValueTrigger, self).__init__(
105 | key, lambda min_value, new_value: new_value < min_value,
106 | trigger, n_times, max_trigger)
107 |
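
In short, `FailMaxValueTrigger` fires, and thereby stops training, once the watched value has failed to beat its running best for `n_times` consecutive checks, or once `max_trigger` intervals have elapsed. A usage sketch matching `train_text_classifier.py` (the `updater` is assumed to exist; the numbers are illustrative):

```python
from chainer import training
from triggers import FailMaxValueTrigger

# Stop when validation accuracy has not improved for 5 consecutive epochs,
# or after 30 epochs at most.
stop_trigger = FailMaxValueTrigger(
    key='validation/main/accuracy', trigger=(1, 'epoch'),
    n_times=5, max_trigger=30)
trainer = training.Trainer(updater, stop_trigger, out='result')
```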
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import division
3 | from __future__ import print_function
4 | import collections
5 | import io
6 | import json
7 | import os
8 |
9 | import numpy as np
10 | import progressbar
11 |
12 | import chainer
13 | from chainer import cuda
14 | from chainer.dataset import convert
15 |
16 | class Outer(list):
17 | def __init__(self):
18 | list.__init__(self)
19 | def write(self, item):
20 | if '\n' not in item:
21 | self.append(item.strip())
22 |
23 | class UnkDropout(chainer.dataset.DatasetMixin):
24 | def __init__(self, dataset, unk, ratio=0.01):
25 | self.dataset = dataset
26 | self.unk = unk
27 | self.ratio = ratio
28 |
29 | def __len__(self):
30 | return len(self.dataset)
31 |
32 | def get_example(self, i):
33 | x, y = self.dataset[i]
34 | if chainer.config.train:
35 | rand = np.random.rand(x.size - 2) < self.ratio
36 | # keep bos and eos
37 | _x = x.copy()
38 | _x[1:-1] = np.where(rand, self.unk, _x[1:-1])
39 | return (_x, y)
40 | return (x, y)
41 |
42 |
43 | def convert_xt_batch_seq(xt_batch_seq, gpu):
44 | batchsize = len(xt_batch_seq[0])
45 | seq_len = len(xt_batch_seq)
46 | xt_batch_seq = np.array(xt_batch_seq, 'i')
47 | # (bproplen, batch, 2)
48 | xt_batch_seq = convert.to_device(gpu, xt_batch_seq)
49 | xp = cuda.get_array_module(xt_batch_seq)
50 | x_seq_batch = xp.split(
51 | xt_batch_seq[:, :, 0].T.reshape(batchsize * seq_len),
52 | batchsize, axis=0)
53 | t_seq_batch = xp.split(
54 | xt_batch_seq[:, :, 1].T.reshape(batchsize * seq_len),
55 | batchsize, axis=0)
56 | return x_seq_batch, t_seq_batch
57 |
58 |
59 | def count_words_from_file(counts, file_path):
60 | bar = progressbar.ProgressBar()
61 | for l in bar(io.open(file_path, encoding='utf-8')):
62 | # TODO: parallel
63 | if l.strip():
64 | words = l.strip().split()
65 | for word in words:
66 | counts[word] += 1
67 | return counts
68 |
69 |
70 | def count_words(dataset, alpha=0.4):
71 | counts = collections.defaultdict(int)
72 | for w in dataset:
73 | counts[w] += 1
74 | counts = [counts[i] for i in range(len(counts))]
75 | counts = np.array(counts, 'f')
76 | counts /= counts.sum()
77 | counts = counts ** alpha
78 | counts = counts.tolist()
79 | return counts
80 |
81 |
82 | def make_chain_dataset(file_path, vocab={}, update_vocab=False,
83 | chain_length=2):
84 | dataset = []
85 | chain = []
86 |     unk_id = vocab['<unk>']
87 |
88 | def make_array(chain):
89 | array_chain = []
90 | for words in chain:
91 | tokens = []
92 | for word in words:
93 | if update_vocab:
94 | if word not in vocab:
95 | vocab[word] = len(vocab)
96 | tokens.append(vocab.get(word, unk_id))
97 | array_chain.append(np.array(tokens, 'i'))
98 | return array_chain
99 |
100 | for line in io.open(file_path, encoding='utf-8'):
101 | if not line.strip():
102 | if len(chain) >= chain_length:
103 | dataset.append(make_array(chain))
104 | chain = []
105 | continue
106 |         words = line.strip().split() + ['<eos>']
107 | chain.append(words)
108 | if len(chain) >= chain_length:
109 | dataset.append(make_array(chain))
110 | return dataset, vocab
111 |
112 |
113 | def tokenize_text(file_path, vocab={}, update_vocab=False):
114 | tokens = []
115 |     unk_id = vocab['<unk>']
116 | with io.open(file_path, encoding='utf-8') as f:
117 | for line in f:
118 |             words = line.split() + ['<eos>']
119 | for word in words:
120 | if update_vocab:
121 | if word not in vocab:
122 | vocab[word] = len(vocab)
123 | tokens.append(vocab.get(word, unk_id))
124 | return tokens, vocab
125 |
126 |
127 | def get_wikitext_words_and_vocab(
128 | name='wikitext-2', base_dir='datasets', vocab=None):
129 | assert(name in ['wikitext-2', 'wikitext-103'])
130 | base_dir2 = os.path.join(base_dir, name)
131 | predata_path = os.path.join(base_dir2, 'preprocessed_data.json')
132 | if os.path.exists(predata_path) and vocab is None:
133 | train, valid, test, vocab = json.load(open(predata_path))
134 | else:
135 | prepared_vocab = (vocab is not None)
136 | if not prepared_vocab:
137 |             vocab = {'<eos>': 0, '<unk>': 1}
138 | train, vocab = tokenize_text(
139 | os.path.join(base_dir2, 'wiki.train.tokens'),
140 | vocab, update_vocab=not prepared_vocab)
141 | valid, _ = tokenize_text(
142 | os.path.join(base_dir2, 'wiki.valid.tokens'),
143 | vocab, update_vocab=False)
144 | test, _ = tokenize_text(
145 | os.path.join(base_dir2, 'wiki.test.tokens'),
146 | vocab, update_vocab=False)
147 | json.dump([train, valid, test, vocab], open(predata_path, 'w'))
148 | return train, valid, test, vocab
149 |
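
For example, `train_text_classifier.py` wraps its training set in the `UnkDropout` class above so that, in training mode only, about `ratio` of the inner tokens of each example (bos/eos excluded) are replaced by the `<unk>` id on the fly. A sketch with toy data:

```python
import numpy as np
from utils import UnkDropout

vocab = {'<eos>': 0, '<unk>': 1, 'good': 2, 'movie': 3}
train = [(np.array([0, 2, 3, 0], 'i'), np.array([1], 'i'))]  # (word ids, label)

train = UnkDropout(train, vocab['<unk>'], ratio=0.01)
x, y = train[0]  # inner ids may have been replaced by 1 (<unk>) under train mode
```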
--------------------------------------------------------------------------------