├── .idea
│   ├── cbert_aug.iml
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── webServers.xml
│   └── workspace.xml
├── README.md
├── __pycache__
│   ├── args_of_text_classifier.cpython-36.pyc
│   ├── evaluator.cpython-36.pyc
│   ├── nets.cpython-36.pyc
│   ├── train_text_classifier.cpython-36.pyc
│   ├── triggers.cpython-36.pyc
│   └── utils.cpython-36.pyc
├── args_of_text_classifier.py
├── aug_data
│   ├── TREC
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   └── train_origin.tsv
│   ├── mpqa
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   └── train_origin.tsv
│   ├── rt-polarity
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   └── train_origin.tsv
│   ├── stsa.binary
│   │   ├── dev.tsv
│   │   ├── run.log
│   │   ├── test.tsv
│   │   ├── train_epoch_0.tsv
│   │   ├── train_epoch_1.tsv
│   │   ├── train_epoch_2.tsv
│   │   ├── train_epoch_3.tsv
│   │   ├── train_epoch_4.tsv
│   │   ├── train_epoch_5.tsv
│   │   └── train_origin.tsv
│   ├── stsa.fine
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   └── train_origin.tsv
│   └── subj
│       ├── dev.tsv
│       ├── test.tsv
│       └── train_origin.tsv
├── aug_dataset.py
├── aug_dataset_wo_ft.py
├── datasets
│   ├── TREC
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   ├── train.tsv
│   │   └── train_dev.tsv
│   ├── mpqa
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   ├── train.tsv
│   │   └── train_dev.tsv
│   ├── rt-polarity
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   ├── train.tsv
│   │   └── train_dev.tsv
│   ├── stsa.binary
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   ├── train.tsv
│   │   └── train_dev.tsv
│   ├── stsa.fine
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   ├── train.tsv
│   │   └── train_dev.tsv
│   └── subj
│       ├── dev.tsv
│       ├── test.tsv
│       ├── train.tsv
│       └── train_dev.tsv
├── evaluator.py
├── finetune_dataset.py
├── global.config
├── nets.py
├── text_classification
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── nets.cpython-36.pyc
│   │   ├── nlp_utils.cpython-36.pyc
│   │   └── text_datasets.cpython-36.pyc
│   ├── nets.py
│   ├── nlp_utils.py
│   └── text_datasets.py
├── train_text_classifier.py
├── triggers.py
├── utils.py
└── vocabs
    ├── TREC.vocab.json
    ├── custrev.vocab.json
    ├── mpqa.vocab.json
    ├── rt-polarity.vocab.json
    ├── stsa.binary.vocab.json
    ├── stsa.fine.vocab.json
    └── subj.vocab.json

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# cbert_aug
This repo is deprecated; please refer to
https://github.com/IIEKES/cbert_aug/ for detailed trials.


This is an implementation of the paper "Conditional BERT Contextual Augmentation" (https://arxiv.org/pdf/1812.06705.pdf).
Our original implementation was two-stage; for convenience, we have rewritten the code.

The *global.config* file contains the global configuration for BERT and the classifier.
The datasets directory contains the input files for BERT, and the aug_data directory contains the augmented files for the classifier.

You can run the code in two ways. Our default way is the second.
- You can load the original BERT and run aug_dataset_wo_ft.py directly:
  - python aug_dataset_wo_ft.py
- Or you can finetune BERT on each dataset first and then run aug_dataset.py, which loads the fine-tuned BERT:
  1. python finetune_dataset.py
  2. python aug_dataset.py

The hyperparameters of the models and of the training were selected by a grid search using baseline models without data augmentation, on each task's validation set individually.

We show the process of augmenting stsa.binary and training an rnn classifier on it in the aug_data/stsa.binary directory.

The classifier code is from , thanks to the author.
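
The core trick, as the paper describes it, is to reuse BERT's segment (token-type) embeddings as label embeddings: every position's segment id is set to the example's class label, so the masked-token predictions become label-conditioned. The snippet below is a minimal sketch of that mechanism written for this README — it is not code from this repo, and the sentence, mask position, and label value are purely illustrative — using the same pytorch_pretrained_bert package that appears in aug_data/stsa.binary/run.log:

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

label = 1  # class label of the (hypothetical) sentence being augmented
tokens = ['[CLS]', 'the', 'movie', 'was', '[MASK]', '.', '[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
# Conditional BERT: the label, not the usual sentence-A/B id, fills segment_ids.
segment_ids = torch.full_like(input_ids, label)

with torch.no_grad():
    scores = model(input_ids, segment_ids)  # (1, seq_len, vocab_size)
# Most likely replacement for the [MASK] slot (index 4), given the label.
print(tokenizer.convert_ids_to_tokens([scores[0, 4].argmax().item()]))
```

With the stock pre-trained weights the two segment embeddings still mean "sentence A/B"; it is the fine-tuning step (finetune_dataset.py) that turns them into label embeddings. Note that bert-base-uncased has type_vocab_size = 2 (see the run log), so tasks with more than two classes need the token-type embedding matrix enlarged before fine-tuning.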

--------------------------------------------------------------------------------
/args_of_text_classifier.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function

import argparse
import json


def get_basic_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=300,
                        help='Number of units')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout', '-d', type=float, default=0.5,
                        help='Dropout rate')
    parser.add_argument('--learning-rate', '-lr', type=float, default=1e-4,
                        help='Learning rate')
    parser.add_argument('--dataset', '-data', default='imdb.binary',
                        choices=['dbpedia', 'imdb.binary', 'imdb.fine',
                                 'TREC', 'stsa.binary', 'stsa.fine',
                                 'custrev', 'mpqa', 'rt-polarity', 'subj'],
                        help='Name of dataset.')
    parser.add_argument('--model', '-model', default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type.')

    parser.add_argument('--bilm', '-bilm')
    parser.add_argument('--bilm-unit', '-bilm-u', type=int, default=1024)
    parser.add_argument('--bilm-layer', '-bilm-l', type=int, default=1)
    parser.add_argument('--bilm-dropout', '-bilm-d', type=float, default=0.)

    parser.add_argument('--bilm-ratio', '-bilm-r', type=float, default=0.25)
    parser.add_argument('--bilm-temp', '-bilm-t', type=float, default=1.)
    parser.add_argument('--bilm-mode', '-bilm-m', default='sampling',
                        choices=['weighted_sum', 'sampling'])
    parser.add_argument('--bilm-gumbel', action='store_true')
    parser.add_argument('--bilm-wordwise', action='store_true', default=True)
    parser.add_argument('--bilm-add-original', type=float, default=0.)
    parser.add_argument('--bilm-residual', type=float, default=0.,
                        help='if not 0, (original + context) * THIS')

    parser.add_argument('--resume-vocab')

    parser.add_argument('--validation', default=True)
    parser.add_argument('--seed', type=int, default=2018)
    parser.add_argument('--save-model', action='store_true')
    parser.add_argument('--stop-epoch', type=int, default=20)

    parser.add_argument('--no-label', action='store_true')

    return parser


if __name__ == '__main__':
    parser = get_basic_arg_parser()
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

--------------------------------------------------------------------------------
/aug_data/TREC/test.tsv:
--------------------------------------------------------------------------------
1 | sentence label 2 | How far is it from Denver to Aspen ? 5 3 | What county is Modesto , California in ? 4 4 | Who was Galileo ? 3 5 | What is an atom ? 0 6 | When did Hawaii become a state ? 5 7 | How tall is the Sears Building ? 5 8 | George Bush purchased a small interest in which baseball team ? 3 9 | What is Australia 's national flower ? 1 10 | Why does the moon turn orange ? 0 11 | What is autism ? 0 12 | What city had a world fair in 1900 ? 4 13 | What person 's head is on a dime ? 3 14 | What is the average weight of a Yellow Labrador ?
5 15 | Who was the first man to fly across the Pacific Ocean ? 3 16 | When did Idaho become a state ? 5 17 | What is the life expectancy for crickets ? 5 18 | What metal has the highest melting point ? 1 19 | Who developed the vaccination against polio ? 3 20 | What is epilepsy ? 0 21 | What year did the Titanic sink ? 5 22 | Who was the first American to walk in space ? 3 23 | What is a biosphere ? 0 24 | What river in the US is known as the Big Muddy ? 4 25 | What is bipolar disorder ? 0 26 | What is cholesterol ? 0 27 | Who developed the Macintosh computer ? 3 28 | What is caffeine ? 0 29 | What imaginary line is halfway between the North and South Poles ? 4 30 | Where is John Wayne airport ? 4 31 | What hemisphere is the Philippines in ? 4 32 | What is the average speed of the horses at the Kentucky Derby ? 5 33 | Where are the Rocky Mountains ? 4 34 | What are invertebrates ? 0 35 | What is the temperature at the center of the earth ? 5 36 | When did John F. Kennedy get elected as President ? 5 37 | How old was Elvis Presley when he died ? 5 38 | Where is the Orinoco River ? 4 39 | How far is the service line from the net in tennis ? 5 40 | How much fiber should you have per day ? 5 41 | How many Great Lakes are there ? 5 42 | Material called linen is made from what plant ? 1 43 | What is Teflon ? 0 44 | What is amitriptyline ? 0 45 | What is a shaman ? 0 46 | What is the proper name for a female walrus ? 1 47 | What is a group of turkeys called ? 1 48 | How long did Rip Van Winkle sleep ? 5 49 | What are triglycerides ? 0 50 | How many liters in a gallon ? 5 51 | What is the name of the chocolate company in San Francisco ? 3 52 | What are amphibians ? 0 53 | Who discovered x-rays ? 3 54 | Which comedian 's signature line is `` Can we talk '' ? 3 55 | What is fibromyalgia ? 0 56 | What is done with worn or outdated flags ? 0 57 | What does cc in engines mean ? 0 58 | When did Elvis Presley die ? 5 59 | What is the capital of Yugoslavia ? 4 60 | Where is Milan ? 4 61 | What is the speed hummingbirds fly ? 5 62 | What is the oldest city in the United States ? 4 63 | What was W.C. Fields ' real name ? 3 64 | What river flows between Fargo , North Dakota and Moorhead , Minnesota ? 4 65 | What do bats eat ? 1 66 | What state did the Battle of Bighorn take place in ? 4 67 | Who was Abraham Lincoln ? 3 68 | What do you call a newborn kangaroo ? 1 69 | What are spider veins ? 0 70 | What day and month did John Lennon die ? 5 71 | What strait separates North America from Asia ? 4 72 | What is the population of Seattle ? 5 73 | How much was a ticket for the Titanic ? 5 74 | What is the largest city in the world ? 4 75 | What American composer wrote the music for `` West Side Story '' ? 3 76 | Where is the Mall of the America ? 4 77 | What is the pH scale ? 0 78 | What type of currency is used in Australia ? 1 79 | How tall is the Gateway Arch in St. Louis , MO ? 5 80 | How much does the human adult female brain weigh ? 5 81 | Who was the first governor of Alaska ? 3 82 | What is a prism ? 0 83 | When was the first liver transplant ? 5 84 | Who was elected president of South Africa in 1994 ? 3 85 | What is the population of China ? 5 86 | When was Rosa Parks born ? 5 87 | Why is a ladybug helpful ? 0 88 | What is amoxicillin ? 0 89 | Who was the first female United States Representative ? 3 90 | What are xerophytes ? 0 91 | What country did Ponce de Leon come from ? 4 92 | The U.S. Department of Treasury first issued paper currency for the U.S. during which war ? 
1 93 | What is desktop publishing ? 0 94 | What is the temperature of the sun 's surface ? 5 95 | What year did Canada join the United Nations ? 5 96 | What is the oldest university in the US ? 3 97 | Where is Prince Edward Island ? 4 98 | Mercury , what year was it discovered ? 5 99 | What is cryogenics ? 0 100 | What are coral reefs ? 0 101 | What is the longest major league baseball-winning streak ? 1 102 | What is neurology ? 0 103 | Who invented the calculator ? 3 104 | How do you measure earthquakes ? 0 105 | Who is Duke Ellington ? 3 106 | What county is Phoenix , AZ in ? 4 107 | What is a micron ? 0 108 | The sun 's core , what is the temperature ? 5 109 | What is the Ohio state bird ? 1 110 | When were William Shakespeare 's twins born ? 5 111 | What is the highest dam in the U.S. ? 4 112 | What color is a poison arrow frog ? 1 113 | What is acupuncture ? 0 114 | What is the length of the coastline of the state of Alaska ? 5 115 | What is the name of Neil Armstrong 's wife ? 3 116 | What is Hawaii 's state flower ? 1 117 | Who won Ms. American in 1989 ? 3 118 | When did the Hindenberg crash ? 5 119 | What mineral helps prevent osteoporosis ? 1 120 | What was the last year that the Chicago Cubs won the World Series ? 5 121 | Where is Perth ? 4 122 | What year did WWII begin ? 5 123 | What is the diameter of a golf ball ? 5 124 | What is an eclipse ? 0 125 | Who discovered America ? 3 126 | What is the earth 's diameter ? 5 127 | Which president was unmarried ? 3 128 | How wide is the Milky Way galaxy ? 5 129 | During which season do most thunderstorms occur ? 5 130 | What is Wimbledon ? 0 131 | What is the gestation period for a cat ? 5 132 | How far is a nautical mile ? 5 133 | Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ? 3 134 | What does target heart rate mean ? 0 135 | What was the first satellite to go into space ? 1 136 | What is foreclosure ? 0 137 | What is the major fault line near Kentucky ? 1 138 | Where is the Holland Tunnel ? 4 139 | Who wrote the hymn `` Amazing Grace '' ? 3 140 | What position did Willie Davis play in baseball ? 3 141 | What are platelets ? 0 142 | What is severance pay ? 0 143 | What is the name of Roy Roger 's dog ? 1 144 | Where are the National Archives ? 4 145 | What is a baby turkey called ? 1 146 | What is poliomyelitis ? 0 147 | What is the longest bone in the human body ? 1 148 | Who is a German philosopher ? 3 149 | What were Christopher Columbus ' three ships ? 1 150 | What does Phi Beta Kappa mean ? 0 151 | What is nicotine ? 0 152 | What is another name for vitamin B1 ? 1 153 | Who discovered radium ? 3 154 | What are sunspots ? 0 155 | When was Algeria colonized ? 5 156 | What baseball team was the first to make numbers part of their uniform ? 3 157 | What continent is Egypt on ? 4 158 | What is the capital of Mongolia ? 4 159 | What is nanotechnology ? 0 160 | In the late 1700 's British convicts were used to populate which colony ? 4 161 | What state is the geographic center of the lower 48 states ? 4 162 | What is an obtuse angle ? 0 163 | What are polymers ? 0 164 | When is hurricane season in the Caribbean ? 5 165 | Where is the volcano Mauna Loa ? 4 166 | What is another astronomic term for the Northern Lights ? 1 167 | What peninsula is Spain part of ? 4 168 | When was Lyndon B. Johnson born ? 5 169 | What is acetaminophen ? 0 170 | What state has the least amount of rain per year ? 4 171 | Who founded American Red Cross ? 3 172 | What year did the Milwaukee Braves become the Atlanta Braves ? 
5 173 | How fast is alcohol absorbed ? 5 174 | When is the summer solstice ? 5 175 | What is supernova ? 0 176 | Where is the Shawnee National Forest ? 4 177 | What U.S. state 's motto is `` Live free or Die '' ? 4 178 | Where is the Lourve ? 4 179 | When was the first stamp issued ? 5 180 | What primary colors do you mix to make orange ? 1 181 | How far is Pluto from the sun ? 5 182 | What body of water are the Canary Islands in ? 4 183 | What is neuropathy ? 0 184 | Where is the Euphrates River ? 4 185 | What is cryptography ? 0 186 | What is natural gas composed of ? 1 187 | Who is the Prime Minister of Canada ? 3 188 | What French ruler was defeated at the battle of Waterloo ? 3 189 | What is leukemia ? 0 190 | Where did Howard Hughes die ? 4 191 | What is the birthstone for June ? 1 192 | What is the sales tax in Minnesota ? 1 193 | What is the distance in miles from the earth to the sun ? 5 194 | What is the average life span for a chicken ? 5 195 | When was the first Wal-Mart store opened ? 5 196 | What is relative humidity ? 0 197 | What city has the zip code of 35824 ? 4 198 | What currency is used in Algeria ? 1 199 | Who invented the hula hoop ? 3 200 | What was the most popular toy in 1957 ? 1 201 | What is pastrami made of ? 1 202 | What is the name of the satellite that the Soviet Union sent into space in 1957 ? 1 203 | What city 's newspaper is called `` The Enquirer '' ? 4 204 | Who invented the slinky ? 3 205 | What are the animals that don 't have backbones called ? 1 206 | What is the melting point of copper ? 5 207 | Where is the volcano Olympus Mons located ? 4 208 | Who was the 23rd president of the United States ? 3 209 | What is the average body temperature ? 5 210 | What does a defibrillator do ? 0 211 | What is the effect of acid rain ? 0 212 | What year did the United States abolish the draft ? 5 213 | How fast is the speed of light ? 5 214 | What province is Montreal in ? 4 215 | What New York City structure is also known as the Twin Towers ? 4 216 | What is fungus ? 0 217 | What is the most frequently spoken language in the Netherlands ? 1 218 | What is sodium chloride ? 0 219 | What are the spots on dominoes called ? 1 220 | How many pounds in a ton ? 5 221 | What is influenza ? 0 222 | What is ozone depletion ? 0 223 | What year was the Mona Lisa painted ? 5 224 | What does `` Sitting Shiva '' mean ? 0 225 | What is the electrical output in Madrid , Spain ? 1 226 | Which mountain range in North America stretches from Maine to Georgia ? 4 227 | What is plastic made of ? 1 228 | What is the population of Nigeria ? 5 229 | What does your spleen do ? 0 230 | Where is the Grand Canyon ? 4 231 | Who invented the telephone ? 3 232 | What year did the U.S. buy Alaska ? 5 233 | What is the name of the leader of Ireland ? 3 234 | What is phenylalanine ? 0 235 | How many gallons of water are there in a cubic foot ? 5 236 | What are the two houses of the Legislative branch ? 1 237 | What is sonar ? 0 238 | In Poland , where do most people live ? 4 239 | What is phosphorus ? 0 240 | What is the location of the Sea of Tranquility ? 4 241 | How fast is sound ? 5 242 | What French province is cognac produced in ? 4 243 | What is Valentine 's Day ? 0 244 | What causes gray hair ? 0 245 | What is hypertension ? 0 246 | What is bandwidth ? 0 247 | What is the longest suspension bridge in the U.S. ? 4 248 | What is a parasite ? 0 249 | What is home equity ? 0 250 | What do meteorologists do ? 0 251 | What is the criterion for being legally blind ? 
1 252 | Who is the tallest man in the world ? 3 253 | What are the twin cities ? 4 254 | What did Edward Binney and Howard Smith invent in 1903 ? 1 255 | What is the statue of liberty made of ? 1 256 | What is pilates ? 0 257 | What planet is known as the `` red '' planet ? 4 258 | What is the depth of the Nile river ? 5 259 | What is the colorful Korean traditional dress called ? 1 260 | What is Mardi Gras ? 0 261 | Mexican pesos are worth what in U.S. dollars ? 5 262 | Who was the first African American to play for the Brooklyn Dodgers ? 3 263 | Who was the first Prime Minister of Canada ? 3 264 | How many Admirals are there in the U.S. Navy ? 5 265 | What instrument did Glenn Miller play ? 1 266 | How old was Joan of Arc when she died ? 5 267 | What does the word fortnight mean ? 0 268 | What is dianetics ? 0 269 | What is the capital of Ethiopia ? 4 270 | For how long is an elephant pregnant ? 5 271 | How did Janice Joplin die ? 0 272 | What is the primary language in Iceland ? 1 273 | What is the difference between AM radio stations and FM radio stations ? 0 274 | What is osteoporosis ? 0 275 | Who was the first woman governor in the U.S. ? 3 276 | What is peyote ? 0 277 | What is the esophagus used for ? 0 278 | What is viscosity ? 0 279 | What year did Oklahoma become a state ? 5 280 | What is the abbreviation for Texas ? 2 281 | What is a mirror made out of ? 1 282 | Where on the body is a mortarboard worn ? 4 283 | What was J.F.K. 's wife 's name ? 3 284 | What does I.V. stand for ? 2 285 | What is the chunnel ? 0 286 | Where is Hitler buried ? 4 287 | What are antacids ? 0 288 | What is pulmonary fibrosis ? 0 289 | What are Quaaludes ? 0 290 | What is naproxen ? 0 291 | What is strep throat ? 0 292 | What is the largest city in the U.S. ? 4 293 | What is foot and mouth disease ? 1 294 | What is the life expectancy of a dollar bill ? 5 295 | What do you call a professional map drawer ? 1 296 | What are Aborigines ? 0 297 | What is hybridization ? 0 298 | What color is indigo ? 1 299 | How old do you have to be in order to rent a car in Italy ? 5 300 | What does a barometer measure ? 1 301 | What color is a giraffe 's tongue ? 1 302 | What does USPS stand for ? 2 303 | What year did the NFL go on strike ? 5 304 | What is solar wind ? 0 305 | What date did Neil Armstrong land on the moon ? 5 306 | When was Hiroshima bombed ? 5 307 | Where is the Savannah River ? 4 308 | Who was the first woman killed in the Vietnam War ? 3 309 | What planet has the strongest magnetic field of all the planets ? 4 310 | Who is the governor of Alaska ? 3 311 | What year did Mussolini seize power in Italy ? 5 312 | What is the capital of Persia ? 4 313 | Where is the Eiffel Tower ? 4 314 | How many hearts does an octopus have ? 5 315 | What is pneumonia ? 0 316 | What is the deepest lake in the US ? 4 317 | What is a fuel cell ? 0 318 | Who was the first U.S. president to appear on TV ? 3 319 | Where is the Little League Museum ? 4 320 | What are the two types of twins ? 1 321 | What is the brightest star ? 4 322 | What is diabetes ? 0 323 | When was President Kennedy shot ? 5 324 | What is TMJ ? 2 325 | What color is yak milk ? 1 326 | What date was Dwight D. Eisenhower born ? 5 327 | What does the technical term ISDN mean ? 2 328 | Why is the sun yellow ? 0 329 | What is the conversion rate between dollars and pounds ? 5 330 | When was Abraham Lincoln born ? 5 331 | What is the Milky Way ? 0 332 | What is mold ? 0 333 | What year was Mozart born ? 5 334 | What is a group of frogs called ? 
1 335 | What is the name of William Penn 's ship ? 1 336 | What is the melting point of gold ? 5 337 | What is the street address of the White House ? 4 338 | What is semolina ? 0 339 | What fruit is Melba sauce made from ? 1 340 | What is Ursa Major ? 0 341 | What is the percentage of water content in the human body ? 5 342 | How much does water weigh ? 5 343 | What was President Lyndon Johnson 's reform program called ? 1 344 | What is the murder rate in Windsor , Ontario ? 5 345 | Who is the only president to serve 2 non-consecutive terms ? 3 346 | What is the population of Australia ? 5 347 | Who painted the ceiling of the Sistine Chapel ? 3 348 | Name a stimulant . 1 349 | What is the effect of volcanoes on the climate ? 0 350 | What year did the Andy Griffith show begin ? 5 351 | What is acid rain ? 0 352 | What is the date of Mexico 's independence ? 5 353 | What is the location of Lake Champlain ? 4 354 | What is the Illinois state flower ? 1 355 | What is Maryland 's state bird ? 1 356 | What is quicksilver ? 0 357 | Who wrote `` The Divine Comedy '' ? 3 358 | What is the speed of light ? 5 359 | What is the width of a football field ? 5 360 | Why in tennis are zero points called love ? 0 361 | What kind of dog was Toto in the Wizard of Oz ? 1 362 | What is a thyroid ? 0 363 | What does ciao mean ? 0 364 | What is the only artery that carries blue blood from the heart to the lungs ? 1 365 | How often does Old Faithful erupt at Yellowstone National Park ? 5 366 | What is acetic acid ? 0 367 | What is the elevation of St. Louis , MO ? 5 368 | What color does litmus paper turn when it comes into contact with a strong acid ? 1 369 | What are the colors of the German flag ? 1 370 | What is the Moulin Rouge ? 0 371 | What soviet seaport is on the Black Sea ? 4 372 | What is the atomic weight of silver ? 5 373 | What currency do they use in Brazil ? 1 374 | What are pathogens ? 0 375 | What is mad cow disease ? 0 376 | Name a food high in zinc . 1 377 | When did North Carolina enter the union ? 5 378 | Where do apple snails live ? 4 379 | What are ethics ? 0 380 | What does CPR stand for ? 2 381 | What is an annuity ? 0 382 | Who killed John F. Kennedy ? 3 383 | Who was the first vice president of the U.S. ? 3 384 | What birthstone is turquoise ? 1 385 | Who was the first US President to ride in an automobile to his inauguration ? 3 386 | How old was the youngest president of the United States ? 5 387 | When was Ulysses S. Grant born ? 5 388 | What is Muscular Dystrophy ? 0 389 | Who lived in the Neuschwanstein castle ? 3 390 | What is propylene glycol ? 0 391 | What is a panic disorder ? 0 392 | Who invented the instant Polaroid camera ? 3 393 | What is a carcinogen ? 0 394 | What is a baby lion called ? 1 395 | What is the world 's population ? 5 396 | What is nepotism ? 0 397 | What is die-casting ? 0 398 | What is myopia ? 0 399 | What is the sales tax rate in New York ? 5 400 | Developing nations comprise what percentage of the world 's population ? 5 401 | What is the fourth highest mountain in the world ? 4 402 | What is Shakespeare 's nickname ? 3 403 | What is the heaviest naturally occurring element ? 1 404 | When is Father 's Day ? 5 405 | What does the acronym NASA stand for ? 2 406 | How long is the Columbia River in miles ? 5 407 | What city 's newspaper is called `` The Star '' ? 4 408 | What is carbon dioxide ? 0 409 | Where is the Mason/Dixon line ? 4 410 | When was the Boston tea party ? 5 411 | What is metabolism ? 0 412 | Which U.S.A. 
president appeared on `` Laugh-In '' ? 3 413 | What are cigarettes made of ? 1 414 | What is the capital of Zimbabwe ? 4 415 | What does NASA stand for ? 2 416 | What is the state flower of Michigan ? 1 417 | What are semiconductors ? 0 418 | What is nuclear power ? 0 419 | What is a tsunami ? 0 420 | Who is the congressman from state of Texas on the armed forces committee ? 3 421 | Who was president in 1913 ? 3 422 | When was the first kidney transplant ? 5 423 | What are Canada 's two territories ? 4 424 | What was the name of the plane Lindbergh flew solo across the Atlantic ? 1 425 | What is genocide ? 0 426 | What continent is Argentina on ? 4 427 | What monastery was raided by Vikings in the late eighth century ? 1 428 | What is an earthquake ? 0 429 | Where is the tallest roller coaster located ? 4 430 | What are enzymes ? 0 431 | Who discovered oxygen ? 3 432 | What is bangers and mash ? 0 433 | What is the name given to the Tiger at Louisiana State University ? 1 434 | Where are the British crown jewels kept ? 4 435 | Who was the first person to reach the North Pole ? 3 436 | What is an ulcer ? 0 437 | What is vertigo ? 0 438 | What is the spirometer test ? 0 439 | When is the official first day of summer ? 5 440 | What does the abbreviation SOS mean ? 2 441 | What is the smallest bird in Britain ? 1 442 | Who invented Trivial Pursuit ? 3 443 | What gasses are in the troposphere ? 1 444 | Which country has the most water pollution ? 4 445 | What is the scientific name for elephant ? 1 446 | Who is the actress known for her role in the movie `` Gypsy '' ? 3 447 | What breed of hunting dog did the Beverly Hillbillies own ? 1 448 | What is the rainiest place on Earth ? 4 449 | Who was the first African American to win the Nobel Prize in literature ? 3 450 | When is St. Patrick 's Day ? 5 451 | What was FDR 's dog 's name ? 1 452 | What colors need to be mixed to get the color pink ? 1 453 | What is the most popular sport in Japan ? 1 454 | What is the active ingredient in baking soda ? 1 455 | When was Thomas Jefferson born ? 5 456 | How cold should a refrigerator be ? 5 457 | When was the telephone invented ? 5 458 | What is the most common eye color ? 1 459 | Where was the first golf course in the United States ? 4 460 | What is schizophrenia ? 0 461 | What is angiotensin ? 0 462 | What did Jesse Jackson organize ? 3 463 | What is New York 's state bird ? 1 464 | What is the National Park in Utah ? 4 465 | What is Susan B. Anthony 's birthday ? 5 466 | In which state would you find the Catskill Mountains ? 4 467 | What do you call a word that is spelled the same backwards and forwards ? 1 468 | What are pediatricians ? 0 469 | What chain store is headquartered in Bentonville , Arkansas ? 3 470 | What are solar cells ? 0 471 | What is compounded interest ? 0 472 | What are capers ? 0 473 | What is an antigen ? 0 474 | What currency does Luxembourg use ? 1 475 | What is the population of Venezuela ? 5 476 | What type of polymer is used for bulletproof vests ? 1 477 | What currency does Argentina use ? 1 478 | What is a thermometer ? 0 479 | What Canadian city has the largest population ? 4 480 | What color are crickets ? 1 481 | Which country gave New York the Statue of Liberty ? 4 482 | What was the name of the first U.S. satellite sent into space ? 1 483 | What precious stone is a form of pure carbon ? 1 484 | What kind of gas is in a fluorescent bulb ? 1 485 | What is rheumatoid arthritis ? 0 486 | What river runs through Rowe , Italy ? 4 487 | What is cerebral palsy ? 
0 488 | What city is also known as `` The Gateway to the West '' ? 4 489 | How far away is the moon ? 5 490 | What is the source of natural gas ? 1 491 | In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ? 1 492 | What is pectin ? 0 493 | What is bio-diversity ? 0 494 | What 's the easiest way to remove wallpaper ? 1 495 | What year did the Titanic start on its journey ? 5 496 | How much of an apple is water ? 5 497 | Who was the 22nd President of the US ? 3 498 | What is the money they use in Zambia ? 1 499 | How many feet in a mile ? 5 500 | What is the birthstone of October ? 1 501 | What is e-coli ? 0 502 | -------------------------------------------------------------------------------- /aug_data/mpqa/dev.tsv: -------------------------------------------------------------------------------- 1 | sentence label 2 | unfettered support 1 3 | such a mindset will impair 0 4 | has revealed george bush's talents as a war leader 1 5 | provide support to the us , as it did 1 6 | kill 0 7 | play with 0 8 | embark on trampling upon human rights of civilians 0 9 | in such a complex state 0 10 | pursue 0 11 | will continue to support 1 12 | places strains 0 13 | wish 1 14 | subjected 0 15 | benefits 0 16 | highly respected 1 17 | this agreement 1 18 | under the strong influence 0 19 | preach 1 20 | hardest hit 0 21 | would be easier 1 22 | will be irresponsible 0 23 | conspicuous policy of playing games 0 24 | to agree 1 25 | to ask 1 26 | was very strong about this 1 27 | were even more full of praise 1 28 | can not be taken seriously 0 29 | felt gratified and relieved 1 30 | invited 1 31 | languages without nuances 0 32 | scheme 0 33 | incapacity to put crime under control 0 34 | it is pure rhetoric 0 35 | charged 0 36 | attacks 0 37 | anger 0 38 | vowed 1 39 | ridiculous 0 40 | are ready 1 41 | were accused 0 42 | will of 1 43 | unheeded 0 44 | opposes and rejects 0 45 | outcry 0 46 | join forces 1 47 | ca n't be burying its head in the sand 0 48 | insidious 0 49 | would not accept 0 50 | too costly to meet 0 51 | should not 0 52 | does not reflect 0 53 | invited 1 54 | top priority of 1 55 | only benefits corporate america 0 56 | still 0 57 | support 1 58 | not intent on revitalizing its economies 0 59 | angry anti - us protests erupted 0 60 | hopes 1 61 | m feeling much better 1 62 | branded 0 63 | the danger 0 64 | proposal to reunify 1 65 | nothing seemed more normal than to settle in someone else's territory 0 66 | to blame 0 67 | have been growing less visionary 0 68 | will fight on 0 69 | shortcomings 0 70 | deadly 0 71 | would incur its displeasure 0 72 | like a dangerous virus 0 73 | it is a futile illusion because it is a lie 0 74 | are irritated 0 75 | in a way our government has not been 0 76 | neither good nor bad , just incorrigible 0 77 | destroyed 0 78 | respecting 1 79 | to promote 1 80 | witch - hunting 0 81 | biggest terrorist country 0 82 | legitimate dissent 1 83 | even against vehicles transporting pregnant women in labor and against unarmed citizens 0 84 | niceties 1 85 | his ship has been sailing towards a wrong direction for far too long 0 86 | critics 0 87 | once again turn its back 0 88 | growing unrest 0 89 | defects 0 90 | has no love lost 0 91 | violent protests 0 92 | confined to minute cells 0 93 | immediately 1 94 | corrupt politicians 0 95 | a crime against humanity 0 96 | the rights and privileges that they would automatically get under geneva convention 0 97 | perils of delusion 0 98 | would be better off 1 99 | have been concerned 
0 100 | uproar 0 101 | pained him 0 102 | terrorism 0 103 | not only prejudice 0 104 | a brutal occupation 0 105 | meddling 0 106 | falls at a bad time 0 107 | ugliest crimes 0 108 | picking on 0 109 | allegations 0 110 | are being well treated 1 111 | the serious crisis 0 112 | will be a monumental tragedy 0 113 | is proud 1 114 | thus 0 115 | oh , my god 0 116 | will not allow 0 117 | the worst crisis 0 118 | turn its back 0 119 | deepened 0 120 | will complain 0 121 | very hard line 0 122 | opposition 0 123 | would not dilute 1 124 | secret 0 125 | opposition 0 126 | illusory 0 127 | not adequately free and fair 0 128 | should follow 1 129 | impossible 0 130 | an aberration that goes against mankind 0 131 | have preference for 1 132 | all around us 0 133 | very , very large 1 134 | lie 0 135 | were ill - received 0 136 | difficult 0 137 | and then assert the laws of war do not apply 0 138 | from upholding religious and cultural values 1 139 | preferred 1 140 | full support 1 141 | criticizing 0 142 | immediate exploitation 0 143 | ignored 0 144 | has unashamedly seen fit 1 145 | support 1 146 | now a reality 1 147 | gaffe 0 148 | virtually nothing 0 149 | no place in israel can be considered safe 0 150 | to invite 1 151 | prevent such a development 1 152 | rigged 0 153 | logical solution 1 154 | bungled its dealings 0 155 | demerits 0 156 | like the suffering 0 157 | unacceptable 0 158 | supports 1 159 | 100 percent 1 160 | does not appear 0 161 | pretext 0 162 | what begins to happen when we hate our friends 0 163 | denies 0 164 | intimidation 0 165 | children skipping school 0 166 | criticism 0 167 | violence and intimidation 0 168 | blatantly obstructed 0 169 | reactions of repulsion 0 170 | are not happy 0 171 | criticized 0 172 | a growing impression that misuari could pose a security threat 0 173 | left behind 1 174 | they have not succeeded , and will never succeed 1 175 | it's shameful 0 176 | beyond the reach of any legal regime 0 177 | is pessimistic 0 178 | has no right 0 179 | are masterminded 0 180 | regret 0 181 | will weaken soon 0 182 | terrible tragedy 0 183 | there is no alternative to it but conflict , isolation , nationalism , and ultimately war 0 184 | freak show 0 185 | insisted 1 186 | axis of evil 0 187 | come under fire 0 188 | threats expressed by 0 189 | the support 1 190 | wrong 0 191 | has resisted 0 192 | the darkest hour is always before the dawn 1 193 | the entire palestinian people 0 194 | denounced 0 195 | very serious threat 0 196 | only 0 197 | making a spectacle of 0 198 | promote meetings 1 199 | to mould the electoral process in his favour 0 200 | not a single day would pass in peace and with no palestinians shedding blood 0 201 | crises 0 202 | hoping 1 203 | to intimidate women and children 0 204 | disputes 0 205 | extreme right 0 206 | most dangerous 0 207 | may even get better 0 208 | promise 1 209 | has pledged 1 210 | one body with two heads 0 211 | feel at ease 1 212 | turning a blind eye 0 213 | has floundered 0 214 | warns 0 215 | most cogent argument 1 216 | lump 0 217 | deals a blow 0 218 | bad treatment 0 219 | concerned 0 220 | have criticized 0 221 | you can hardly speak of a targeting error 0 222 | certain countries resort to 0 223 | called for 1 224 | violating human rights 0 225 | unusual 0 226 | would undermine 0 227 | a terrorist act 0 228 | threat 0 229 | objectives 1 230 | can be abated 1 231 | muscle - flexing 0 232 | pursued 1 233 | brilliant 1 234 | committed 1 235 | committing themselves 1 236 | support 1 237 | the 
importance of china 1 238 | only one single 0 239 | hoped 1 240 | closed ranks behind 1 241 | peace 1 242 | would not be a bad idea 1 243 | legitimate 1 244 | closer to that of cowboys than to a civilized mentality 0 245 | such fruitful results 1 246 | double crime 0 247 | perfectly at ease 1 248 | mistake 0 249 | repeatedly accused 0 250 | ceased to be a soldier of the fatherland 0 251 | however 0 252 | apocalyptic savagery 0 253 | a border of peace and good neighbourliness 1 254 | call for 1 255 | hurt 0 256 | declined to endorse 0 257 | countries such as iran , iraq and north korea represent an ``axis of evil 0 258 | more serious 0 259 | am confident 1 260 | picking on 0 261 | whether 0 262 | go out into the streets to defend 1 263 | committed one more mistake 0 264 | the agreement 1 265 | charging 0 266 | other aggressive acts against lebanon 0 267 | enjoying 1 268 | decided 0 269 | hard - line 0 270 | feels 0 271 | do n't want 0 272 | the threats launched 0 273 | rejected 0 274 | breaking fundamental concepts 0 275 | increasingly tyrannical 0 276 | undisguised declaration of war and a rhetoric threatening aggression 0 277 | want 1 278 | hyperbole , 0 279 | patronizing 0 280 | arbitrary arrests 0 281 | jeopardy 0 282 | could not be said to adequately comply 0 283 | with good economic management , israel should have little trouble 1 284 | terrorist allies 0 285 | swift criticism from 0 286 | instead 0 287 | steering the economy into disaster 0 288 | grew so unhappy 0 289 | will not resort 0 290 | has refused 0 291 | may not be feasible 0 292 | danger of being shelved altogether 0 293 | unmanageable 0 294 | favouring independence 1 295 | misrule 0 296 | misery 0 297 | steering the nation to prosperity and progress 1 298 | america's biding 0 299 | compounded the problem 0 300 | designed to benefit mugabe 0 301 | supposed to be 0 302 | appraised 1 303 | stand beside right and justice 1 304 | refuses 0 305 | surged 1 306 | denied 0 307 | fake imposter 0 308 | great evil on open display 0 309 | ideal , sunny clime 1 310 | seems to be determined to expand the scope of the anti - terror war 0 311 | a body blow 0 312 | as full citizens in the same sate 1 313 | lecturing 0 314 | suffered 0 315 | harmed 0 316 | criticized 0 317 | scores of 0 318 | ignore the consequences 0 319 | territorial ambition 0 320 | not 0 321 | no one has the right to wage war against the history of this nation 0 322 | wants 1 323 | hatred 0 324 | what occured in the united states on 11 september 0 325 | oneupmanship 0 326 | labeling 0 327 | showed little - disguised irritation 0 328 | concern 0 329 | even 0 330 | the possibility of a democratic , stable and prosperous 1 331 | strongly criticized and condemned 0 332 | will not admit 0 333 | persuade 1 334 | was more than confident 1 335 | had not heeded 0 336 | cost - effective 1 337 | mistake 0 338 | would adhere to a pluralistic vision 1 339 | had asked 1 340 | replete with 1 341 | shameful 0 342 | will only 0 343 | accidental slight 0 344 | nothing whatsoever 0 345 | could no longer tolerate 0 346 | leeway 1 347 | the doubts 0 348 | aggressions against 0 349 | raving 0 350 | are blamed by 0 351 | so many uncertainties 0 352 | destined to collapse 1 353 | confidence crisis 0 354 | are accusing 0 355 | warned 0 356 | such pessimism 0 357 | anxiety 0 358 | cause a rift 0 359 | no politically prudent 0 360 | held out an olive branch 1 361 | continues to demolish 0 362 | does not endorse 0 363 | failed 1 364 | comprehensive destructive policy 0 365 | sounds clumsy 0 366 | 
fails to meet the standard of being free and fair 0 367 | willfulness 1 368 | tough policy 0 369 | has not satisfied 0 370 | extremist inclinations 0 371 | unilateral 0 372 | the iranians have not done what the pakistan government has done 0 373 | because 0 374 | boycotted 0 375 | without just compensation 0 376 | desperation of the people 0 377 | damage 0 378 | dominating the world 0 379 | sought 1 380 | fearing 0 381 | that concessions are sufficiently concrete 1 382 | put his nation first 1 383 | expresses the concern 0 384 | can contaminate 0 385 | to put it mildly 0 386 | will do its utmost 1 387 | endorsed 1 388 | will surely be on the president's lips 0 389 | with typical understatement 0 390 | lambasted 0 391 | affirmed 1 392 | vitriol 0 393 | in compliance 0 394 | numerous serious abuses 0 395 | asked 1 396 | confidence 1 397 | played the same tactic again 0 398 | are enthusiastic 1 399 | valued ally and friend 1 400 | so - called 0 401 | are regarding 0 402 | running out of meaningful options 0 403 | can not accept 0 404 | is concerned 0 405 | further criticism 0 406 | of all the unconscionable things 0 407 | longest destructive war 0 408 | plight of 0 409 | can no longer stand shoulder to shoulder 0 410 | it would be madness to hold elections now 0 411 | should be looking up smiling 1 412 | if one goes by the us logic , only the us spy plane is permitted to spy at other people's doorsteps 0 413 | begins to bear fruit 1 414 | rigged 0 415 | serious division 0 416 | enjoys no social base 0 417 | very constructive 1 418 | grateful 1 419 | turned thugs 0 420 | was unconstitutional 0 421 | scares away 0 422 | declining to endorse 0 423 | violate human rights 0 424 | accused 0 425 | negatively 0 426 | due regard 1 427 | simplistic 0 428 | will help renew 1 429 | succumbing 0 430 | peace to prevail 1 431 | despite all of this , however 0 432 | proves this beyond the shadow of a doubt 1 433 | is accused 0 434 | intolerant 0 435 | the visit has achieved positive and successful results 1 436 | wantonly infringing 0 437 | assassin 0 438 | continue to obstruct 0 439 | interests 1 440 | was a strong desire among 1 441 | to denounce 0 442 | left no avenue unexplored 1 443 | endorsed 1 444 | would also ensure a durable peace 1 445 | as a vehicle for increasing personal popularity 1 446 | would have been sending a very bad signal 0 447 | continues to refuse 0 448 | pursuit 0 449 | committing themselves to peace 1 450 | accuses 0 451 | as if they were quarries 0 452 | far worse than those prevailing in camp x - ray 0 453 | inhumanely 0 454 | appropriate 1 455 | poor 0 456 | good education system 1 457 | illegal 0 458 | devastating 0 459 | to create the impression 0 460 | exacerbating 0 461 | faltered as a result of israel's intransigence 0 462 | his career was like a rough sea , with highs and lows and never calm 0 463 | generally approved of 1 464 | deviant 0 465 | without trying to maneuver , place obstacles , or set impossible conditions 0 466 | the fire is raging at home 0 467 | lacks credibility and can not withstand any objective scrutiny 0 468 | endorsed 1 469 | assassinate innocent activists and citizens 0 470 | than they deserve 0 471 | refusal to respect its obligations 0 472 | would not 0 473 | for the first time 1 474 | beautiful historic coincidence 1 475 | hardly elastic enough 0 476 | approve 1 477 | hope 1 478 | was so hard on 0 479 | most serious consequences 0 480 | will have trouble wriggling out of the need to explain and justify 0 481 | not completely reliable 0 482 | 
democracy exhausted all its generosity 0 483 | relatively calm 1 484 | are extremely concerned 0 485 | bringing an end to terrorism and the taliban 1 486 | reject 0 487 | stand firm 1 488 | new political bogey 0 489 | abstractly recommend 1 490 | was worried 0 491 | hardline 0 492 | crash course 0 493 | recognition 1 494 | repeated denunciations 0 495 | hope 1 496 | carnage 0 497 | threats 0 498 | swapping the silver - visored helmet of a space cadet for the green eyeshade of a consummate bean counter 0 499 | to voice his concern 0 500 | significant 0 501 | can bring security and stability to all the parties without exception 1 502 | would consider 1 503 | be reproached 0 504 | or so it claims 0 505 | harboring serious doubts 0 506 | if it becomes more of a nuisance 0 507 | need to establish a just peace 1 508 | immediate support 1 509 | discrediting 0 510 | picking a quarrel 0 511 | bridle the israeli oppressive activity 1 512 | the euro , our currency 1 513 | edifying photograph 1 514 | wrong judgment 0 515 | preservation of global peace and security 1 516 | sharply questioned 0 517 | denied 0 518 | respect 1 519 | unfeasible 0 520 | has promised 1 521 | fierce demonstrations 0 522 | to thwart 1 523 | disrespect of advice 0 524 | war on 0 525 | as the saying goes , when you pull up the turnip , mud comes with it 0 526 | smiling 1 527 | judgment 0 528 | an axis of evil 0 529 | criticizes 0 530 | costly burden 0 531 | detrimental 0 532 | earned eternal notoriety 0 533 | making liberal use of the but construction 0 534 | plotting 0 535 | financial disaster 0 536 | would be regarded 0 537 | parasitic economies 0 538 | wanted 1 539 | more demagogy than arguments 0 540 | israel's superior ability to punish palestinians 0 541 | flirting with 0 542 | was perceived 0 543 | what is europe 0 544 | the best role model 1 545 | inaccurate and fabricated 0 546 | recognition 1 547 | axis of evil rhetoric 0 548 | lost their illegitimate interests 0 549 | not a man who likes to improvise 0 550 | is criticized 0 551 | can trust 1 552 | foresaw 0 553 | heresy 0 554 | him has not been pretty 0 555 | demonstrations and rallies against 0 556 | axis of evil 0 557 | cold war heritage 0 558 | possible regression 0 559 | will guarantee 0 560 | came out in protest 0 561 | progressive 1 562 | peaceful protests 0 563 | difficulty , of course 0 564 | put on a spectacle 0 565 | dogma 0 566 | devastating what remains 0 567 | is stiffening in its attitude toward 0 568 | stuck for 18 months on ground zero 0 569 | put most of the blame 0 570 | very large 0 571 | once again 0 572 | it is almost impossible 0 573 | has criticised 0 574 | axis of evil 0 575 | humiliation of 0 576 | calm 1 577 | are tired 0 578 | enjoy 1 579 | wanted 1 580 | has blamed 0 581 | unprecedented force 0 582 | provocative 0 583 | far preferable 1 584 | repeated warnings 0 585 | neither free nor fair 0 586 | backing 1 587 | widespread debates against 0 588 | using violence , intimidation , special laws and dirty tricks to fix the two - day election 0 589 | continue to obstruct 0 590 | seeking 1 591 | biased attitude 0 592 | long neglected 0 593 | success of western europe after world war ii laid in its creation of the good neighborly field 1 594 | even risking his own future 0 595 | accused 0 596 | denied 0 597 | desperate tableau 0 598 | could n't wait 1 599 | blessed 1 600 | nuclear club 0 601 | most important 1 602 | wants 0 603 | questioning 0 604 | no one is listening 0 605 | complicates any situation 0 606 | who were expelled from their homeland 
0 607 | distortion of reality 0 608 | had the advantage for washington of weakening opec 1 609 | the support 1 610 | may god be satisfied with him 1 611 | continue to rise 0 612 | will not be able lay claim to a worthy place in the civilized world 0 613 | has to be maintained at any cost 0 614 | thugs 0 615 | apprehensions 0 616 | want 1 617 | recommendations 1 618 | one of the few enlightened officials 1 619 | democratic achievements 1 620 | tensions between 0 621 | see 0 622 | infuriating 0 623 | first organized protest 0 624 | feared by 0 625 | ignored 0 626 | believe 0 627 | particular concern is raised 0 628 | main problems are in the economic area 0 629 | damages the credibility 0 630 | declining to comply 0 631 | blamed 0 632 | allegedly threatening 0 633 | dismisses 0 634 | the unconditional support 1 635 | not satisfactory 0 636 | negated 0 637 | berating 0 638 | would remake venezuela to benefit the poor 1 639 | `world judge of human rights' 0 640 | were only to be expected 0 641 | cooperation 1 642 | issuing a letter of objection 0 643 | would support wholeheartedly 1 644 | giving him medium approval 1 645 | at a loss 0 646 | dwindling moral values 0 647 | is self - inflicted 0 648 | if it is successful 0 649 | impose 0 650 | eradication 0 651 | feared 0 652 | scupper any hope 0 653 | backing away 0 654 | held 0 655 | will have no universally - acknowledged 0 656 | did not show the slightest sympathy , still less the least regret 0 657 | advanced a mendacious critique 0 658 | disputes between 0 659 | encouraged 1 660 | another setback for the imf 0 661 | wanted 1 662 | clear priority 1 663 | crimes of war 0 664 | destroyed 0 665 | warned 0 666 | decided to back 1 667 | violates the united nations charter 0 668 | interfere 0 669 | have n't already violated 0 670 | defenceless 0 671 | was effective especially 1 672 | is accusing 0 673 | a man blinded by power 0 674 | backing 1 675 | it's really strange 0 676 | his favourite newfie 0 677 | mercurial strongman 0 678 | that four shots would solve the problem 0 679 | would not accept 0 680 | there is no reason for it to be impossible 1 681 | poor response 0 682 | aspirations 1 683 | to the healthy development of sino - us relations 1 684 | resolute commitment 1 685 | democracy 1 686 | the concern 0 687 | only wants 1 688 | needlessly 0 689 | secretly behind every local misfortune 0 690 | positive 1 691 | is disgusted 0 692 | high degree of difficulty 0 693 | improvement 1 694 | spurned 0 695 | has denounced 0 696 | axis of evil theory 0 697 | extensive support 1 698 | worst 0 699 | its selfishness 0 700 | ambition 1 701 | find no grounds for any support 0 702 | achieving better results 1 703 | no tears will be shed in this corner 0 704 | is a telling example of a new - and perhaps risky - approach 0 705 | will become one of the best elements 1 706 | was also quite naive 0 707 | the criticism 0 708 | desire to work 1 709 | for the sake of peace 1 710 | double standard 0 711 | pretended 0 712 | alarm 0 713 | discontent 0 714 | furthermore 0 715 | massacring thousands of innocent people 0 716 | immense gulf between 0 717 | still wants 1 718 | basically sound 1 719 | repeatedly threatened 0 720 | mendacious 0 721 | beyond reproach 1 722 | but 0 723 | neat stuff 1 724 | purposely play up 0 725 | blatant campaign of intimidation 0 726 | disappointment 0 727 | provoked 0 728 | intends to 0 729 | worried 0 730 | real danger 0 731 | supported 1 732 | support 1 733 | were plotting 0 734 | agreed 1 735 | most realistic 1 736 | violating 0 737 | 
value sharing 1 738 | to its knees 0 739 | to request 1 740 | get out ! . 0 741 | the axis of evil 0 742 | assassins , assassins , assassins 0 743 | is the appropriate one 1 744 | resistance 1 745 | indulging in blood - shed and their lunaticism 0 746 | desperate 0 747 | to describe 0 748 | can ask 1 749 | know 0 750 | invited 1 751 | to express satisfaction 1 752 | harmonious and close 1 753 | support 1 754 | has been inclined toward 1 755 | incited 0 756 | was like giving away the country to its former colonial masters 0 757 | the destruction 0 758 | already volatile 0 759 | overwhelming evidence 0 760 | imperative for harmonious society 1 761 | nor does it seem proper 0 762 | storming palestinian territories 0 763 | is reflected 1 764 | expressed satisfaction 1 765 | intends 1 766 | ruined 0 767 | made public a blacklist 0 768 | those who seek to attack and destroy law and order and legitimate government 0 769 | unlawful 0 770 | freely 1 771 | reassurances 1 772 | agreeable 1 773 | dont want 0 774 | did not agree 0 775 | to deny 0 776 | perception 0 777 | had particularly harsh words for 0 778 | want 1 779 | the brutality with which this closure was implemented was unheard of 0 780 | friendship 1 781 | axis of evil 0 782 | supported 1 783 | slaughter of more than 0 784 | will be asked 1 785 | just , legitimate rights 1 786 | condemned 0 787 | thanked 1 788 | unstable situation 0 789 | merely an interlude 0 790 | depends on 0 791 | has been efficient enough 1 792 | grand 1 793 | any step backward for democracy 0 794 | quite supportive 1 795 | great leader 1 796 | would help 1 797 | formed a close friendship 1 798 | uphold 1 799 | is feared by 0 800 | used to boast 1 801 | regarded 1 802 | would violate international standards prohibiting cruel , inhuman or degrading treatment 0 803 | convince the americans that the bush administration has not only succeeded in removing a regime of religious madmen 0 804 | has put west asia on the brink of another war 0 805 | has also backed 1 806 | attacks 0 807 | banned 0 808 | dishonoring 0 809 | sent congratulations 1 810 | thought 0 811 | may be disconcerted 0 812 | concern 0 813 | will be inviting 1 814 | have to bash 0 815 | plan 1 816 | guaranteed 1 817 | the mistake is to assume 0 818 | insists 1 819 | sticking to the polluting policies 0 820 | can be difficult 0 821 | can prevent 1 822 | extremely dangerous 0 823 | would beg for surrender 0 824 | protesting 0 825 | proclaimed 1 826 | to applaud everything 1 827 | the most ideal way 1 828 | accused 0 829 | what fate awaits argentina's crisis - stricken economy 0 830 | especially those who are dangerous and suicidal 0 831 | has failed for a decade now 0 832 | had likely deteriorated beyond repair 0 833 | attempts to suppress 0 834 | endorse 1 835 | if there were serious problems 0 836 | designed to achieve an outcome - power at all costs 0 837 | criticism of 0 838 | everything 0 839 | so that we would not become terrorists 1 840 | higher transparency 1 841 | warning 0 842 | does not justify all the means 0 843 | the further 0 844 | artificial obstacles 0 845 | would improve 0 846 | has now made himself heard 1 847 | openly invited 1 848 | argued 0 849 | decisions 0 850 | admittedly 1 851 | still prefer 1 852 | had planned 0 853 | sacrifice himself 0 854 | very pleasant for 1 855 | prevented 0 856 | favorable opinions 1 857 | no one we know to have planned such deeds will escape 0 858 | to express openly dissenting political and religious views 0 859 | confidence 1 860 | no business 0 861 | has 
provoked concern 0 862 | had aligned against 0 863 | under the pretext of fighting terrorism 0 864 | violation of the palestinian people's human rights 0 865 | understanding and approval 1 866 | concerns 0 867 | universal character of the prophets 1 868 | force himself 0 869 | getting mixed up with crime , drugs , and violence 0 870 | bold and fair 1 871 | like an upholder of justice 1 872 | factually inaccurate 0 873 | criticism 0 874 | was shocked 0 875 | saw 1 876 | had refused to accept 0 877 | would make it possible 1 878 | domino effect 0 879 | palestinian hand that is stretched for peace 0 880 | has refused to bow 0 881 | despite the perils 0 882 | will be instrumental 1 883 | concerns 0 884 | those who are really behind it all , those who are behind this business , use the imf 0 885 | accommodate 1 886 | doubts 0 887 | slammed 0 888 | will seek 1 889 | grieving 0 890 | called for 1 891 | isolated and frustrated 0 892 | in the name of self - righteousness 0 893 | intensify and accelerate 0 894 | keeping a wary eye 0 895 | took to the streets 0 896 | gave free rein 0 897 | to rid 1 898 | wanted 1 899 | betrayal 0 900 | the branding 0 901 | collapse 0 902 | i lost any sense of dignity 0 903 | massive intimidatory show of force that was evident in the deployment of security forces 0 904 | unpopular system 0 905 | regarded 0 906 | has posed serious threats 0 907 | to draw up an action campaign aimed 0 908 | illegally 0 909 | reputation was ruined 0 910 | the opposition 0 911 | cracks are appearing 0 912 | could fray the bilateral goodwill 0 913 | was deemed 0 914 | has renounced 0 915 | the cia would organize coups and assassinations to further us interests 0 916 | the definition of a democratic regime is subjected to double standards 0 917 | strike 0 918 | challenged 0 919 | sympathy 0 920 | rubbed their palms at length 1 921 | shook 0 922 | sided with 1 923 | trying to move the goal posts 0 924 | lose popular support among 0 925 | will only invite worse criticism and rejection 0 926 | axis of evil 0 927 | absolutely not permitted 0 928 | found little to object to 0 929 | justify 1 930 | extension of its characteristic policy of hegemony 0 931 | unrealistic 0 932 | warned against 0 933 | crippled 0 934 | well 1 935 | it is ineffective 0 936 | is now promoting 1 937 | called 0 938 | in spite of the good offices 0 939 | genuinely 1 940 | for more than two years 0 941 | tried to encourage 1 942 | loathsome action 0 943 | reckless 0 944 | and killed 0 945 | likening 0 946 | will now agree 1 947 | exalted 1 948 | would like 1 949 | hoped 1 950 | even more dismal 0 951 | gives new life 1 952 | rejected 0 953 | predatory 0 954 | viciously spoke ill 0 955 | in protest against 0 956 | was remarkable 1 957 | -------------------------------------------------------------------------------- /aug_data/stsa.binary/run.log: -------------------------------------------------------------------------------- 1 | /home/xgg/anaconda3/bin/python /home/xgg/pros/cbert_aug/aug_dataset_wo_ft.py 2 | Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 3 | /home/xgg/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. 
4 | from ._conv import register_converters as _register_converters 5 | Namespace(bert_model='bert-base-uncased', data_dir='datasets', do_lower_case=False, learning_rate=5e-05, max_seq_length=64, num_train_epochs=6.0, output_dir='aug_data', seed=42, task_name='stsa.binary', train_batch_size=32, warmup_proportion=0.1) 6 | 03/08/2019 21:02:11 - INFO - pytorch_pretrained_bert.modeling - Model config { 7 | "attention_probs_dropout_prob": 0.1, 8 | "hidden_act": "gelu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 768, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 3072, 13 | "max_position_embeddings": 512, 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "type_vocab_size": 2, 17 | "vocab_size": 30522 18 | } 19 | 20 | 03/08/2019 21:02:14 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] 21 | 03/08/2019 21:02:15 - INFO - __main__ - *** Example *** 22 | 03/08/2019 21:02:15 - INFO - __main__ - guid: train-1 23 | 03/08/2019 21:02:15 - INFO - __main__ - tokens: [CLS] an un ##int ##ent ##ional ##ly surreal kid ' s picture . . . in which actors in bad bear suits en ##act a sort of inter - species parody of a vh1 behind the music episode . [SEP] 24 | 03/08/2019 21:02:15 - INFO - __main__ - init_ids: 101 2019 4895 18447 4765 19301 2135 16524 4845 1005 1055 3861 1012 1012 1012 1999 2029 5889 1999 2919 4562 11072 4372 18908 1037 4066 1997 6970 1011 2427 12354 1997 1037 26365 2369 1996 2189 2792 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 | 03/08/2019 21:02:15 - INFO - __main__ - input_ids: 101 2019 4895 18447 4765 103 2135 103 103 1005 1055 3861 1012 1012 1012 6970 103 5889 1999 2919 4562 11072 4372 18908 1037 4066 1997 6970 1011 2427 12354 1997 1037 26365 2369 1996 2189 26365 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 26 | 03/08/2019 21:02:15 - INFO - __main__ - input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 27 | 03/08/2019 21:02:15 - INFO - __main__ - segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 28 | 03/08/2019 21:02:15 - INFO - __main__ - masked_lm_labels: -1 -1 -1 -1 -1 19301 -1 16524 4845 -1 -1 -1 -1 -1 -1 1999 2029 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 2792 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 29 | 03/08/2019 21:02:15 - INFO - __main__ - *** Example *** 30 | 03/08/2019 21:02:15 - INFO - __main__ - guid: train-2 31 | 03/08/2019 21:02:15 - INFO - __main__ - tokens: [CLS] the kind of film that leaves you scratching your head in amazement over the fact that so many talented people could participate in such an ill - advised and poorly executed idea . 
[SEP] 32 | 03/08/2019 21:02:15 - INFO - __main__ - init_ids: 101 1996 2785 1997 2143 2008 3727 2017 20291 2115 2132 1999 21606 2058 1996 2755 2008 2061 2116 10904 2111 2071 5589 1999 2107 2019 5665 1011 9449 1998 9996 6472 2801 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 | 03/08/2019 21:02:15 - INFO - __main__ - input_ids: 101 1996 2785 1997 2143 2008 3727 2017 20291 2115 2132 1999 103 103 1996 2755 2008 2061 2116 10904 103 2071 5589 1999 2107 2019 5665 1011 9449 6472 9996 103 2801 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 34 | 03/08/2019 21:02:15 - INFO - __main__ - input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 03/08/2019 21:02:15 - INFO - __main__ - segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 | 03/08/2019 21:02:15 - INFO - __main__ - masked_lm_labels: -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 21606 2058 -1 -1 -1 -1 -1 -1 2111 -1 -1 -1 -1 -1 -1 -1 -1 1998 -1 6472 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 37 | 03/08/2019 21:02:17 - INFO - __main__ - ***** Running training ***** 38 | 03/08/2019 21:02:17 - INFO - __main__ - Num examples = 6228 39 | 03/08/2019 21:02:17 - INFO - __main__ - Batch size = 32 40 | 03/08/2019 21:02:17 - INFO - __main__ - Num steps = 1167 41 | constract vocabulary based on frequency 42 | # train data: 6228 43 | # test data: 692 44 | # vocab: 14830 45 | # class: 2 46 | Epoch: 0%| | 0/6 [00:00<?, ?it/s] -------------------------------------------------------------------------------- /aug_dataset.py: -------------------------------------------------------------------------------- 137 | if len(tokens_a) > max_seq_length - 2: 138 | tokens_a = tokens_a[0:(max_seq_length - 2)] 139 | 140 | # Because we use conditional BERT, we need to place the label information in segment_ids 141 | tokens = [] 142 | segment_ids = [] 143 | # are [CLS] and [SEP] needed ? 144 | tokens.append("[CLS]") 145 | segment_ids.append(segment_id) 146 | for token in tokens_a: 147 | tokens.append(token) 148 | segment_ids.append(segment_id) 149 | tokens.append("[SEP]") 150 | segment_ids.append(segment_id) 151 | masked_lm_labels = [-1] * max_seq_length 152 | 153 | cand_indexes = [] 154 | for (i, token) in enumerate(tokens): 155 | if token == "[CLS]" or token == "[SEP]": 156 | continue 157 | cand_indexes.append(i) 158 | 159 | rng.shuffle(cand_indexes) 160 | len_cand = len(cand_indexes) 161 | 162 | output_tokens = list(tokens) 163 | 164 | num_to_predict = min(max_predictions_per_seq, 165 | max(1, int(round(len(tokens) * masked_lm_prob)))) 166 | 167 | masked_lms_pos = [] 168 | covered_indexes = set() 169 | for index in cand_indexes: 170 | if len(masked_lms_pos) >= num_to_predict: 171 | break 172 | if index in covered_indexes: 173 | continue 174 | covered_indexes.add(index) 175 | 176 | masked_token = None 177 | # 80% of the time, replace with [MASK] 178 | if rng.random() < 0.8: 179 | masked_token = "[MASK]" 180 | else: 181 | # 10% of the time, keep original 182 | if rng.random() < 0.5: 183 | masked_token = tokens[index] 184 | # 10% of the time, replace with random word 185 | else: 186 | masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]] 187 | 188 | masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0] 189 | output_tokens[index] = masked_token 190 | masked_lms_pos.append(index) 191 | 192 | init_ids = tokenizer.convert_tokens_to_ids(tokens) 193 | input_ids = tokenizer.convert_tokens_to_ids(output_tokens) 194 | 195 | # The mask has 1 for real tokens and 0 for padding tokens.
Only real 196 | # tokens are attended to. 197 | input_mask = [1] * len(input_ids) 198 | 199 | # Zero-pad up to the sequence length. 200 | while len(input_ids) < max_seq_length: 201 | init_ids.append(0) 202 | input_ids.append(0) 203 | input_mask.append(0) 204 | segment_ids.append(0) # padding positions use segment id 0 (which is also the id of label 0) 205 | 206 | assert len(init_ids) == max_seq_length 207 | assert len(input_ids) == max_seq_length 208 | assert len(input_mask) == max_seq_length 209 | assert len(segment_ids) == max_seq_length 210 | 211 | if ex_index < 2: 212 | logger.info("*** Example ***") 213 | logger.info("guid: %s" % (example.guid)) 214 | logger.info("tokens: %s" % " ".join( 215 | [str(x) for x in tokens])) 216 | logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids])) 217 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 218 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 219 | logger.info( 220 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 221 | logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels])) 222 | 223 | features.append( 224 | InputFeatures(init_ids=init_ids, 225 | input_ids=input_ids, 226 | input_mask=input_mask, 227 | segment_ids=segment_ids, 228 | masked_lm_labels=masked_lm_labels)) 229 | return features 230 | 231 | def rev_wordpiece(toks): 232 | # merge '##' wordpieces back into whole words and drop [PAD] tokens 233 | if len(toks) > 1: 234 | for i in range(len(toks)-1, 0, -1): 235 | if toks[i] == '[PAD]': 236 | del toks[i] 237 | elif len(toks[i]) > 1 and toks[i][0]=='#' and toks[i][1]=='#': 238 | toks[i-1] += toks[i][2:] 239 | del toks[i] # delete by index; list.remove would drop the first equal wordpiece and corrupt sentences with repeated pieces 240 | return " ".join(toks[1:-1]) 241 | 242 | def main(): 243 | parser = argparse.ArgumentParser() 244 | 245 | ## Parameters 246 | parser.add_argument("--data_dir", default="datasets", type=str, 247 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 248 | parser.add_argument("--output_dir", default="aug_data", type=str, 249 | help="The output dir for the augmented dataset") 250 | parser.add_argument("--bert_model", default="bert-base-uncased", type=str, 251 | help="The path of the pretrained BERT model.") 252 | parser.add_argument("--task_name",default="subj",type=str, 253 | help="The name of the task to train.") 254 | parser.add_argument("--max_seq_length", default=64, type=int, 255 | help="The maximum total input sequence length after WordPiece tokenization. \n" 256 | "Sequences longer than this will be truncated, and sequences shorter \n" 257 | "than this will be padded.") 258 | parser.add_argument("--do_lower_case", default=False, action='store_true', 259 | help="Set this flag if you are using an uncased model.") 260 | parser.add_argument("--train_batch_size", default=32, type=int, 261 | help="Total batch size for training.") 262 | parser.add_argument("--learning_rate", default=5e-5, type=float, 263 | help="The initial learning rate for Adam.") 264 | parser.add_argument("--num_train_epochs", default=9.0, type=float, 265 | help="Total number of training epochs to perform.") 266 | parser.add_argument("--warmup_proportion", default=0.1, type=float, 267 | help="Proportion of training to perform linear learning rate warmup for. 
" 268 | "E.g., 0.1 = 10%% of training.") 269 | parser.add_argument('--seed', type=int, default=42, 270 | help="random seed for initialization") 271 | 272 | args = parser.parse_args() 273 | with open("global.config", 'rb') as f: 274 | configs_dict = json.load(f) 275 | 276 | args.task_name = configs_dict.get("dataset") 277 | print(args) 278 | run_aug(args, save_every_epoch=False) 279 | 280 | 281 | def run_aug(args, save_every_epoch=False): 282 | processors = { 283 | # you can your processor here 284 | "TREC": AugProcessor, 285 | "stsa.fine": AugProcessor, 286 | "stsa.binary": AugProcessor, 287 | "mpqa": AugProcessor, 288 | "rt-polarity": AugProcessor, 289 | "subj": AugProcessor, 290 | } 291 | 292 | task_name = args.task_name 293 | if task_name not in processors: 294 | raise ValueError("Task not found: %s" % (task_name)) 295 | args.data_dir = os.path.join(args.data_dir, task_name) 296 | args.output_dir = os.path.join(args.output_dir, task_name) 297 | 298 | random.seed(args.seed) 299 | np.random.seed(args.seed) 300 | torch.manual_seed(args.seed) 301 | os.makedirs(args.output_dir, exist_ok=True) 302 | processor = processors[task_name]() 303 | label_list = processor.get_labels(task_name) 304 | 305 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 306 | 307 | train_examples = None 308 | num_train_steps = None 309 | train_examples = processor.get_train_examples(args.data_dir) 310 | #dev_examples = processor.get_dev_examples(args.data_dir) 311 | #train_examples.extend(dev_examples) 312 | num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs) 313 | 314 | # Prepare model 315 | def load_model(model_name): 316 | weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name) 317 | model = torch.load(weights_path) 318 | return model 319 | cbert_name = "{}/BertForMaskedLM_{}_epoch_10".format(task_name.lower(), task_name.lower()) 320 | model = load_model(cbert_name) 321 | model.cuda() 322 | 323 | # Prepare optimizer 324 | param_optimizer = list(model.named_parameters()) 325 | no_decay = ['bias', 'gamma', 'beta'] 326 | optimizer_grouped_parameters = [ 327 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, 328 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} 329 | ] 330 | t_total = num_train_steps 331 | optimizer = BertAdam(optimizer_grouped_parameters,lr=args.learning_rate, 332 | warmup=args.warmup_proportion,t_total=t_total) 333 | 334 | global_step = 0 335 | train_features = convert_examples_to_features( 336 | train_examples, label_list, args.max_seq_length, tokenizer) 337 | logger.info("***** Running training *****") 338 | logger.info(" Num examples = %d", len(train_examples)) 339 | logger.info(" Batch size = %d", args.train_batch_size) 340 | logger.info(" Num steps = %d", num_train_steps) 341 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) 342 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) 343 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) 344 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) 345 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long) 346 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) 347 | train_sampler = 
RandomSampler(train_data) 348 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 349 | 350 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) 351 | if not os.path.exists(save_model_dir): 352 | os.mkdir(save_model_dir) 353 | MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] 354 | 355 | origin_train_path = os.path.join(args.output_dir, "train_origin.tsv") 356 | save_train_path = os.path.join(args.output_dir, "train.tsv") 357 | shutil.copy(origin_train_path, save_train_path) 358 | best_test_acc = train_text_classifier.train("aug_data") 359 | print("before augment best acc:{}".format(best_test_acc)) 360 | 361 | for e in trange(int(args.num_train_epochs), desc="Epoch"): 362 | avg_loss = 0. 363 | 364 | for step, batch in enumerate(train_dataloader): 365 | model.train() 366 | batch = tuple(t.cuda() for t in batch) 367 | _, input_ids, input_mask, segment_ids, masked_ids = batch 368 | loss = model(input_ids, segment_ids, input_mask, masked_ids) 369 | loss.backward() 370 | avg_loss += loss.item() 371 | optimizer.step() 372 | model.zero_grad() 373 | if (step + 1) % 50 == 0: 374 | print("avg_loss: {}".format(avg_loss / 50)) 375 | avg_loss = 0 376 | torch.cuda.empty_cache() 377 | shutil.copy(origin_train_path, save_train_path) 378 | save_train_file = open(save_train_path, 'a') 379 | tsv_writer = csv.writer(save_train_file, delimiter='\t') 380 | #tsv_writer.writerow(['sentence', 'label']) 381 | for step, batch in enumerate(train_dataloader): 382 | model.eval() 383 | batch = tuple(t.cuda() for t in batch) 384 | init_ids, _, input_mask, segment_ids, _ = batch 385 | input_lens = [sum(mask).item() for mask in input_mask] 386 | #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens]) 387 | masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens]) 388 | for ids, idx in zip(init_ids,masked_idx): 389 | ids[idx] = MASK_id 390 | predictions = model(init_ids, segment_ids, input_mask) 391 | for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids): 392 | #pred = torch.argsort(pred)[:,-e-1][idx] 393 | ''' 394 | pred = torch.argsort(preds)[:,-1][idx] 395 | ids[idx] = pred 396 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) 397 | new_str = rev_wordpiece(new_str) 398 | tsv_writer.writerow([new_str, seg[0].item()]) 399 | ''' 400 | pred = torch.argsort(preds)[:, -2][idx] 401 | ids[idx] = pred 402 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) 403 | new_str = rev_wordpiece(new_str) 404 | tsv_writer.writerow([new_str, seg[0].item()]) 405 | torch.cuda.empty_cache() 406 | predictions = predictions.detach().cpu() 407 | torch.cuda.empty_cache() 408 | bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e)) 409 | shutil.copy(save_train_path, bak_train_path) 410 | best_test_acc = train_text_classifier.train("aug_data") 411 | print("epoch {} augment best acc:{}".format(e, best_test_acc)) 412 | if save_every_epoch: 413 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 414 | save_model_path = os.path.join(save_model_dir, save_model_name) 415 | torch.save(model, save_model_path) 416 | else: 417 | if (e + 1) % 10 == 0: 418 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 419 | save_model_path = os.path.join(save_model_dir, save_model_name) 420 | torch.save(model, save_model_path) 421 | 422 | 423 | if __name__ == "__main__": 424 | main() 425 | 
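Both augmentation scripts generate new sentences the same way: a few positions in each training sentence are overwritten with [MASK], the (conditional) masked LM is run, and the second-most-likely token at each masked position replaces the original one, so the rewrite is less likely to simply echo the input token. A minimal, self-contained sketch of that selection step (the function name and the dummy shapes below are illustrative, not from the repo):

import torch

def second_best_tokens(logits, masked_idx):
    # logits: [seq_len, vocab_size] masked-LM scores for one sentence.
    # torch.argsort sorts ascending along the vocab axis, so [:, -1] is the
    # argmax and [:, -2] is the runner-up token id at every position.
    ranked = torch.argsort(logits, dim=-1)
    return ranked[:, -2][masked_idx]

logits = torch.randn(64, 30522)         # max_seq_length x BERT vocab size
masked_idx = torch.tensor([5, 11, 20])  # positions that were set to [MASK]
print(second_best_tokens(logits, masked_idx))  # three replacement token ids

This mirrors the `torch.argsort(preds)[:, -2][idx]` line in both training loops; the commented-out block next to it is the top-1 (`[:, -1]`) variant.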
-------------------------------------------------------------------------------- /aug_dataset_wo_ft.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import csv 6 | import os 7 | import shutil 8 | import logging 9 | import argparse 10 | import random 11 | from tqdm import tqdm, trange 12 | import json 13 | 14 | import numpy as np 15 | import torch 16 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 17 | from torch.utils.data.distributed import DistributedSampler 18 | 19 | from pytorch_pretrained_bert.tokenization import BertTokenizer 20 | from pytorch_pretrained_bert.modeling import BertForMaskedLM 21 | from pytorch_pretrained_bert.optimization import BertAdam 22 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 23 | 24 | import train_text_classifier 25 | 26 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S', 28 | level=logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | class DataProcessor(object): 33 | """Base class for data converters for sequence classification data sets.""" 34 | 35 | def get_train_examples(self, data_dir): 36 | """Gets a collection of `InputExample`s for the train set.""" 37 | raise NotImplementedError() 38 | 39 | def get_dev_examples(self, data_dir): 40 | """Gets a collection of `InputExample`s for the dev set.""" 41 | raise NotImplementedError() 42 | 43 | def get_labels(self): 44 | """Gets the list of labels for this data set.""" 45 | raise NotImplementedError() 46 | 47 | @classmethod 48 | def _read_tsv(cls, input_file, quotechar=None): 49 | """Reads a tab separated value file.""" 50 | with open(input_file, "r") as f: 51 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 52 | lines = [] 53 | for line in reader: 54 | lines.append(line) 55 | return lines 56 | 57 | 58 | class InputExample(object): 59 | """A single training/test example for simple sequence classification.""" 60 | 61 | def __init__(self, guid, text_a, label=None): 62 | """Constructs an InputExample. 63 | 64 | Args: 65 | guid: Unique id for the example. 66 | text_a: string. The untokenized text of the first sequence. For single 67 | sequence tasks, only this sequence must be specified.
68 | """ 69 | self.guid = guid 70 | self.text_a = text_a 71 | self.label = label 72 | 73 | 74 | class InputFeatures(object): 75 | """A single set of features of data.""" 76 | 77 | def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels): 78 | self.init_ids = init_ids 79 | self.input_ids = input_ids 80 | self.input_mask = input_mask 81 | self.segment_ids = segment_ids 82 | self.masked_lm_labels = masked_lm_labels 83 | 84 | class AugProcessor(DataProcessor): 85 | """Processor for dataset to be augmented.""" 86 | 87 | def get_train_examples(self, data_dir): 88 | """See base class.""" 89 | return self._create_examples( 90 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 91 | 92 | def get_dev_examples(self, data_dir): 93 | """See base class.""" 94 | return self._create_examples( 95 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 96 | 97 | def get_labels(self, name): 98 | """add your dataset here""" 99 | if name in ['stsa.binary', 'mpqa', 'rt-polarity', 'subj']: 100 | return ["0", "1"] 101 | elif name in ['stsa.fine']: 102 | return ["0", "1", "2", "3", "4"] 103 | elif name in ['TREC']: 104 | return ["0", "1", "2", "3", "4", "5"] 105 | 106 | def _create_examples(self, lines, set_type): 107 | """Creates examples for the training and dev sets.""" 108 | examples = [] 109 | for (i, line) in enumerate(lines): 110 | if i == 0: 111 | continue 112 | guid = "%s-%s" % (set_type, i) 113 | text_a = line[0] 114 | label = line[-1] 115 | examples.append( 116 | InputExample(guid=guid, text_a=text_a, label=label)) 117 | return examples 118 | 119 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): 120 | """Loads a data file into a list of `InputBatch`s.""" 121 | 122 | label_map = {} 123 | for (i, label) in enumerate(label_list): 124 | label_map[label] = i 125 | 126 | features = [] 127 | # ---- 128 | dupe_factor = 5 129 | masked_lm_prob = 0.15 130 | max_predictions_per_seq = 20 131 | rng = random.Random(12345) 132 | 133 | for (ex_index, example) in enumerate(examples): 134 | tokens_a = tokenizer.tokenize(example.text_a) 135 | segment_id = label_map[example.label] 136 | # Account for [CLS] and [SEP] with "- 2" 137 | if len(tokens_a) > max_seq_length - 2: 138 | tokens_a = tokens_a[0:(max_seq_length - 2)] 139 | 140 | # 由于是CMLM,所以需要用标签 141 | tokens = [] 142 | segment_ids = [] 143 | # is [CLS]和[SEP] needed ? 
144 | tokens.append("[CLS]") 145 | segment_ids.append(segment_id) 146 | for token in tokens_a: 147 | tokens.append(token) 148 | segment_ids.append(segment_id) 149 | tokens.append("[SEP]") 150 | segment_ids.append(segment_id) 151 | masked_lm_labels = [-1] * max_seq_length 152 | 153 | cand_indexes = [] 154 | for (i, token) in enumerate(tokens): 155 | if token == "[CLS]" or token == "[SEP]": 156 | continue 157 | cand_indexes.append(i) 158 | 159 | rng.shuffle(cand_indexes) 160 | len_cand = len(cand_indexes) 161 | 162 | output_tokens = list(tokens) 163 | 164 | num_to_predict = min(max_predictions_per_seq, 165 | max(1, int(round(len(tokens) * masked_lm_prob)))) 166 | 167 | masked_lms_pos = [] 168 | covered_indexes = set() 169 | for index in cand_indexes: 170 | if len(masked_lms_pos) >= num_to_predict: 171 | break 172 | if index in covered_indexes: 173 | continue 174 | covered_indexes.add(index) 175 | 176 | masked_token = None 177 | # 80% of the time, replace with [MASK] 178 | if rng.random() < 0.8: 179 | masked_token = "[MASK]" 180 | else: 181 | # 10% of the time, keep original 182 | if rng.random() < 0.5: 183 | masked_token = tokens[index] 184 | # 10% of the time, replace with random word 185 | else: 186 | masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]] 187 | 188 | masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0] 189 | output_tokens[index] = masked_token 190 | masked_lms_pos.append(index) 191 | 192 | init_ids = tokenizer.convert_tokens_to_ids(tokens) 193 | input_ids = tokenizer.convert_tokens_to_ids(output_tokens) 194 | 195 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 196 | # tokens are attended to. 197 | input_mask = [1] * len(input_ids) 198 | 199 | # Zero-pad up to the sequence length. 200 | while len(input_ids) < max_seq_length: 201 | init_ids.append(0) 202 | input_ids.append(0) 203 | input_mask.append(0) 204 | segment_ids.append(0) # padding positions use segment id 0 (which is also the id of label 0) 205 | 206 | assert len(init_ids) == max_seq_length 207 | assert len(input_ids) == max_seq_length 208 | assert len(input_mask) == max_seq_length 209 | assert len(segment_ids) == max_seq_length 210 | 211 | if ex_index < 2: 212 | logger.info("*** Example ***") 213 | logger.info("guid: %s" % (example.guid)) 214 | logger.info("tokens: %s" % " ".join( 215 | [str(x) for x in tokens])) 216 | logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids])) 217 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 218 | logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 219 | logger.info( 220 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 221 | logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels])) 222 | 223 | features.append( 224 | InputFeatures(init_ids=init_ids, 225 | input_ids=input_ids, 226 | input_mask=input_mask, 227 | segment_ids=segment_ids, 228 | masked_lm_labels=masked_lm_labels)) 229 | return features 230 | 231 | def rev_wordpiece(toks): 232 | # merge '##' wordpieces back into whole words and drop [PAD] tokens 233 | if len(toks) > 1: 234 | for i in range(len(toks)-1, 0, -1): 235 | if toks[i] == '[PAD]': 236 | del toks[i] 237 | elif len(toks[i]) > 1 and toks[i][0]=='#' and toks[i][1]=='#': 238 | toks[i-1] += toks[i][2:] 239 | del toks[i] # delete by index; list.remove would drop the first equal wordpiece and corrupt sentences with repeated pieces 240 | return " ".join(toks[1:-1]) 241 | 242 | def main(): 243 | parser = argparse.ArgumentParser() 244 | 245 | ## Parameters 246 | parser.add_argument("--data_dir", default="datasets", type=str, 247 | help="The input data dir.
Should contain the .tsv files (or other data files) for the task.") 248 | parser.add_argument("--output_dir", default="aug_data", type=str, 249 | help="The output dir for the augmented dataset") 250 | parser.add_argument("--bert_model", default="bert-base-uncased", type=str, 251 | help="The path of the pretrained BERT model.") 252 | parser.add_argument("--task_name",default="subj",type=str, 253 | help="The name of the task to train.") 254 | parser.add_argument("--max_seq_length", default=64, type=int, 255 | help="The maximum total input sequence length after WordPiece tokenization. \n" 256 | "Sequences longer than this will be truncated, and sequences shorter \n" 257 | "than this will be padded.") 258 | parser.add_argument("--do_lower_case", default=False, action='store_true', 259 | help="Set this flag if you are using an uncased model.") 260 | parser.add_argument("--train_batch_size", default=32, type=int, 261 | help="Total batch size for training.") 262 | parser.add_argument("--learning_rate", default=5e-5, type=float, 263 | help="The initial learning rate for Adam.") 264 | parser.add_argument("--num_train_epochs", default=9.0, type=float, 265 | help="Total number of training epochs to perform.") 266 | parser.add_argument("--warmup_proportion", default=0.1, type=float, 267 | help="Proportion of training to perform linear learning rate warmup for. " 268 | "E.g., 0.1 = 10%% of training.") 269 | parser.add_argument('--seed', type=int, default=42, 270 | help="random seed for initialization") 271 | 272 | args = parser.parse_args() 273 | with open("global.config", 'rb') as f: 274 | configs_dict = json.load(f) 275 | 276 | args.task_name = configs_dict.get("dataset") 277 | print(args) 278 | run_aug(args, save_every_epoch=False) 279 | 280 | 281 | def run_aug(args, save_every_epoch=False): 282 | processors = { 283 | # you can add your processor here 284 | "TREC": AugProcessor, 285 | "stsa.fine": AugProcessor, 286 | "stsa.binary": AugProcessor, 287 | "mpqa": AugProcessor, 288 | "rt-polarity": AugProcessor, 289 | "subj": AugProcessor, 290 | } 291 | 292 | task_name = args.task_name 293 | if task_name not in processors: 294 | raise ValueError("Task not found: %s" % (task_name)) 295 | args.data_dir = os.path.join(args.data_dir, task_name) 296 | args.output_dir = os.path.join(args.output_dir, task_name) 297 | 298 | random.seed(args.seed) 299 | np.random.seed(args.seed) 300 | torch.manual_seed(args.seed) 301 | os.makedirs(args.output_dir, exist_ok=True) 302 | processor = processors[task_name]() 303 | label_list = processor.get_labels(task_name) 304 | 305 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 306 | 307 | train_examples = None 308 | num_train_steps = None 309 | train_examples = processor.get_train_examples(args.data_dir) 310 | #dev_examples = processor.get_dev_examples(args.data_dir) 311 | #train_examples.extend(dev_examples) 312 | num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs) 313 | 314 | # Prepare model 315 | model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) 316 | 317 | if task_name == 'stsa.fine': 318 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768) 319 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02) 320 | elif task_name == 'TREC': 321 | model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768) 322 | model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02) 323 | 
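# Conditional BERT encodes the class label through the token-type (segment)
# embedding, and pretrained BERT ships a token-type table of size 2 only, so
# tasks with more than two labels (stsa.fine: 5, TREC: 6) get the freshly
# initialized, larger token_type embedding above; those new embeddings are then
# trained during the augmentation epochs below.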
324 | model.cuda() 325 | 326 | # Prepare optimizer 327 | param_optimizer = list(model.named_parameters()) 328 | no_decay = ['bias', 'gamma', 'beta'] 329 | optimizer_grouped_parameters = [ 330 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, 331 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} 332 | ] 333 | t_total = num_train_steps 334 | optimizer = BertAdam(optimizer_grouped_parameters,lr=args.learning_rate, 335 | warmup=args.warmup_proportion,t_total=t_total) 336 | 337 | global_step = 0 338 | train_features = convert_examples_to_features( 339 | train_examples, label_list, args.max_seq_length, tokenizer) 340 | logger.info("***** Running training *****") 341 | logger.info(" Num examples = %d", len(train_examples)) 342 | logger.info(" Batch size = %d", args.train_batch_size) 343 | logger.info(" Num steps = %d", num_train_steps) 344 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) 345 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) 346 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) 347 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) 348 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long) 349 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) 350 | train_sampler = RandomSampler(train_data) 351 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 352 | 353 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) 354 | if not os.path.exists(save_model_dir): 355 | os.mkdir(save_model_dir) 356 | MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] 357 | 358 | origin_train_path = os.path.join(args.output_dir, "train_origin.tsv") 359 | save_train_path = os.path.join(args.output_dir, "train.tsv") 360 | shutil.copy(origin_train_path, save_train_path) 361 | best_test_acc = train_text_classifier.train("aug_data") 362 | print("before augment best acc:{}".format(best_test_acc)) 363 | 364 | for e in trange(int(args.num_train_epochs), desc="Epoch"): 365 | avg_loss = 0. 
366 | 367 | for step, batch in enumerate(train_dataloader): 368 | model.train() 369 | batch = tuple(t.cuda() for t in batch) 370 | _, input_ids, input_mask, segment_ids, masked_ids = batch 371 | loss = model(input_ids, segment_ids, input_mask, masked_ids) 372 | loss.backward() 373 | avg_loss += loss.item() 374 | optimizer.step() 375 | model.zero_grad() 376 | if (step + 1) % 50 == 0: 377 | print("avg_loss: {}".format(avg_loss / 50)) 378 | avg_loss = 0 379 | torch.cuda.empty_cache() 380 | shutil.copy(origin_train_path, save_train_path) 381 | save_train_file = open(save_train_path, 'a') 382 | tsv_writer = csv.writer(save_train_file, delimiter='\t') 383 | #tsv_writer.writerow(['sentence', 'label']) 384 | for step, batch in enumerate(train_dataloader): 385 | model.eval() 386 | batch = tuple(t.cuda() for t in batch) 387 | init_ids, _, input_mask, segment_ids, _ = batch 388 | input_lens = [sum(mask).item() for mask in input_mask] 389 | #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens]) 390 | masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens]) 391 | for ids, idx in zip(init_ids,masked_idx): 392 | ids[idx] = MASK_id 393 | predictions = model(init_ids, segment_ids, input_mask) 394 | for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids): 395 | #pred = torch.argsort(pred)[:,-e-1][idx] 396 | ''' 397 | pred = torch.argsort(preds)[:,-1][idx] 398 | ids[idx] = pred 399 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) 400 | new_str = rev_wordpiece(new_str) 401 | tsv_writer.writerow([new_str, seg[0].item()]) 402 | ''' 403 | pred = torch.argsort(preds)[:, -2][idx] 404 | ids[idx] = pred 405 | new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) 406 | new_str = rev_wordpiece(new_str) 407 | tsv_writer.writerow([new_str, seg[0].item()]) 408 | torch.cuda.empty_cache() 409 | predictions = predictions.detach().cpu() 410 | torch.cuda.empty_cache() 411 | bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e)) 412 | shutil.copy(save_train_path, bak_train_path) 413 | best_test_acc = train_text_classifier.train("aug_data") 414 | print("epoch {} augment best acc:{}".format(e, best_test_acc)) 415 | if save_every_epoch: 416 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 417 | save_model_path = os.path.join(save_model_dir, save_model_name) 418 | torch.save(model, save_model_path) 419 | else: 420 | if (e + 1) % 10 == 0: 421 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 422 | save_model_path = os.path.join(save_model_dir, save_model_name) 423 | torch.save(model, save_model_path) 424 | 425 | 426 | if __name__ == "__main__": 427 | main() 428 | -------------------------------------------------------------------------------- /datasets/TREC/test.tsv: -------------------------------------------------------------------------------- 1 | sentence label 2 | How far is it from Denver to Aspen ? 5 3 | What county is Modesto , California in ? 4 4 | Who was Galileo ? 3 5 | What is an atom ? 0 6 | When did Hawaii become a state ? 5 7 | How tall is the Sears Building ? 5 8 | George Bush purchased a small interest in which baseball team ? 3 9 | What is Australia 's national flower ? 1 10 | Why does the moon turn orange ? 0 11 | What is autism ? 0 12 | What city had a world fair in 1900 ? 4 13 | What person 's head is on a dime ? 3 14 | What is the average weight of a Yellow Labrador ? 5 15 | Who was the first man to fly across the Pacific Ocean ? 
3 16 | When did Idaho become a state ? 5 17 | What is the life expectancy for crickets ? 5 18 | What metal has the highest melting point ? 1 19 | Who developed the vaccination against polio ? 3 20 | What is epilepsy ? 0 21 | What year did the Titanic sink ? 5 22 | Who was the first American to walk in space ? 3 23 | What is a biosphere ? 0 24 | What river in the US is known as the Big Muddy ? 4 25 | What is bipolar disorder ? 0 26 | What is cholesterol ? 0 27 | Who developed the Macintosh computer ? 3 28 | What is caffeine ? 0 29 | What imaginary line is halfway between the North and South Poles ? 4 30 | Where is John Wayne airport ? 4 31 | What hemisphere is the Philippines in ? 4 32 | What is the average speed of the horses at the Kentucky Derby ? 5 33 | Where are the Rocky Mountains ? 4 34 | What are invertebrates ? 0 35 | What is the temperature at the center of the earth ? 5 36 | When did John F. Kennedy get elected as President ? 5 37 | How old was Elvis Presley when he died ? 5 38 | Where is the Orinoco River ? 4 39 | How far is the service line from the net in tennis ? 5 40 | How much fiber should you have per day ? 5 41 | How many Great Lakes are there ? 5 42 | Material called linen is made from what plant ? 1 43 | What is Teflon ? 0 44 | What is amitriptyline ? 0 45 | What is a shaman ? 0 46 | What is the proper name for a female walrus ? 1 47 | What is a group of turkeys called ? 1 48 | How long did Rip Van Winkle sleep ? 5 49 | What are triglycerides ? 0 50 | How many liters in a gallon ? 5 51 | What is the name of the chocolate company in San Francisco ? 3 52 | What are amphibians ? 0 53 | Who discovered x-rays ? 3 54 | Which comedian 's signature line is `` Can we talk '' ? 3 55 | What is fibromyalgia ? 0 56 | What is done with worn or outdated flags ? 0 57 | What does cc in engines mean ? 0 58 | When did Elvis Presley die ? 5 59 | What is the capital of Yugoslavia ? 4 60 | Where is Milan ? 4 61 | What is the speed hummingbirds fly ? 5 62 | What is the oldest city in the United States ? 4 63 | What was W.C. Fields ' real name ? 3 64 | What river flows between Fargo , North Dakota and Moorhead , Minnesota ? 4 65 | What do bats eat ? 1 66 | What state did the Battle of Bighorn take place in ? 4 67 | Who was Abraham Lincoln ? 3 68 | What do you call a newborn kangaroo ? 1 69 | What are spider veins ? 0 70 | What day and month did John Lennon die ? 5 71 | What strait separates North America from Asia ? 4 72 | What is the population of Seattle ? 5 73 | How much was a ticket for the Titanic ? 5 74 | What is the largest city in the world ? 4 75 | What American composer wrote the music for `` West Side Story '' ? 3 76 | Where is the Mall of the America ? 4 77 | What is the pH scale ? 0 78 | What type of currency is used in Australia ? 1 79 | How tall is the Gateway Arch in St. Louis , MO ? 5 80 | How much does the human adult female brain weigh ? 5 81 | Who was the first governor of Alaska ? 3 82 | What is a prism ? 0 83 | When was the first liver transplant ? 5 84 | Who was elected president of South Africa in 1994 ? 3 85 | What is the population of China ? 5 86 | When was Rosa Parks born ? 5 87 | Why is a ladybug helpful ? 0 88 | What is amoxicillin ? 0 89 | Who was the first female United States Representative ? 3 90 | What are xerophytes ? 0 91 | What country did Ponce de Leon come from ? 4 92 | The U.S. Department of Treasury first issued paper currency for the U.S. during which war ? 1 93 | What is desktop publishing ? 0 94 | What is the temperature of the sun 's surface ? 
5 95 | What year did Canada join the United Nations ? 5 96 | What is the oldest university in the US ? 3 97 | Where is Prince Edward Island ? 4 98 | Mercury , what year was it discovered ? 5 99 | What is cryogenics ? 0 100 | What are coral reefs ? 0 101 | What is the longest major league baseball-winning streak ? 1 102 | What is neurology ? 0 103 | Who invented the calculator ? 3 104 | How do you measure earthquakes ? 0 105 | Who is Duke Ellington ? 3 106 | What county is Phoenix , AZ in ? 4 107 | What is a micron ? 0 108 | The sun 's core , what is the temperature ? 5 109 | What is the Ohio state bird ? 1 110 | When were William Shakespeare 's twins born ? 5 111 | What is the highest dam in the U.S. ? 4 112 | What color is a poison arrow frog ? 1 113 | What is acupuncture ? 0 114 | What is the length of the coastline of the state of Alaska ? 5 115 | What is the name of Neil Armstrong 's wife ? 3 116 | What is Hawaii 's state flower ? 1 117 | Who won Ms. American in 1989 ? 3 118 | When did the Hindenberg crash ? 5 119 | What mineral helps prevent osteoporosis ? 1 120 | What was the last year that the Chicago Cubs won the World Series ? 5 121 | Where is Perth ? 4 122 | What year did WWII begin ? 5 123 | What is the diameter of a golf ball ? 5 124 | What is an eclipse ? 0 125 | Who discovered America ? 3 126 | What is the earth 's diameter ? 5 127 | Which president was unmarried ? 3 128 | How wide is the Milky Way galaxy ? 5 129 | During which season do most thunderstorms occur ? 5 130 | What is Wimbledon ? 0 131 | What is the gestation period for a cat ? 5 132 | How far is a nautical mile ? 5 133 | Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ? 3 134 | What does target heart rate mean ? 0 135 | What was the first satellite to go into space ? 1 136 | What is foreclosure ? 0 137 | What is the major fault line near Kentucky ? 1 138 | Where is the Holland Tunnel ? 4 139 | Who wrote the hymn `` Amazing Grace '' ? 3 140 | What position did Willie Davis play in baseball ? 3 141 | What are platelets ? 0 142 | What is severance pay ? 0 143 | What is the name of Roy Roger 's dog ? 1 144 | Where are the National Archives ? 4 145 | What is a baby turkey called ? 1 146 | What is poliomyelitis ? 0 147 | What is the longest bone in the human body ? 1 148 | Who is a German philosopher ? 3 149 | What were Christopher Columbus ' three ships ? 1 150 | What does Phi Beta Kappa mean ? 0 151 | What is nicotine ? 0 152 | What is another name for vitamin B1 ? 1 153 | Who discovered radium ? 3 154 | What are sunspots ? 0 155 | When was Algeria colonized ? 5 156 | What baseball team was the first to make numbers part of their uniform ? 3 157 | What continent is Egypt on ? 4 158 | What is the capital of Mongolia ? 4 159 | What is nanotechnology ? 0 160 | In the late 1700 's British convicts were used to populate which colony ? 4 161 | What state is the geographic center of the lower 48 states ? 4 162 | What is an obtuse angle ? 0 163 | What are polymers ? 0 164 | When is hurricane season in the Caribbean ? 5 165 | Where is the volcano Mauna Loa ? 4 166 | What is another astronomic term for the Northern Lights ? 1 167 | What peninsula is Spain part of ? 4 168 | When was Lyndon B. Johnson born ? 5 169 | What is acetaminophen ? 0 170 | What state has the least amount of rain per year ? 4 171 | Who founded American Red Cross ? 3 172 | What year did the Milwaukee Braves become the Atlanta Braves ? 5 173 | How fast is alcohol absorbed ? 5 174 | When is the summer solstice ? 
5 175 | What is supernova ? 0 176 | Where is the Shawnee National Forest ? 4 177 | What U.S. state 's motto is `` Live free or Die '' ? 4 178 | Where is the Lourve ? 4 179 | When was the first stamp issued ? 5 180 | What primary colors do you mix to make orange ? 1 181 | How far is Pluto from the sun ? 5 182 | What body of water are the Canary Islands in ? 4 183 | What is neuropathy ? 0 184 | Where is the Euphrates River ? 4 185 | What is cryptography ? 0 186 | What is natural gas composed of ? 1 187 | Who is the Prime Minister of Canada ? 3 188 | What French ruler was defeated at the battle of Waterloo ? 3 189 | What is leukemia ? 0 190 | Where did Howard Hughes die ? 4 191 | What is the birthstone for June ? 1 192 | What is the sales tax in Minnesota ? 1 193 | What is the distance in miles from the earth to the sun ? 5 194 | What is the average life span for a chicken ? 5 195 | When was the first Wal-Mart store opened ? 5 196 | What is relative humidity ? 0 197 | What city has the zip code of 35824 ? 4 198 | What currency is used in Algeria ? 1 199 | Who invented the hula hoop ? 3 200 | What was the most popular toy in 1957 ? 1 201 | What is pastrami made of ? 1 202 | What is the name of the satellite that the Soviet Union sent into space in 1957 ? 1 203 | What city 's newspaper is called `` The Enquirer '' ? 4 204 | Who invented the slinky ? 3 205 | What are the animals that don 't have backbones called ? 1 206 | What is the melting point of copper ? 5 207 | Where is the volcano Olympus Mons located ? 4 208 | Who was the 23rd president of the United States ? 3 209 | What is the average body temperature ? 5 210 | What does a defibrillator do ? 0 211 | What is the effect of acid rain ? 0 212 | What year did the United States abolish the draft ? 5 213 | How fast is the speed of light ? 5 214 | What province is Montreal in ? 4 215 | What New York City structure is also known as the Twin Towers ? 4 216 | What is fungus ? 0 217 | What is the most frequently spoken language in the Netherlands ? 1 218 | What is sodium chloride ? 0 219 | What are the spots on dominoes called ? 1 220 | How many pounds in a ton ? 5 221 | What is influenza ? 0 222 | What is ozone depletion ? 0 223 | What year was the Mona Lisa painted ? 5 224 | What does `` Sitting Shiva '' mean ? 0 225 | What is the electrical output in Madrid , Spain ? 1 226 | Which mountain range in North America stretches from Maine to Georgia ? 4 227 | What is plastic made of ? 1 228 | What is the population of Nigeria ? 5 229 | What does your spleen do ? 0 230 | Where is the Grand Canyon ? 4 231 | Who invented the telephone ? 3 232 | What year did the U.S. buy Alaska ? 5 233 | What is the name of the leader of Ireland ? 3 234 | What is phenylalanine ? 0 235 | How many gallons of water are there in a cubic foot ? 5 236 | What are the two houses of the Legislative branch ? 1 237 | What is sonar ? 0 238 | In Poland , where do most people live ? 4 239 | What is phosphorus ? 0 240 | What is the location of the Sea of Tranquility ? 4 241 | How fast is sound ? 5 242 | What French province is cognac produced in ? 4 243 | What is Valentine 's Day ? 0 244 | What causes gray hair ? 0 245 | What is hypertension ? 0 246 | What is bandwidth ? 0 247 | What is the longest suspension bridge in the U.S. ? 4 248 | What is a parasite ? 0 249 | What is home equity ? 0 250 | What do meteorologists do ? 0 251 | What is the criterion for being legally blind ? 1 252 | Who is the tallest man in the world ? 3 253 | What are the twin cities ? 
4 254 | What did Edward Binney and Howard Smith invent in 1903 ? 1 255 | What is the statue of liberty made of ? 1 256 | What is pilates ? 0 257 | What planet is known as the `` red '' planet ? 4 258 | What is the depth of the Nile river ? 5 259 | What is the colorful Korean traditional dress called ? 1 260 | What is Mardi Gras ? 0 261 | Mexican pesos are worth what in U.S. dollars ? 5 262 | Who was the first African American to play for the Brooklyn Dodgers ? 3 263 | Who was the first Prime Minister of Canada ? 3 264 | How many Admirals are there in the U.S. Navy ? 5 265 | What instrument did Glenn Miller play ? 1 266 | How old was Joan of Arc when she died ? 5 267 | What does the word fortnight mean ? 0 268 | What is dianetics ? 0 269 | What is the capital of Ethiopia ? 4 270 | For how long is an elephant pregnant ? 5 271 | How did Janice Joplin die ? 0 272 | What is the primary language in Iceland ? 1 273 | What is the difference between AM radio stations and FM radio stations ? 0 274 | What is osteoporosis ? 0 275 | Who was the first woman governor in the U.S. ? 3 276 | What is peyote ? 0 277 | What is the esophagus used for ? 0 278 | What is viscosity ? 0 279 | What year did Oklahoma become a state ? 5 280 | What is the abbreviation for Texas ? 2 281 | What is a mirror made out of ? 1 282 | Where on the body is a mortarboard worn ? 4 283 | What was J.F.K. 's wife 's name ? 3 284 | What does I.V. stand for ? 2 285 | What is the chunnel ? 0 286 | Where is Hitler buried ? 4 287 | What are antacids ? 0 288 | What is pulmonary fibrosis ? 0 289 | What are Quaaludes ? 0 290 | What is naproxen ? 0 291 | What is strep throat ? 0 292 | What is the largest city in the U.S. ? 4 293 | What is foot and mouth disease ? 1 294 | What is the life expectancy of a dollar bill ? 5 295 | What do you call a professional map drawer ? 1 296 | What are Aborigines ? 0 297 | What is hybridization ? 0 298 | What color is indigo ? 1 299 | How old do you have to be in order to rent a car in Italy ? 5 300 | What does a barometer measure ? 1 301 | What color is a giraffe 's tongue ? 1 302 | What does USPS stand for ? 2 303 | What year did the NFL go on strike ? 5 304 | What is solar wind ? 0 305 | What date did Neil Armstrong land on the moon ? 5 306 | When was Hiroshima bombed ? 5 307 | Where is the Savannah River ? 4 308 | Who was the first woman killed in the Vietnam War ? 3 309 | What planet has the strongest magnetic field of all the planets ? 4 310 | Who is the governor of Alaska ? 3 311 | What year did Mussolini seize power in Italy ? 5 312 | What is the capital of Persia ? 4 313 | Where is the Eiffel Tower ? 4 314 | How many hearts does an octopus have ? 5 315 | What is pneumonia ? 0 316 | What is the deepest lake in the US ? 4 317 | What is a fuel cell ? 0 318 | Who was the first U.S. president to appear on TV ? 3 319 | Where is the Little League Museum ? 4 320 | What are the two types of twins ? 1 321 | What is the brightest star ? 4 322 | What is diabetes ? 0 323 | When was President Kennedy shot ? 5 324 | What is TMJ ? 2 325 | What color is yak milk ? 1 326 | What date was Dwight D. Eisenhower born ? 5 327 | What does the technical term ISDN mean ? 2 328 | Why is the sun yellow ? 0 329 | What is the conversion rate between dollars and pounds ? 5 330 | When was Abraham Lincoln born ? 5 331 | What is the Milky Way ? 0 332 | What is mold ? 0 333 | What year was Mozart born ? 5 334 | What is a group of frogs called ? 1 335 | What is the name of William Penn 's ship ? 
1 336 | What is the melting point of gold ? 5 337 | What is the street address of the White House ? 4 338 | What is semolina ? 0 339 | What fruit is Melba sauce made from ? 1 340 | What is Ursa Major ? 0 341 | What is the percentage of water content in the human body ? 5 342 | How much does water weigh ? 5 343 | What was President Lyndon Johnson 's reform program called ? 1 344 | What is the murder rate in Windsor , Ontario ? 5 345 | Who is the only president to serve 2 non-consecutive terms ? 3 346 | What is the population of Australia ? 5 347 | Who painted the ceiling of the Sistine Chapel ? 3 348 | Name a stimulant . 1 349 | What is the effect of volcanoes on the climate ? 0 350 | What year did the Andy Griffith show begin ? 5 351 | What is acid rain ? 0 352 | What is the date of Mexico 's independence ? 5 353 | What is the location of Lake Champlain ? 4 354 | What is the Illinois state flower ? 1 355 | What is Maryland 's state bird ? 1 356 | What is quicksilver ? 0 357 | Who wrote `` The Divine Comedy '' ? 3 358 | What is the speed of light ? 5 359 | What is the width of a football field ? 5 360 | Why in tennis are zero points called love ? 0 361 | What kind of dog was Toto in the Wizard of Oz ? 1 362 | What is a thyroid ? 0 363 | What does ciao mean ? 0 364 | What is the only artery that carries blue blood from the heart to the lungs ? 1 365 | How often does Old Faithful erupt at Yellowstone National Park ? 5 366 | What is acetic acid ? 0 367 | What is the elevation of St. Louis , MO ? 5 368 | What color does litmus paper turn when it comes into contact with a strong acid ? 1 369 | What are the colors of the German flag ? 1 370 | What is the Moulin Rouge ? 0 371 | What soviet seaport is on the Black Sea ? 4 372 | What is the atomic weight of silver ? 5 373 | What currency do they use in Brazil ? 1 374 | What are pathogens ? 0 375 | What is mad cow disease ? 0 376 | Name a food high in zinc . 1 377 | When did North Carolina enter the union ? 5 378 | Where do apple snails live ? 4 379 | What are ethics ? 0 380 | What does CPR stand for ? 2 381 | What is an annuity ? 0 382 | Who killed John F. Kennedy ? 3 383 | Who was the first vice president of the U.S. ? 3 384 | What birthstone is turquoise ? 1 385 | Who was the first US President to ride in an automobile to his inauguration ? 3 386 | How old was the youngest president of the United States ? 5 387 | When was Ulysses S. Grant born ? 5 388 | What is Muscular Dystrophy ? 0 389 | Who lived in the Neuschwanstein castle ? 3 390 | What is propylene glycol ? 0 391 | What is a panic disorder ? 0 392 | Who invented the instant Polaroid camera ? 3 393 | What is a carcinogen ? 0 394 | What is a baby lion called ? 1 395 | What is the world 's population ? 5 396 | What is nepotism ? 0 397 | What is die-casting ? 0 398 | What is myopia ? 0 399 | What is the sales tax rate in New York ? 5 400 | Developing nations comprise what percentage of the world 's population ? 5 401 | What is the fourth highest mountain in the world ? 4 402 | What is Shakespeare 's nickname ? 3 403 | What is the heaviest naturally occurring element ? 1 404 | When is Father 's Day ? 5 405 | What does the acronym NASA stand for ? 2 406 | How long is the Columbia River in miles ? 5 407 | What city 's newspaper is called `` The Star '' ? 4 408 | What is carbon dioxide ? 0 409 | Where is the Mason/Dixon line ? 4 410 | When was the Boston tea party ? 5 411 | What is metabolism ? 0 412 | Which U.S.A. president appeared on `` Laugh-In '' ? 3 413 | What are cigarettes made of ? 
1 414 | What is the capital of Zimbabwe ? 4 415 | What does NASA stand for ? 2 416 | What is the state flower of Michigan ? 1 417 | What are semiconductors ? 0 418 | What is nuclear power ? 0 419 | What is a tsunami ? 0 420 | Who is the congressman from state of Texas on the armed forces committee ? 3 421 | Who was president in 1913 ? 3 422 | When was the first kidney transplant ? 5 423 | What are Canada 's two territories ? 4 424 | What was the name of the plane Lindbergh flew solo across the Atlantic ? 1 425 | What is genocide ? 0 426 | What continent is Argentina on ? 4 427 | What monastery was raided by Vikings in the late eighth century ? 1 428 | What is an earthquake ? 0 429 | Where is the tallest roller coaster located ? 4 430 | What are enzymes ? 0 431 | Who discovered oxygen ? 3 432 | What is bangers and mash ? 0 433 | What is the name given to the Tiger at Louisiana State University ? 1 434 | Where are the British crown jewels kept ? 4 435 | Who was the first person to reach the North Pole ? 3 436 | What is an ulcer ? 0 437 | What is vertigo ? 0 438 | What is the spirometer test ? 0 439 | When is the official first day of summer ? 5 440 | What does the abbreviation SOS mean ? 2 441 | What is the smallest bird in Britain ? 1 442 | Who invented Trivial Pursuit ? 3 443 | What gasses are in the troposphere ? 1 444 | Which country has the most water pollution ? 4 445 | What is the scientific name for elephant ? 1 446 | Who is the actress known for her role in the movie `` Gypsy '' ? 3 447 | What breed of hunting dog did the Beverly Hillbillies own ? 1 448 | What is the rainiest place on Earth ? 4 449 | Who was the first African American to win the Nobel Prize in literature ? 3 450 | When is St. Patrick 's Day ? 5 451 | What was FDR 's dog 's name ? 1 452 | What colors need to be mixed to get the color pink ? 1 453 | What is the most popular sport in Japan ? 1 454 | What is the active ingredient in baking soda ? 1 455 | When was Thomas Jefferson born ? 5 456 | How cold should a refrigerator be ? 5 457 | When was the telephone invented ? 5 458 | What is the most common eye color ? 1 459 | Where was the first golf course in the United States ? 4 460 | What is schizophrenia ? 0 461 | What is angiotensin ? 0 462 | What did Jesse Jackson organize ? 3 463 | What is New York 's state bird ? 1 464 | What is the National Park in Utah ? 4 465 | What is Susan B. Anthony 's birthday ? 5 466 | In which state would you find the Catskill Mountains ? 4 467 | What do you call a word that is spelled the same backwards and forwards ? 1 468 | What are pediatricians ? 0 469 | What chain store is headquartered in Bentonville , Arkansas ? 3 470 | What are solar cells ? 0 471 | What is compounded interest ? 0 472 | What are capers ? 0 473 | What is an antigen ? 0 474 | What currency does Luxembourg use ? 1 475 | What is the population of Venezuela ? 5 476 | What type of polymer is used for bulletproof vests ? 1 477 | What currency does Argentina use ? 1 478 | What is a thermometer ? 0 479 | What Canadian city has the largest population ? 4 480 | What color are crickets ? 1 481 | Which country gave New York the Statue of Liberty ? 4 482 | What was the name of the first U.S. satellite sent into space ? 1 483 | What precious stone is a form of pure carbon ? 1 484 | What kind of gas is in a fluorescent bulb ? 1 485 | What is rheumatoid arthritis ? 0 486 | What river runs through Rowe , Italy ? 4 487 | What is cerebral palsy ? 0 488 | What city is also known as `` The Gateway to the West '' ? 
4 489 | How far away is the moon ? 5 490 | What is the source of natural gas ? 1 491 | In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ? 1 492 | What is pectin ? 0 493 | What is bio-diversity ? 0 494 | What 's the easiest way to remove wallpaper ? 1 495 | What year did the Titanic start on its journey ? 5 496 | How much of an apple is water ? 5 497 | Who was the 22nd President of the US ? 3 498 | What is the money they use in Zambia ? 1 499 | How many feet in a mile ? 5 500 | What is the birthstone of October ? 1 501 | What is e-coli ? 0 502 | -------------------------------------------------------------------------------- /datasets/mpqa/dev.tsv: -------------------------------------------------------------------------------- 1 | sentence label 2 | unfettered support 1 3 | such a mindset will impair 0 4 | has revealed george bush's talents as a war leader 1 5 | provide support to the us , as it did 1 6 | kill 0 7 | play with 0 8 | embark on trampling upon human rights of civilians 0 9 | in such a complex state 0 10 | pursue 0 11 | will continue to support 1 12 | places strains 0 13 | wish 1 14 | subjected 0 15 | benefits 0 16 | highly respected 1 17 | this agreement 1 18 | under the strong influence 0 19 | preach 1 20 | hardest hit 0 21 | would be easier 1 22 | will be irresponsible 0 23 | conspicuous policy of playing games 0 24 | to agree 1 25 | to ask 1 26 | was very strong about this 1 27 | were even more full of praise 1 28 | can not be taken seriously 0 29 | felt gratified and relieved 1 30 | invited 1 31 | languages without nuances 0 32 | scheme 0 33 | incapacity to put crime under control 0 34 | it is pure rhetoric 0 35 | charged 0 36 | attacks 0 37 | anger 0 38 | vowed 1 39 | ridiculous 0 40 | are ready 1 41 | were accused 0 42 | will of 1 43 | unheeded 0 44 | opposes and rejects 0 45 | outcry 0 46 | join forces 1 47 | ca n't be burying its head in the sand 0 48 | insidious 0 49 | would not accept 0 50 | too costly to meet 0 51 | should not 0 52 | does not reflect 0 53 | invited 1 54 | top priority of 1 55 | only benefits corporate america 0 56 | still 0 57 | support 1 58 | not intent on revitalizing its economies 0 59 | angry anti - us protests erupted 0 60 | hopes 1 61 | m feeling much better 1 62 | branded 0 63 | the danger 0 64 | proposal to reunify 1 65 | nothing seemed more normal than to settle in someone else's territory 0 66 | to blame 0 67 | have been growing less visionary 0 68 | will fight on 0 69 | shortcomings 0 70 | deadly 0 71 | would incur its displeasure 0 72 | like a dangerous virus 0 73 | it is a futile illusion because it is a lie 0 74 | are irritated 0 75 | in a way our government has not been 0 76 | neither good nor bad , just incorrigible 0 77 | destroyed 0 78 | respecting 1 79 | to promote 1 80 | witch - hunting 0 81 | biggest terrorist country 0 82 | legitimate dissent 1 83 | even against vehicles transporting pregnant women in labor and against unarmed citizens 0 84 | niceties 1 85 | his ship has been sailing towards a wrong direction for far too long 0 86 | critics 0 87 | once again turn its back 0 88 | growing unrest 0 89 | defects 0 90 | has no love lost 0 91 | violent protests 0 92 | confined to minute cells 0 93 | immediately 1 94 | corrupt politicians 0 95 | a crime against humanity 0 96 | the rights and privileges that they would automatically get under geneva convention 0 97 | perils of delusion 0 98 | would be better off 1 99 | have been concerned 0 100 | uproar 0 101 | pained him 0 102 | terrorism 0 103 | not 
only prejudice 0 104 | a brutal occupation 0 105 | meddling 0 106 | falls at a bad time 0 107 | ugliest crimes 0 108 | picking on 0 109 | allegations 0 110 | are being well treated 1 111 | the serious crisis 0 112 | will be a monumental tragedy 0 113 | is proud 1 114 | thus 0 115 | oh , my god 0 116 | will not allow 0 117 | the worst crisis 0 118 | turn its back 0 119 | deepened 0 120 | will complain 0 121 | very hard line 0 122 | opposition 0 123 | would not dilute 1 124 | secret 0 125 | opposition 0 126 | illusory 0 127 | not adequately free and fair 0 128 | should follow 1 129 | impossible 0 130 | an aberration that goes against mankind 0 131 | have preference for 1 132 | all around us 0 133 | very , very large 1 134 | lie 0 135 | were ill - received 0 136 | difficult 0 137 | and then assert the laws of war do not apply 0 138 | from upholding religious and cultural values 1 139 | preferred 1 140 | full support 1 141 | criticizing 0 142 | immediate exploitation 0 143 | ignored 0 144 | has unashamedly seen fit 1 145 | support 1 146 | now a reality 1 147 | gaffe 0 148 | virtually nothing 0 149 | no place in israel can be considered safe 0 150 | to invite 1 151 | prevent such a development 1 152 | rigged 0 153 | logical solution 1 154 | bungled its dealings 0 155 | demerits 0 156 | like the suffering 0 157 | unacceptable 0 158 | supports 1 159 | 100 percent 1 160 | does not appear 0 161 | pretext 0 162 | what begins to happen when we hate our friends 0 163 | denies 0 164 | intimidation 0 165 | children skipping school 0 166 | criticism 0 167 | violence and intimidation 0 168 | blatantly obstructed 0 169 | reactions of repulsion 0 170 | are not happy 0 171 | criticized 0 172 | a growing impression that misuari could pose a security threat 0 173 | left behind 1 174 | they have not succeeded , and will never succeed 1 175 | it's shameful 0 176 | beyond the reach of any legal regime 0 177 | is pessimistic 0 178 | has no right 0 179 | are masterminded 0 180 | regret 0 181 | will weaken soon 0 182 | terrible tragedy 0 183 | there is no alternative to it but conflict , isolation , nationalism , and ultimately war 0 184 | freak show 0 185 | insisted 1 186 | axis of evil 0 187 | come under fire 0 188 | threats expressed by 0 189 | the support 1 190 | wrong 0 191 | has resisted 0 192 | the darkest hour is always before the dawn 1 193 | the entire palestinian people 0 194 | denounced 0 195 | very serious threat 0 196 | only 0 197 | making a spectacle of 0 198 | promote meetings 1 199 | to mould the electoral process in his favour 0 200 | not a single day would pass in peace and with no palestinians shedding blood 0 201 | crises 0 202 | hoping 1 203 | to intimidate women and children 0 204 | disputes 0 205 | extreme right 0 206 | most dangerous 0 207 | may even get better 0 208 | promise 1 209 | has pledged 1 210 | one body with two heads 0 211 | feel at ease 1 212 | turning a blind eye 0 213 | has floundered 0 214 | warns 0 215 | most cogent argument 1 216 | lump 0 217 | deals a blow 0 218 | bad treatment 0 219 | concerned 0 220 | have criticized 0 221 | you can hardly speak of a targeting error 0 222 | certain countries resort to 0 223 | called for 1 224 | violating human rights 0 225 | unusual 0 226 | would undermine 0 227 | a terrorist act 0 228 | threat 0 229 | objectives 1 230 | can be abated 1 231 | muscle - flexing 0 232 | pursued 1 233 | brilliant 1 234 | committed 1 235 | committing themselves 1 236 | support 1 237 | the importance of china 1 238 | only one single 0 239 | hoped 1 240 | closed 
ranks behind 1 241 | peace 1 242 | would not be a bad idea 1 243 | legitimate 1 244 | closer to that of cowboys than to a civilized mentality 0 245 | such fruitful results 1 246 | double crime 0 247 | perfectly at ease 1 248 | mistake 0 249 | repeatedly accused 0 250 | ceased to be a soldier of the fatherland 0 251 | however 0 252 | apocalyptic savagery 0 253 | a border of peace and good neighbourliness 1 254 | call for 1 255 | hurt 0 256 | declined to endorse 0 257 | countries such as iran , iraq and north korea represent an ``axis of evil 0 258 | more serious 0 259 | am confident 1 260 | picking on 0 261 | whether 0 262 | go out into the streets to defend 1 263 | committed one more mistake 0 264 | the agreement 1 265 | charging 0 266 | other aggressive acts against lebanon 0 267 | enjoying 1 268 | decided 0 269 | hard - line 0 270 | feels 0 271 | do n't want 0 272 | the threats launched 0 273 | rejected 0 274 | breaking fundamental concepts 0 275 | increasingly tyrannical 0 276 | undisguised declaration of war and a rhetoric threatening aggression 0 277 | want 1 278 | hyperbole , 0 279 | patronizing 0 280 | arbitrary arrests 0 281 | jeopardy 0 282 | could not be said to adequately comply 0 283 | with good economic management , israel should have little trouble 1 284 | terrorist allies 0 285 | swift criticism from 0 286 | instead 0 287 | steering the economy into disaster 0 288 | grew so unhappy 0 289 | will not resort 0 290 | has refused 0 291 | may not be feasible 0 292 | danger of being shelved altogether 0 293 | unmanageable 0 294 | favouring independence 1 295 | misrule 0 296 | misery 0 297 | steering the nation to prosperity and progress 1 298 | america's biding 0 299 | compounded the problem 0 300 | designed to benefit mugabe 0 301 | supposed to be 0 302 | appraised 1 303 | stand beside right and justice 1 304 | refuses 0 305 | surged 1 306 | denied 0 307 | fake imposter 0 308 | great evil on open display 0 309 | ideal , sunny clime 1 310 | seems to be determined to expand the scope of the anti - terror war 0 311 | a body blow 0 312 | as full citizens in the same sate 1 313 | lecturing 0 314 | suffered 0 315 | harmed 0 316 | criticized 0 317 | scores of 0 318 | ignore the consequences 0 319 | territorial ambition 0 320 | not 0 321 | no one has the right to wage war against the history of this nation 0 322 | wants 1 323 | hatred 0 324 | what occured in the united states on 11 september 0 325 | oneupmanship 0 326 | labeling 0 327 | showed little - disguised irritation 0 328 | concern 0 329 | even 0 330 | the possibility of a democratic , stable and prosperous 1 331 | strongly criticized and condemned 0 332 | will not admit 0 333 | persuade 1 334 | was more than confident 1 335 | had not heeded 0 336 | cost - effective 1 337 | mistake 0 338 | would adhere to a pluralistic vision 1 339 | had asked 1 340 | replete with 1 341 | shameful 0 342 | will only 0 343 | accidental slight 0 344 | nothing whatsoever 0 345 | could no longer tolerate 0 346 | leeway 1 347 | the doubts 0 348 | aggressions against 0 349 | raving 0 350 | are blamed by 0 351 | so many uncertainties 0 352 | destined to collapse 1 353 | confidence crisis 0 354 | are accusing 0 355 | warned 0 356 | such pessimism 0 357 | anxiety 0 358 | cause a rift 0 359 | no politically prudent 0 360 | held out an olive branch 1 361 | continues to demolish 0 362 | does not endorse 0 363 | failed 1 364 | comprehensive destructive policy 0 365 | sounds clumsy 0 366 | fails to meet the standard of being free and fair 0 367 | willfulness 1 
368 | tough policy 0 369 | has not satisfied 0 370 | extremist inclinations 0 371 | unilateral 0 372 | the iranians have not done what the pakistan government has done 0 373 | because 0 374 | boycotted 0 375 | without just compensation 0 376 | desperation of the people 0 377 | damage 0 378 | dominating the world 0 379 | sought 1 380 | fearing 0 381 | that concessions are sufficiently concrete 1 382 | put his nation first 1 383 | expresses the concern 0 384 | can contaminate 0 385 | to put it mildly 0 386 | will do its utmost 1 387 | endorsed 1 388 | will surely be on the president's lips 0 389 | with typical understatement 0 390 | lambasted 0 391 | affirmed 1 392 | vitriol 0 393 | in compliance 0 394 | numerous serious abuses 0 395 | asked 1 396 | confidence 1 397 | played the same tactic again 0 398 | are enthusiastic 1 399 | valued ally and friend 1 400 | so - called 0 401 | are regarding 0 402 | running out of meaningful options 0 403 | can not accept 0 404 | is concerned 0 405 | further criticism 0 406 | of all the unconscionable things 0 407 | longest destructive war 0 408 | plight of 0 409 | can no longer stand shoulder to shoulder 0 410 | it would be madness to hold elections now 0 411 | should be looking up smiling 1 412 | if one goes by the us logic , only the us spy plane is permitted to spy at other people's doorsteps 0 413 | begins to bear fruit 1 414 | rigged 0 415 | serious division 0 416 | enjoys no social base 0 417 | very constructive 1 418 | grateful 1 419 | turned thugs 0 420 | was unconstitutional 0 421 | scares away 0 422 | declining to endorse 0 423 | violate human rights 0 424 | accused 0 425 | negatively 0 426 | due regard 1 427 | simplistic 0 428 | will help renew 1 429 | succumbing 0 430 | peace to prevail 1 431 | despite all of this , however 0 432 | proves this beyond the shadow of a doubt 1 433 | is accused 0 434 | intolerant 0 435 | the visit has achieved positive and successful results 1 436 | wantonly infringing 0 437 | assassin 0 438 | continue to obstruct 0 439 | interests 1 440 | was a strong desire among 1 441 | to denounce 0 442 | left no avenue unexplored 1 443 | endorsed 1 444 | would also ensure a durable peace 1 445 | as a vehicle for increasing personal popularity 1 446 | would have been sending a very bad signal 0 447 | continues to refuse 0 448 | pursuit 0 449 | committing themselves to peace 1 450 | accuses 0 451 | as if they were quarries 0 452 | far worse than those prevailing in camp x - ray 0 453 | inhumanely 0 454 | appropriate 1 455 | poor 0 456 | good education system 1 457 | illegal 0 458 | devastating 0 459 | to create the impression 0 460 | exacerbating 0 461 | faltered as a result of israel's intransigence 0 462 | his career was like a rough sea , with highs and lows and never calm 0 463 | generally approved of 1 464 | deviant 0 465 | without trying to maneuver , place obstacles , or set impossible conditions 0 466 | the fire is raging at home 0 467 | lacks credibility and can not withstand any objective scrutiny 0 468 | endorsed 1 469 | assassinate innocent activists and citizens 0 470 | than they deserve 0 471 | refusal to respect its obligations 0 472 | would not 0 473 | for the first time 1 474 | beautiful historic coincidence 1 475 | hardly elastic enough 0 476 | approve 1 477 | hope 1 478 | was so hard on 0 479 | most serious consequences 0 480 | will have trouble wriggling out of the need to explain and justify 0 481 | not completely reliable 0 482 | democracy exhausted all its generosity 0 483 | relatively calm 1 484 | are 
extremely concerned 0 485 | bringing an end to terrorism and the taliban 1 486 | reject 0 487 | stand firm 1 488 | new political bogey 0 489 | abstractly recommend 1 490 | was worried 0 491 | hardline 0 492 | crash course 0 493 | recognition 1 494 | repeated denunciations 0 495 | hope 1 496 | carnage 0 497 | threats 0 498 | swapping the silver - visored helmet of a space cadet for the green eyeshade of a consummate bean counter 0 499 | to voice his concern 0 500 | significant 0 501 | can bring security and stability to all the parties without exception 1 502 | would consider 1 503 | be reproached 0 504 | or so it claims 0 505 | harboring serious doubts 0 506 | if it becomes more of a nuisance 0 507 | need to establish a just peace 1 508 | immediate support 1 509 | discrediting 0 510 | picking a quarrel 0 511 | bridle the israeli oppressive activity 1 512 | the euro , our currency 1 513 | edifying photograph 1 514 | wrong judgment 0 515 | preservation of global peace and security 1 516 | sharply questioned 0 517 | denied 0 518 | respect 1 519 | unfeasible 0 520 | has promised 1 521 | fierce demonstrations 0 522 | to thwart 1 523 | disrespect of advice 0 524 | war on 0 525 | as the saying goes , when you pull up the turnip , mud comes with it 0 526 | smiling 1 527 | judgment 0 528 | an axis of evil 0 529 | criticizes 0 530 | costly burden 0 531 | detrimental 0 532 | earned eternal notoriety 0 533 | making liberal use of the but construction 0 534 | plotting 0 535 | financial disaster 0 536 | would be regarded 0 537 | parasitic economies 0 538 | wanted 1 539 | more demagogy than arguments 0 540 | israel's superior ability to punish palestinians 0 541 | flirting with 0 542 | was perceived 0 543 | what is europe 0 544 | the best role model 1 545 | inaccurate and fabricated 0 546 | recognition 1 547 | axis of evil rhetoric 0 548 | lost their illegitimate interests 0 549 | not a man who likes to improvise 0 550 | is criticized 0 551 | can trust 1 552 | foresaw 0 553 | heresy 0 554 | him has not been pretty 0 555 | demonstrations and rallies against 0 556 | axis of evil 0 557 | cold war heritage 0 558 | possible regression 0 559 | will guarantee 0 560 | came out in protest 0 561 | progressive 1 562 | peaceful protests 0 563 | difficulty , of course 0 564 | put on a spectacle 0 565 | dogma 0 566 | devastating what remains 0 567 | is stiffening in its attitude toward 0 568 | stuck for 18 months on ground zero 0 569 | put most of the blame 0 570 | very large 0 571 | once again 0 572 | it is almost impossible 0 573 | has criticised 0 574 | axis of evil 0 575 | humiliation of 0 576 | calm 1 577 | are tired 0 578 | enjoy 1 579 | wanted 1 580 | has blamed 0 581 | unprecedented force 0 582 | provocative 0 583 | far preferable 1 584 | repeated warnings 0 585 | neither free nor fair 0 586 | backing 1 587 | widespread debates against 0 588 | using violence , intimidation , special laws and dirty tricks to fix the two - day election 0 589 | continue to obstruct 0 590 | seeking 1 591 | biased attitude 0 592 | long neglected 0 593 | success of western europe after world war ii laid in its creation of the good neighborly field 1 594 | even risking his own future 0 595 | accused 0 596 | denied 0 597 | desperate tableau 0 598 | could n't wait 1 599 | blessed 1 600 | nuclear club 0 601 | most important 1 602 | wants 0 603 | questioning 0 604 | no one is listening 0 605 | complicates any situation 0 606 | who were expelled from their homeland 0 607 | distortion of reality 0 608 | had the advantage for washington of 
weakening opec 1 609 | the support 1 610 | may god be satisfied with him 1 611 | continue to rise 0 612 | will not be able lay claim to a worthy place in the civilized world 0 613 | has to be maintained at any cost 0 614 | thugs 0 615 | apprehensions 0 616 | want 1 617 | recommendations 1 618 | one of the few enlightened officials 1 619 | democratic achievements 1 620 | tensions between 0 621 | see 0 622 | infuriating 0 623 | first organized protest 0 624 | feared by 0 625 | ignored 0 626 | believe 0 627 | particular concern is raised 0 628 | main problems are in the economic area 0 629 | damages the credibility 0 630 | declining to comply 0 631 | blamed 0 632 | allegedly threatening 0 633 | dismisses 0 634 | the unconditional support 1 635 | not satisfactory 0 636 | negated 0 637 | berating 0 638 | would remake venezuela to benefit the poor 1 639 | `world judge of human rights' 0 640 | were only to be expected 0 641 | cooperation 1 642 | issuing a letter of objection 0 643 | would support wholeheartedly 1 644 | giving him medium approval 1 645 | at a loss 0 646 | dwindling moral values 0 647 | is self - inflicted 0 648 | if it is successful 0 649 | impose 0 650 | eradication 0 651 | feared 0 652 | scupper any hope 0 653 | backing away 0 654 | held 0 655 | will have no universally - acknowledged 0 656 | did not show the slightest sympathy , still less the least regret 0 657 | advanced a mendacious critique 0 658 | disputes between 0 659 | encouraged 1 660 | another setback for the imf 0 661 | wanted 1 662 | clear priority 1 663 | crimes of war 0 664 | destroyed 0 665 | warned 0 666 | decided to back 1 667 | violates the united nations charter 0 668 | interfere 0 669 | have n't already violated 0 670 | defenceless 0 671 | was effective especially 1 672 | is accusing 0 673 | a man blinded by power 0 674 | backing 1 675 | it's really strange 0 676 | his favourite newfie 0 677 | mercurial strongman 0 678 | that four shots would solve the problem 0 679 | would not accept 0 680 | there is no reason for it to be impossible 1 681 | poor response 0 682 | aspirations 1 683 | to the healthy development of sino - us relations 1 684 | resolute commitment 1 685 | democracy 1 686 | the concern 0 687 | only wants 1 688 | needlessly 0 689 | secretly behind every local misfortune 0 690 | positive 1 691 | is disgusted 0 692 | high degree of difficulty 0 693 | improvement 1 694 | spurned 0 695 | has denounced 0 696 | axis of evil theory 0 697 | extensive support 1 698 | worst 0 699 | its selfishness 0 700 | ambition 1 701 | find no grounds for any support 0 702 | achieving better results 1 703 | no tears will be shed in this corner 0 704 | is a telling example of a new - and perhaps risky - approach 0 705 | will become one of the best elements 1 706 | was also quite naive 0 707 | the criticism 0 708 | desire to work 1 709 | for the sake of peace 1 710 | double standard 0 711 | pretended 0 712 | alarm 0 713 | discontent 0 714 | furthermore 0 715 | massacring thousands of innocent people 0 716 | immense gulf between 0 717 | still wants 1 718 | basically sound 1 719 | repeatedly threatened 0 720 | mendacious 0 721 | beyond reproach 1 722 | but 0 723 | neat stuff 1 724 | purposely play up 0 725 | blatant campaign of intimidation 0 726 | disappointment 0 727 | provoked 0 728 | intends to 0 729 | worried 0 730 | real danger 0 731 | supported 1 732 | support 1 733 | were plotting 0 734 | agreed 1 735 | most realistic 1 736 | violating 0 737 | value sharing 1 738 | to its knees 0 739 | to request 1 740 | get out ! . 
0 741 | the axis of evil 0 742 | assassins , assassins , assassins 0 743 | is the appropriate one 1 744 | resistance 1 745 | indulging in blood - shed and their lunaticism 0 746 | desperate 0 747 | to describe 0 748 | can ask 1 749 | know 0 750 | invited 1 751 | to express satisfaction 1 752 | harmonious and close 1 753 | support 1 754 | has been inclined toward 1 755 | incited 0 756 | was like giving away the country to its former colonial masters 0 757 | the destruction 0 758 | already volatile 0 759 | overwhelming evidence 0 760 | imperative for harmonious society 1 761 | nor does it seem proper 0 762 | storming palestinian territories 0 763 | is reflected 1 764 | expressed satisfaction 1 765 | intends 1 766 | ruined 0 767 | made public a blacklist 0 768 | those who seek to attack and destroy law and order and legitimate government 0 769 | unlawful 0 770 | freely 1 771 | reassurances 1 772 | agreeable 1 773 | dont want 0 774 | did not agree 0 775 | to deny 0 776 | perception 0 777 | had particularly harsh words for 0 778 | want 1 779 | the brutality with which this closure was implemented was unheard of 0 780 | friendship 1 781 | axis of evil 0 782 | supported 1 783 | slaughter of more than 0 784 | will be asked 1 785 | just , legitimate rights 1 786 | condemned 0 787 | thanked 1 788 | unstable situation 0 789 | merely an interlude 0 790 | depends on 0 791 | has been efficient enough 1 792 | grand 1 793 | any step backward for democracy 0 794 | quite supportive 1 795 | great leader 1 796 | would help 1 797 | formed a close friendship 1 798 | uphold 1 799 | is feared by 0 800 | used to boast 1 801 | regarded 1 802 | would violate international standards prohibiting cruel , inhuman or degrading treatment 0 803 | convince the americans that the bush administration has not only succeeded in removing a regime of religious madmen 0 804 | has put west asia on the brink of another war 0 805 | has also backed 1 806 | attacks 0 807 | banned 0 808 | dishonoring 0 809 | sent congratulations 1 810 | thought 0 811 | may be disconcerted 0 812 | concern 0 813 | will be inviting 1 814 | have to bash 0 815 | plan 1 816 | guaranteed 1 817 | the mistake is to assume 0 818 | insists 1 819 | sticking to the polluting policies 0 820 | can be difficult 0 821 | can prevent 1 822 | extremely dangerous 0 823 | would beg for surrender 0 824 | protesting 0 825 | proclaimed 1 826 | to applaud everything 1 827 | the most ideal way 1 828 | accused 0 829 | what fate awaits argentina's crisis - stricken economy 0 830 | especially those who are dangerous and suicidal 0 831 | has failed for a decade now 0 832 | had likely deteriorated beyond repair 0 833 | attempts to suppress 0 834 | endorse 1 835 | if there were serious problems 0 836 | designed to achieve an outcome - power at all costs 0 837 | criticism of 0 838 | everything 0 839 | so that we would not become terrorists 1 840 | higher transparency 1 841 | warning 0 842 | does not justify all the means 0 843 | the further 0 844 | artificial obstacles 0 845 | would improve 0 846 | has now made himself heard 1 847 | openly invited 1 848 | argued 0 849 | decisions 0 850 | admittedly 1 851 | still prefer 1 852 | had planned 0 853 | sacrifice himself 0 854 | very pleasant for 1 855 | prevented 0 856 | favorable opinions 1 857 | no one we know to have planned such deeds will escape 0 858 | to express openly dissenting political and religious views 0 859 | confidence 1 860 | no business 0 861 | has provoked concern 0 862 | had aligned against 0 863 | under the pretext of 
fighting terrorism 0 864 | violation of the palestinian people's human rights 0 865 | understanding and approval 1 866 | concerns 0 867 | universal character of the prophets 1 868 | force himself 0 869 | getting mixed up with crime , drugs , and violence 0 870 | bold and fair 1 871 | like an upholder of justice 1 872 | factually inaccurate 0 873 | criticism 0 874 | was shocked 0 875 | saw 1 876 | had refused to accept 0 877 | would make it possible 1 878 | domino effect 0 879 | palestinian hand that is stretched for peace 0 880 | has refused to bow 0 881 | despite the perils 0 882 | will be instrumental 1 883 | concerns 0 884 | those who are really behind it all , those who are behind this business , use the imf 0 885 | accommodate 1 886 | doubts 0 887 | slammed 0 888 | will seek 1 889 | grieving 0 890 | called for 1 891 | isolated and frustrated 0 892 | in the name of self - righteousness 0 893 | intensify and accelerate 0 894 | keeping a wary eye 0 895 | took to the streets 0 896 | gave free rein 0 897 | to rid 1 898 | wanted 1 899 | betrayal 0 900 | the branding 0 901 | collapse 0 902 | i lost any sense of dignity 0 903 | massive intimidatory show of force that was evident in the deployment of security forces 0 904 | unpopular system 0 905 | regarded 0 906 | has posed serious threats 0 907 | to draw up an action campaign aimed 0 908 | illegally 0 909 | reputation was ruined 0 910 | the opposition 0 911 | cracks are appearing 0 912 | could fray the bilateral goodwill 0 913 | was deemed 0 914 | has renounced 0 915 | the cia would organize coups and assassinations to further us interests 0 916 | the definition of a democratic regime is subjected to double standards 0 917 | strike 0 918 | challenged 0 919 | sympathy 0 920 | rubbed their palms at length 1 921 | shook 0 922 | sided with 1 923 | trying to move the goal posts 0 924 | lose popular support among 0 925 | will only invite worse criticism and rejection 0 926 | axis of evil 0 927 | absolutely not permitted 0 928 | found little to object to 0 929 | justify 1 930 | extension of its characteristic policy of hegemony 0 931 | unrealistic 0 932 | warned against 0 933 | crippled 0 934 | well 1 935 | it is ineffective 0 936 | is now promoting 1 937 | called 0 938 | in spite of the good offices 0 939 | genuinely 1 940 | for more than two years 0 941 | tried to encourage 1 942 | loathsome action 0 943 | reckless 0 944 | and killed 0 945 | likening 0 946 | will now agree 1 947 | exalted 1 948 | would like 1 949 | hoped 1 950 | even more dismal 0 951 | gives new life 1 952 | rejected 0 953 | predatory 0 954 | viciously spoke ill 0 955 | in protest against 0 956 | was remarkable 1 957 | -------------------------------------------------------------------------------- /evaluator.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import copy 3 | 4 | import six 5 | 6 | import chainer 7 | from chainer import configuration 8 | from chainer.dataset import convert 9 | from chainer.dataset import iterator as iterator_module 10 | from chainer import function 11 | from chainer import link 12 | from chainer import reporter as reporter_module 13 | from chainer.training import extension 14 | 15 | 16 | class MicroEvaluator(chainer.training.extensions.Evaluator): 17 | 18 | def evaluate(self): 19 | iterator = self._iterators['main'] 20 | eval_func = self.eval_func or self._targets['main'] 21 | 22 | if self.eval_hook: 23 | self.eval_hook(self) 24 | 25 | if hasattr(iterator, 'reset'): 26 | 
iterator.reset() 27 | it = iterator 28 | else: 29 | it = copy.copy(iterator) 30 | 31 | # summary = reporter_module.DictSummary() 32 | summary = collections.defaultdict(list) 33 | 34 | for batch in it: 35 | observation = {} 36 | with reporter_module.report_scope(observation): 37 | in_arrays = self.converter(batch, self.device) 38 | with function.no_backprop_mode(): 39 | if isinstance(in_arrays, tuple): 40 | eval_func(*in_arrays) 41 | elif isinstance(in_arrays, dict): 42 | eval_func(**in_arrays) 43 | else: 44 | eval_func(in_arrays) 45 | n_data = len(batch) 46 | summary['n'].append(n_data) 47 | # summary.add(observation) 48 | for k, v in observation.items(): 49 | summary[k].append(v) 50 | 51 | mean = dict() 52 | ns = summary['n'] 53 | del summary['n'] 54 | for k, vs in summary.items(): 55 | mean[k] = sum(v * n for v, n in zip(vs, ns)) / sum(ns) 56 | return mean 57 | # return summary.compute_mean() 58 | -------------------------------------------------------------------------------- /finetune_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import csv 6 | import os 7 | import shutil 8 | import logging 9 | import argparse 10 | import random 11 | from tqdm import tqdm, trange 12 | import json 13 | 14 | import numpy as np 15 | import torch 16 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 17 | from torch.utils.data.distributed import DistributedSampler 18 | 19 | from pytorch_pretrained_bert.tokenization import BertTokenizer 20 | from pytorch_pretrained_bert.modeling import BertForMaskedLM 21 | from pytorch_pretrained_bert.optimization import BertAdam 22 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 23 | 24 | import train_text_classifier 25 | 26 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S', 28 | level=logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | class DataProcessor(object): 33 | """Base class for data converters for sequence classification data sets.""" 34 | 35 | def get_train_examples(self, data_dir): 36 | """Gets a collection of `InputExample`s for the train set.""" 37 | raise NotImplementedError() 38 | 39 | def get_dev_examples(self, data_dir): 40 | """Gets a collection of `InputExample`s for the dev set.""" 41 | raise NotImplementedError() 42 | 43 | def get_labels(self): 44 | """Gets the list of labels for this data set.""" 45 | raise NotImplementedError() 46 | 47 | @classmethod 48 | def _read_tsv(cls, input_file, quotechar=None): 49 | """Reads a tab separated value file.""" 50 | with open(input_file, "r") as f: 51 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 52 | lines = [] 53 | for line in reader: 54 | lines.append(line) 55 | return lines 56 | 57 | 58 | class InputExample(object): 59 | """A single training/test example for simple sequence classification.""" 60 | 61 | def __init__(self, guid, text_a, label=None): 62 | """Constructs a InputExample. 63 | 64 | Args: 65 | guid: Unique id for the example. 66 | text_a: string. The untokenized text of the first sequence. For single 67 | sequence tasks, only this sequence must be specified. 
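            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples.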
68 | """ 69 | self.guid = guid 70 | self.text_a = text_a 71 | self.label = label 72 | 73 | 74 | class InputFeatures(object): 75 | """A single set of features of data.""" 76 | 77 | def __init__(self, init_ids, input_ids, input_mask, segment_ids, masked_lm_labels): 78 | self.init_ids = init_ids 79 | self.input_ids = input_ids 80 | self.input_mask = input_mask 81 | self.segment_ids = segment_ids 82 | self.masked_lm_labels = masked_lm_labels 83 | 84 | class AugProcessor(DataProcessor): 85 | """Processor for dataset to be augmented.""" 86 | 87 | def get_train_examples(self, data_dir): 88 | """See base class.""" 89 | return self._create_examples( 90 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 91 | 92 | def get_dev_examples(self, data_dir): 93 | """See base class.""" 94 | return self._create_examples( 95 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 96 | 97 | def get_labels(self, name): 98 | """add your dataset here""" 99 | if name in ['stsa.binary', 'mpqa', 'rt-polarity', 'subj']: 100 | return ["0", "1"] 101 | elif name in ['stsa.fine']: 102 | return ["0", "1", "2", "3", "4"] 103 | elif name in ['TREC']: 104 | return ["0", "1", "2", "3", "4", "5"] 105 | 106 | def _create_examples(self, lines, set_type): 107 | """Creates examples for the training and dev sets.""" 108 | examples = [] 109 | for (i, line) in enumerate(lines): 110 | if i == 0: 111 | continue 112 | guid = "%s-%s" % (set_type, i) 113 | text_a = line[0] 114 | label = line[-1] 115 | examples.append( 116 | InputExample(guid=guid, text_a=text_a, label=label)) 117 | return examples 118 | 119 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): 120 | """Loads a data file into a list of `InputBatch`s.""" 121 | 122 | label_map = {} 123 | for (i, label) in enumerate(label_list): 124 | label_map[label] = i 125 | 126 | features = [] 127 | # ---- 128 | dupe_factor = 5 129 | masked_lm_prob = 0.15 130 | max_predictions_per_seq = 20 131 | rng = random.Random(12345) 132 | 133 | for (ex_index, example) in enumerate(examples): 134 | tokens_a = tokenizer.tokenize(example.text_a) 135 | segment_id = label_map[example.label] 136 | # Account for [CLS] and [SEP] with "- 2" 137 | if len(tokens_a) > max_seq_length - 2: 138 | tokens_a = tokens_a[0:(max_seq_length - 2)] 139 | 140 | # Due to we use conditional bert, we need to place label information in segment_ids 141 | tokens = [] 142 | segment_ids = [] 143 | # is [CLS]和[SEP] needed ? 
144 |         tokens.append("[CLS]")
145 |         segment_ids.append(segment_id)
146 |         for token in tokens_a:
147 |             tokens.append(token)
148 |             segment_ids.append(segment_id)
149 |         tokens.append("[SEP]")
150 |         segment_ids.append(segment_id)
151 |         masked_lm_labels = [-1] * max_seq_length
152 | 
153 |         cand_indexes = []
154 |         for (i, token) in enumerate(tokens):
155 |             if token == "[CLS]" or token == "[SEP]":
156 |                 continue
157 |             cand_indexes.append(i)
158 | 
159 |         rng.shuffle(cand_indexes)
160 |         len_cand = len(cand_indexes)
161 | 
162 |         output_tokens = list(tokens)
163 | 
164 |         num_to_predict = min(max_predictions_per_seq,
165 |                              max(1, int(round(len(tokens) * masked_lm_prob))))
166 | 
167 |         masked_lms_pos = []
168 |         covered_indexes = set()
169 |         for index in cand_indexes:
170 |             if len(masked_lms_pos) >= num_to_predict:
171 |                 break
172 |             if index in covered_indexes:
173 |                 continue
174 |             covered_indexes.add(index)
175 | 
176 |             masked_token = None
177 |             # 80% of the time, replace with [MASK]
178 |             if rng.random() < 0.8:
179 |                 masked_token = "[MASK]"
180 |             else:
181 |                 # 10% of the time, keep original
182 |                 if rng.random() < 0.5:
183 |                     masked_token = tokens[index]
184 |                 # 10% of the time, replace with random word
185 |                 else:
186 |                     masked_token = tokens[cand_indexes[rng.randint(0, len_cand - 1)]]
187 | 
188 |             masked_lm_labels[index] = tokenizer.convert_tokens_to_ids([tokens[index]])[0]
189 |             output_tokens[index] = masked_token
190 |             masked_lms_pos.append(index)
191 | 
192 |         init_ids = tokenizer.convert_tokens_to_ids(tokens)
193 |         input_ids = tokenizer.convert_tokens_to_ids(output_tokens)
194 | 
195 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
196 |         # tokens are attended to.
197 |         input_mask = [1] * len(input_ids)
198 | 
199 |         # Zero-pad up to the sequence length.
200 |         while len(input_ids) < max_seq_length:
201 |             init_ids.append(0)
202 |             input_ids.append(0)
203 |             input_mask.append(0)
204 |             segment_ids.append(0)  # padding positions get segment_id 0
205 | 
206 |         assert len(init_ids) == max_seq_length
207 |         assert len(input_ids) == max_seq_length
208 |         assert len(input_mask) == max_seq_length
209 |         assert len(segment_ids) == max_seq_length
210 | 
211 |         if ex_index < 2:
212 |             logger.info("*** Example ***")
213 |             logger.info("guid: %s" % (example.guid))
214 |             logger.info("tokens: %s" % " ".join(
215 |                 [str(x) for x in tokens]))
216 |             logger.info("init_ids: %s" % " ".join([str(x) for x in init_ids]))
217 |             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
218 |             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
219 |             logger.info(
220 |                 "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
221 |             logger.info("masked_lm_labels: %s" % " ".join([str(x) for x in masked_lm_labels]))
222 | 
223 |         features.append(
224 |             InputFeatures(init_ids=init_ids,
225 |                           input_ids=input_ids,
226 |                           input_mask=input_mask,
227 |                           segment_ids=segment_ids,
228 |                           masked_lm_labels=masked_lm_labels))
229 |     return features
230 | 
231 | def rev_wordpiece(tokens):
232 |     """Reverts WordPiece tokenization: drops [PAD], merges '##' pieces, strips [CLS]/[SEP]."""
233 |     if len(tokens) > 1:
234 |         for i in range(len(tokens)-1, 0, -1):
235 |             if tokens[i] == '[PAD]':
236 |                 tokens.remove(tokens[i])
237 |             elif len(tokens[i]) > 1 and tokens[i][0] == '#' and tokens[i][1] == '#':
238 |                 tokens[i-1] += tokens[i][2:]
239 |                 tokens.remove(tokens[i])
240 |     return " ".join(tokens[1:-1])
241 | 
242 | def main():
243 |     parser = argparse.ArgumentParser()
244 | 
245 |     ## Required parameters
246 |     parser.add_argument("--data_dir", default="datasets", type=str,
247 |                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
248 |     parser.add_argument("--output_dir", default="aug_data", type=str,
249 |                         help="The output dir for the augmented dataset.")
250 |     parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
251 |                         help="The path of the pretrained BERT model.")
252 |     parser.add_argument("--task_name", default="subj", type=str,
253 |                         help="The name of the task to train.")
254 |     parser.add_argument("--max_seq_length", default=64, type=int,
255 |                         help="The maximum total input sequence length after WordPiece tokenization. \n"
256 |                              "Sequences longer than this will be truncated, and sequences shorter \n"
257 |                              "than this will be padded.")
258 |     parser.add_argument("--do_lower_case", default=False, action='store_true',
259 |                         help="Set this flag if you are using an uncased model.")
260 |     parser.add_argument("--train_batch_size", default=32, type=int,
261 |                         help="Total batch size for training.")
262 |     parser.add_argument("--learning_rate", default=5e-5, type=float,
263 |                         help="The initial learning rate for Adam.")
264 |     parser.add_argument("--num_train_epochs", default=10.0, type=float,
265 |                         help="Total number of training epochs to perform.")
266 |     parser.add_argument("--warmup_proportion", default=0.1, type=float,
267 |                         help="Proportion of training to perform linear learning rate warmup for. "
268 |                              "E.g., 0.1 = 10%% of training.")
269 |     parser.add_argument('--seed', type=int, default=42,
270 |                         help="random seed for initialization")
271 | 
272 |     args = parser.parse_args()
273 |     with open("global.config", 'rb') as f:
274 |         configs_dict = json.load(f)
275 | 
276 |     args.task_name = configs_dict.get("dataset")
277 |     print(args)
278 |     run_aug(args, save_every_epoch=False)
279 | 
280 | 
281 | def run_aug(args, save_every_epoch=False):
282 |     processors = {
283 |         # you can add your processor here
284 |         "TREC": AugProcessor,
285 |         "stsa.fine": AugProcessor,
286 |         "stsa.binary": AugProcessor,
287 |         "mpqa": AugProcessor,
288 |         "rt-polarity": AugProcessor,
289 |         "subj": AugProcessor,
290 |     }
291 | 
292 |     task_name = args.task_name
293 |     if task_name not in processors:
294 |         raise ValueError("Task not found: %s" % (task_name))
295 |     args.data_dir = os.path.join(args.data_dir, task_name)
296 |     args.output_dir = os.path.join(args.output_dir, task_name)
297 | 
298 |     random.seed(args.seed)
299 |     np.random.seed(args.seed)
300 |     torch.manual_seed(args.seed)
301 |     os.makedirs(args.output_dir, exist_ok=True)
302 |     processor = processors[task_name]()
303 |     label_list = processor.get_labels(task_name)
304 | 
305 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
306 | 
307 |     train_examples = None
308 |     num_train_steps = None
309 |     train_examples = processor.get_train_examples(args.data_dir)
310 |     #dev_examples = processor.get_dev_examples(args.data_dir)
311 |     #train_examples.extend(dev_examples)
312 |     num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)
313 | 
314 |     # Prepare model
315 |     model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
316 | 
317 |     if task_name == 'stsa.fine':
318 |         model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
319 |         model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
320 |     elif task_name == 'TREC':
321 |         model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768)
322 |         model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
323 | 
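    # Pretrained bert-base ships a (2, 768) token_type (segment) embedding, which
    # the binary tasks reuse directly as label embeddings; for 5-way stsa.fine and
    # 6-way TREC it is replaced above by a freshly initialized (num_labels, 768)
    # table, since segment_ids now carry label ids in range(num_labels).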
324 | model.cuda() 325 | 326 | # Prepare optimizer 327 | param_optimizer = list(model.named_parameters()) 328 | no_decay = ['bias', 'gamma', 'beta'] 329 | optimizer_grouped_parameters = [ 330 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, 331 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} 332 | ] 333 | t_total = num_train_steps 334 | optimizer = BertAdam(optimizer_grouped_parameters,lr=args.learning_rate, 335 | warmup=args.warmup_proportion,t_total=t_total) 336 | 337 | global_step = 0 338 | train_features = convert_examples_to_features( 339 | train_examples, label_list, args.max_seq_length, tokenizer) 340 | logger.info("***** Running training *****") 341 | logger.info(" Num examples = %d", len(train_examples)) 342 | logger.info(" Batch size = %d", args.train_batch_size) 343 | logger.info(" Num steps = %d", num_train_steps) 344 | all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) 345 | all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) 346 | all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) 347 | all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) 348 | all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long) 349 | train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) 350 | train_sampler = RandomSampler(train_data) 351 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) 352 | 353 | model.train() 354 | 355 | save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) 356 | if not os.path.exists(save_model_dir): 357 | os.mkdir(save_model_dir) 358 | for e in trange(int(args.num_train_epochs), desc="Epoch"): 359 | avg_loss = 0. 360 | 361 | for step, batch in enumerate(train_dataloader): 362 | batch = tuple(t.cuda() for t in batch) 363 | _, input_ids, input_mask, segment_ids, masked_ids = batch 364 | loss = model(input_ids, segment_ids, input_mask, masked_ids) 365 | loss.backward() 366 | avg_loss += loss.item() 367 | optimizer.step() 368 | model.zero_grad() 369 | if (step + 1) % 50 == 0: 370 | print("avg_loss: {}".format(avg_loss / 50)) 371 | avg_loss = 0 372 | if save_every_epoch: 373 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 374 | save_model_path = os.path.join(save_model_dir, save_model_name) 375 | torch.save(model, save_model_path) 376 | else: 377 | if (e + 1) % 10 == 0: 378 | save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) 379 | save_model_path = os.path.join(save_model_dir, save_model_name) 380 | torch.save(model, save_model_path) 381 | 382 | 383 | if __name__ == "__main__": 384 | main() 385 | -------------------------------------------------------------------------------- /global.config: -------------------------------------------------------------------------------- 1 | { 2 | "dataset":"stsa.binary", 3 | "eval_model":"cnn" 4 | } 5 | -------------------------------------------------------------------------------- /nets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Sample script of recurrent neural network language model. 3 | 4 | This code is ported from the following implementation written in Torch. 
5 | https://github.com/tomsercu/lstm 6 | 7 | """ 8 | from __future__ import division 9 | from __future__ import print_function 10 | import argparse 11 | import json 12 | import warnings 13 | 14 | import numpy as np 15 | 16 | import chainer 17 | from chainer import cuda 18 | import chainer.functions as F 19 | import chainer.links as L 20 | from chainer import training 21 | from chainer.training import extensions 22 | from chainer import reporter 23 | 24 | embed_init = chainer.initializers.Uniform(.25) 25 | 26 | 27 | def embed_seq_batch(embed, seq_batch, dropout=0., context=None): 28 | x_len = [len(seq) for seq in seq_batch] 29 | x_section = np.cumsum(x_len[:-1]) 30 | ex = embed(F.concat(seq_batch, axis=0)) 31 | ex = F.dropout(ex, dropout) 32 | if context is not None: 33 | ids = [embed.xp.full((l, ), i).astype('i') 34 | for i, l in enumerate(x_len)] 35 | ids = embed.xp.concatenate(ids, axis=0) 36 | cx = F.embed_id(ids, context) 37 | ex = F.concat([ex, cx], axis=1) 38 | exs = F.split_axis(ex, x_section, 0) 39 | return exs 40 | 41 | 42 | class NormalOutputLayer(L.Linear): 43 | 44 | def __init__(self, *args, **kwargs): 45 | super(NormalOutputLayer, self).__init__(*args, **kwargs) 46 | 47 | def output_and_loss(self, h, t, reduce='mean'): 48 | logit = self(h) 49 | return F.softmax_cross_entropy( 50 | logit, t, normalize=False, reduce=reduce) 51 | 52 | def output(self, h, t=None): 53 | return self(h) 54 | 55 | 56 | class MLP(chainer.Chain): 57 | def __init__(self, n_hidden, in_units, hidden_units, out_units, dropout=0.): 58 | super(MLP, self).__init__() 59 | with self.init_scope(): 60 | self.l1 = L.Linear(in_units, hidden_units) 61 | self.lo = L.Linear(hidden_units, out_units) 62 | for i in range(2, n_hidden + 2): 63 | setattr(self, 'l{}'.format(i), 64 | L.Linear(hidden_units, hidden_units)) 65 | self.n_hidden = n_hidden 66 | self.dropout = dropout 67 | 68 | def __call__(self, x, label=None): 69 | x = self.l1(x) 70 | for i in range(2, self.n_hidden + 2): 71 | x = F.relu(x) 72 | x = F.dropout(x, self.dropout) 73 | x = getattr(self, 'l{}'.format(i))(x) 74 | x = F.relu(x) 75 | x = F.dropout(x, self.dropout) 76 | x = self.lo(x) 77 | x = F.relu(x) 78 | if hasattr(self, 'l1_label') and label is not None: 79 | x += self.l1_label(label) 80 | return x 81 | 82 | 83 | class BiLanguageModel(chainer.Chain): 84 | 85 | def __init__(self, n_vocab, n_units, n_layers=2, dropout=0.5): 86 | super(BiLanguageModel, self).__init__() 87 | with self.init_scope(): 88 | self.embed = L.EmbedID(n_vocab, n_units) 89 | RNN = L.NStepLSTM 90 | self.encoder_fw = RNN(n_layers, n_units, n_units, dropout) 91 | self.encoder_bw = RNN(n_layers, n_units, n_units, dropout) 92 | self.output = NormalOutputLayer(n_units, n_vocab) 93 | self.mlp = MLP(1, n_units * 2, n_units, n_units, dropout) 94 | self.dropout = dropout 95 | self.n_units = n_units 96 | self.n_layers = n_layers 97 | 98 | def add_label_condition_nets(self, n_labels, label_units): 99 | with self.init_scope(): 100 | self.mlp.add_link( 101 | 'l1_label', 102 | L.Linear(None, self.mlp.l1.b.size, nobias=True, 103 | initialW=chainer.initializers.Uniform(0.4))) 104 | self.n_labels = n_labels 105 | 106 | def encode(self, seq_batch, labels=None): 107 | seq_batch_wo_2bos = [seq[2::] for seq in seq_batch] 108 | revseq_batch_wo_2bos = [seq[::-1] for seq in seq_batch_wo_2bos] 109 | seq_batch_wo_2eos = [seq[:-2] for seq in seq_batch] 110 | bwe_seq_batch = self.embed_seq_batch(revseq_batch_wo_2bos) 111 | fwe_seq_batch = self.embed_seq_batch(seq_batch_wo_2eos) 112 | bwt_out_batch = 
self.encode_seq_batch(
113 |             bwe_seq_batch, self.encoder_bw)[-1]
114 |         fwt_out_batch = self.encode_seq_batch(
115 |             fwe_seq_batch, self.encoder_fw)[-1]
116 |         revbwt_concat = F.concat(
117 |             [b[::-1] for b in bwt_out_batch], axis=0)
118 |         fwt_concat = F.concat(fwt_out_batch, axis=0)
119 |         t_out_concat = F.concat([fwt_concat, revbwt_concat], axis=1)
120 |         t_out_concat = F.dropout(t_out_concat, self.dropout)
121 |         if hasattr(self.mlp, 'l1_label') and labels is not None:
122 |             labels = [[labels[i]] * f.shape[0]
123 |                       for i, f in enumerate(fwt_out_batch)]
124 |             labels = self.xp.concatenate(sum(labels, []), axis=0)
125 |             label_concat = self.xp.zeros(
126 |                 (t_out_concat.shape[0], self.n_labels)).astype('f')
127 |             label_concat[self.xp.arange(len(labels)), labels] = 1.
128 |             t_out_concat = self.mlp(t_out_concat, label_concat)
129 |         else:
130 |             t_out_concat = self.mlp(t_out_concat)
131 |         return t_out_concat
132 | 
133 |     def embed_seq_batch(self, x_seq_batch, context=None):
134 |         e_seq_batch = embed_seq_batch(
135 |             self.embed, x_seq_batch,
136 |             dropout=self.dropout,
137 |             context=context)
138 |         return e_seq_batch
139 | 
140 |     def encode_seq_batch(self, e_seq_batch, encoder):
141 |         hs, cs, y_seq_batch = encoder(None, None, e_seq_batch)
142 |         return hs, cs, y_seq_batch
143 | 
144 |     def calculate_loss(self, input_chain, **args):
145 |         seq_batch = sum(input_chain, [])
146 |         t_out_concat = self.encode(seq_batch)
147 |         seq_batch_mid = [seq[1:-1] for seq in seq_batch]
148 |         seq_mid_concat = F.concat(seq_batch_mid, axis=0)
149 |         n_tok = sum(len(s) for s in seq_batch_mid)
150 |         loss = self.output_and_loss_from_concat(
151 |             t_out_concat, seq_mid_concat,
152 |             normalize=n_tok)
153 |         reporter.report({'perp': self.xp.exp(loss.data)}, self)
154 |         return loss
155 | 
156 |     def output_and_loss_from_concat(self, y, t, normalize=None):
157 |         y = F.dropout(y, ratio=self.dropout)
158 |         loss = self.output.output_and_loss(y, t)
159 |         if normalize is not None:
160 |             loss *= 1. * t.shape[0] / normalize
161 |         else:
162 |             loss *= t.shape[0]
163 |         return loss
164 | 
165 |     def calculate_loss_with_labels(self, seq_batch_with_labels):
166 |         seq_batch, labels = seq_batch_with_labels
167 |         t_out_concat = self.encode(seq_batch, labels=labels)
168 |         seq_batch_mid = [seq[1:-1] for seq in seq_batch]
169 |         seq_mid_concat = F.concat(seq_batch_mid, axis=0)
170 |         n_tok = sum(len(s) for s in seq_batch_mid)
171 |         loss = self.output_and_loss_from_concat(
172 |             t_out_concat, seq_mid_concat,
173 |             normalize=n_tok)
174 |         reporter.report({'perp': self.xp.exp(loss.data)}, self)
175 |         return loss
176 | 
177 |     def predict(self, xs, labels=None):
178 |         with chainer.using_config('train', False), chainer.no_backprop_mode():
179 |             t_out_concat = self.encode(xs, labels=labels)
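            # self.output holds the vocabulary projection; softmax turns the bi-LM
            # state at each middle position into a distribution over words.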
180 |             prob_concat = F.softmax(self.output.output(t_out_concat)).data
181 |         x_len = [len(x) for x in xs]
182 |         x_section = np.cumsum(x_len[:-1])
183 |         ps = np.split(cuda.to_cpu(prob_concat), x_section, 0)
184 |         return ps
185 | 
186 |     def predict_embed(self,
187 |                       xs, embedW,
188 |                       labels=None,
189 |                       dropout=0.,
190 |                       mode='sampling',
191 |                       temp=1.,
192 |                       word_lower_bound=0.,
193 |                       gold_lower_bound=0.,
194 |                       gumbel=True,
195 |                       residual=0.,
196 |                       wordwise=True,
197 |                       add_original=0.,
198 |                       augment_ratio=0.25):
199 |         x_len = [len(x) for x in xs]
200 |         with chainer.using_config('train', False), chainer.no_backprop_mode():
201 |             t_out_concat = self.encode(xs, labels=labels)
202 |             prob_concat = self.output.output(t_out_concat).data
203 |             prob_concat /= temp
204 |             prob_concat += self.xp.random.gumbel(
205 |                 size=prob_concat.shape).astype('f')
206 |             prob_concat = F.softmax(prob_concat).data
207 | 
208 |         out_concat = F.embed_id(
209 |             self.xp.argmax(prob_concat, axis=1).astype(np.int32), embedW)
210 | 
211 |         # insert eos
212 |         eos = embedW[0][None]
213 |         new_out = []
214 |         count = 0
215 |         for i, x in enumerate(xs):
216 |             new_out.append(eos)
217 |             new_out.append(out_concat[count:count + len(x) - 2])
218 |             new_out.append(eos)
219 |             count += len(x) - 2
220 |         out_concat = F.concat(new_out, axis=0)
221 | 
222 |         def embed_func(x): return F.embed_id(x, embedW, ignore_label=-1)
223 |         raw_concat = F.concat(
224 |             sequence_embed(embed_func, xs, self.dropout), axis=0)
225 |         b, u = raw_concat.shape
226 | 
227 |         mask = self.xp.broadcast_to(
228 |             (self.xp.random.rand(b, 1) < augment_ratio),
229 |             raw_concat.shape)
230 |         out_concat = F.where(mask, out_concat, raw_concat)
231 | 
232 |         x_len = [len(x) for x in xs]
233 |         x_section = np.cumsum(x_len[:-1])
234 |         out_concat = F.dropout(out_concat, dropout)
235 |         exs = F.split_axis(out_concat, x_section, 0)
236 |         return exs
237 | 
238 | 
239 | def sequence_embed(embed, xs, dropout=0.):
240 |     """Efficient embedding function for variable-length sequences
241 | 
242 |     This output is equivalent to
243 |     "return [F.dropout(embed(x), ratio=dropout) for x in xs]".
244 |     However, this version embeds all sequences in a single call, which is faster.
245 | 
246 |     Args:
247 |         embed (callable): A :func:`~chainer.functions.embed_id` function
248 |             or :class:`~chainer.links.EmbedID` link.
249 |         xs (list of :class:`~chainer.Variable` or :class:`numpy.ndarray` or \
250 |         :class:`cupy.ndarray`): i-th element in the list is an input variable,
251 |             which is a :math:`(L_i, )`-shaped int array.
252 |         dropout (float): Dropout ratio.
253 | 
254 |     Returns:
255 |         list of ~chainer.Variable: Output variables. i-th element in the
256 |         list is an output variable, which is a :math:`(L_i, N)`-shaped
257 |         float array. :math:`(N)` is the number of dimensions of word embedding.
258 | 
259 |     """
260 |     x_len = [len(x) for x in xs]
261 |     x_section = np.cumsum(x_len[:-1])
262 |     ex = embed(F.concat(xs, axis=0))
263 |     ex = F.dropout(ex, ratio=dropout)
264 |     exs = F.split_axis(ex, x_section, 0)
265 |     return exs
266 | 
267 | 
268 | def block_embed(embed, x, dropout=0.):
269 |     """Embedding function followed by convolution
270 | 
271 |     Args:
272 |         embed (callable): A :func:`~chainer.functions.embed_id` function
273 |             or :class:`~chainer.links.EmbedID` link.
274 |         x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
275 |         :class:`cupy.ndarray`): Input variable, which
276 |             is a :math:`(B, L)`-shaped int array. Its first dimension
277 |             :math:`(B)` is assumed to be the *minibatch dimension*.
278 | The second dimension :math:`(L)` is the length of padded 279 | sentences. 280 | dropout (float): Dropout ratio. 281 | 282 | Returns: 283 | ~chainer.Variable: Output variable. A float array with shape 284 | of :math:`(B, N, L, 1)`. :math:`(N)` is the number of dimensions 285 | of word embedding. 286 | 287 | """ 288 | e = embed(x) 289 | e = F.dropout(e, ratio=dropout) 290 | e = F.transpose(e, (0, 2, 1)) 291 | e = e[:, :, :, None] 292 | return e 293 | 294 | 295 | class PredictiveEmbed(chainer.Chain): 296 | def __init__(self, n_vocab, n_units, bilm, 297 | dropout=0., initialW=embed_init): 298 | super(PredictiveEmbed, self).__init__() 299 | with self.init_scope(): 300 | self.embed = L.EmbedID(n_vocab, n_units, ignore_label=-1, 301 | initialW=initialW) 302 | self.bilm = bilm 303 | self.n_vocab = n_vocab 304 | self.n_units = n_units 305 | self.dropout = dropout 306 | 307 | def __call__(self, x): 308 | return self.embed(x) 309 | 310 | def setup(self, 311 | mode='weighted_sum', 312 | temp=1., 313 | word_lower_bound=0., 314 | gold_lower_bound=0., 315 | gumbel=True, 316 | residual=0., 317 | wordwise=True, 318 | add_original=1., 319 | augment_ratio=0.5, 320 | ignore_unk=-1): 321 | self.config = { 322 | 'dropout': self.dropout, 323 | 'mode': mode, 324 | 'temp': temp, 325 | 'word_lower_bound': 0., 326 | 'gold_lower_bound': 0., 327 | 'gumbel': gumbel, 328 | 'residual': residual, 329 | 'wordwise': wordwise, 330 | 'add_original': add_original, 331 | 'augment_ratio': augment_ratio 332 | } 333 | if ignore_unk >= 0: 334 | self.bilm.output.b.data[ignore_unk] = -1e5 335 | 336 | def embed_xs(self, xs, batch='concat'): 337 | if batch == 'concat': 338 | x_block = chainer.dataset.convert.concat_examples(xs, padding=-1) 339 | ex_block = block_embed(self.embed, x_block, self.dropout) 340 | return ex_block 341 | elif batch == 'list': 342 | exs = sequence_embed(self.embed, xs, self.dropout) 343 | return exs 344 | else: 345 | raise NotImplementedError 346 | 347 | def embed_xs_with_prediction(self, xs, labels=None, batch='concat'): 348 | predicted_exs = self.bilm.predict_embed( 349 | xs, self.embed.W, 350 | labels=labels, 351 | dropout=self.config['dropout'], 352 | mode=self.config['mode'], 353 | temp=self.config['temp'], 354 | word_lower_bound=self.config['word_lower_bound'], 355 | gold_lower_bound=self.config['gold_lower_bound'], 356 | gumbel=self.config['gumbel'], 357 | residual=self.config['residual'], 358 | wordwise=self.config['wordwise'], 359 | add_original=self.config['add_original'], 360 | augment_ratio=self.config['augment_ratio']) 361 | if batch == 'concat': 362 | predicted_ex_block = F.pad_sequence(predicted_exs, padding=0.) 
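            # F.pad_sequence yields a (B, L, N) block; the transpose below reshapes
            # it to (B, N, L, 1), the same layout block_embed produces for the CNN
            # encoder's convolutions.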
363 | predicted_ex_block = F.transpose( 364 | predicted_ex_block, (0, 2, 1))[:, :, :, None] 365 | return predicted_ex_block 366 | elif batch == 'list': 367 | return predicted_exs 368 | else: 369 | raise NotImplementedError 370 | -------------------------------------------------------------------------------- /text_classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__init__.py -------------------------------------------------------------------------------- /text_classification/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /text_classification/__pycache__/nets.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/nets.cpython-36.pyc -------------------------------------------------------------------------------- /text_classification/__pycache__/nlp_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/nlp_utils.cpython-36.pyc -------------------------------------------------------------------------------- /text_classification/__pycache__/text_datasets.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IIEKES/cbert_aug_deprecated/3f88a7e34834f0384a1d8dd46669b41827f4dd4b/text_classification/__pycache__/text_datasets.cpython-36.pyc -------------------------------------------------------------------------------- /text_classification/nets.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import chainer 4 | import chainer.functions as F 5 | import chainer.links as L 6 | from chainer import reporter 7 | 8 | embed_init = chainer.initializers.Uniform(.25) 9 | 10 | 11 | def sequence_embed(embed, xs, dropout=0.): 12 | """Efficient embedding function for variable-length sequences 13 | 14 | This output is equally to 15 | "return [F.dropout(embed(x), ratio=dropout) for x in xs]". 16 | However, calling the functions is one-shot and faster. 17 | 18 | Args: 19 | embed (callable): A :func:`~chainer.functions.embed_id` function 20 | or :class:`~chainer.links.EmbedID` link. 21 | xs (list of :class:`~chainer.Variable` or :class:`numpy.ndarray` or \ 22 | :class:`cupy.ndarray`): i-th element in the list is an input variable, 23 | which is a :math:`(L_i, )`-shaped int array. 24 | dropout (float): Dropout ratio. 25 | 26 | Returns: 27 | list of ~chainer.Variable: Output variables. i-th element in the 28 | list is an output variable, which is a :math:`(L_i, N)`-shaped 29 | float array. :math:`(N)` is the number of dimensions of word embedding. 
30 | 
31 |     """
32 |     x_len = [len(x) for x in xs]
33 |     x_section = numpy.cumsum(x_len[:-1])
34 |     ex = embed(F.concat(xs, axis=0))
35 |     ex = F.dropout(ex, ratio=dropout)
36 |     exs = F.split_axis(ex, x_section, 0)
37 |     return exs
38 | 
39 | 
40 | def block_embed(embed, x, dropout=0.):
41 |     """Embedding function followed by convolution
42 | 
43 |     Args:
44 |         embed (callable): A :func:`~chainer.functions.embed_id` function
45 |             or :class:`~chainer.links.EmbedID` link.
46 |         x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
47 |         :class:`cupy.ndarray`): Input variable, which
48 |             is a :math:`(B, L)`-shaped int array. Its first dimension
49 |             :math:`(B)` is assumed to be the *minibatch dimension*.
50 |             The second dimension :math:`(L)` is the length of padded
51 |             sentences.
52 |         dropout (float): Dropout ratio.
53 | 
54 |     Returns:
55 |         ~chainer.Variable: Output variable. A float array with shape
56 |         of :math:`(B, N, L, 1)`. :math:`(N)` is the number of dimensions
57 |         of word embedding.
58 | 
59 |     """
60 |     e = embed(x)
61 |     e = F.dropout(e, ratio=dropout)
62 |     e = F.transpose(e, (0, 2, 1))
63 |     e = e[:, :, :, None]
64 |     return e
65 | 
66 | 
67 | class TextClassifier(chainer.Chain):
68 | 
69 |     """A classifier using a given encoder.
70 | 
71 |     This chain encodes a sentence and classifies it into classes.
72 | 
73 |     Args:
74 |         encoder (Link): A callable encoder, which extracts a feature.
75 |             Input is a list of variables whose shapes are
76 |             "(sentence_length, )".
77 |             Output is a variable whose shape is "(batchsize, n_units)".
78 |         n_class (int): The number of classes to be predicted.
79 | 
80 |     """
81 | 
82 |     def __init__(self, encoder, n_class, dropout=0.1):
83 |         super(TextClassifier, self).__init__()
84 |         with self.init_scope():
85 |             self.encoder = encoder
86 |             self.output = L.Linear(encoder.out_units, n_class)
87 |         self.dropout = dropout
88 | 
89 |     def __call__(self, xs, ys=None):
90 |         if ys is None:
91 |             xs, ys = xs
92 |         concat_outputs = self.predict(xs, ys=ys)
93 |         concat_truths = F.concat(ys, axis=0)
94 | 
95 |         loss = F.softmax_cross_entropy(concat_outputs, concat_truths)
96 |         accuracy = F.accuracy(concat_outputs, concat_truths)
97 |         reporter.report({'loss': loss.data}, self)
98 |         reporter.report({'accuracy': accuracy.data}, self)
99 |         return loss
100 | 
101 |     def predict(self, xs, ys=None, softmax=False, argmax=False):
102 |         concat_encodings = F.dropout(self.encoder(xs, labels=ys),
103 |                                      ratio=self.dropout)
104 |         concat_outputs = self.output(concat_encodings)
105 |         if softmax:
106 |             return F.softmax(concat_outputs).data
107 |         elif argmax:
108 |             return self.xp.argmax(concat_outputs.data, axis=1)
109 |         else:
110 |             return concat_outputs
111 | 
112 | 
113 | class RNNEncoder(chainer.Chain):
114 | 
115 |     """An LSTM-RNN Encoder with Word Embedding.
116 | 
117 |     This model encodes a sentence sequentially using LSTM.
118 | 
119 |     Args:
120 |         n_layers (int): The number of LSTM layers.
121 |         n_vocab (int): The size of vocabulary.
122 |         n_units (int): The number of units of a LSTM layer and word embedding.
123 |         dropout (float): The dropout ratio.
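        The encoder returns the last hidden state of the top LSTM layer,
        a (batchsize, n_units)-shaped float array.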
124 | 
125 |     """
126 | 
127 |     def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
128 |         super(RNNEncoder, self).__init__(
129 |             embed=L.EmbedID(n_vocab, n_units,
130 |                             initialW=embed_init),
131 |             encoder=L.NStepLSTM(n_layers, n_units, n_units, dropout),
132 |         )
133 |         self.n_layers = n_layers
134 |         self.out_units = n_units
135 |         self.dropout = dropout
136 |         self.use_predict_embed = False
137 | 
138 |     def __call__(self, xs, labels=None):
139 |         exs = sequence_embed(self.embed, xs, self.dropout)
140 |         if self.use_predict_embed and chainer.config.train:
141 |             exs = self.embed.embed_xs_with_prediction(
142 |                 xs, labels=labels, batch='list')
143 |         last_h, last_c, ys = self.encoder(None, None, exs)
144 |         assert(last_h.shape == (self.n_layers, len(xs), self.out_units))
145 |         concat_outputs = last_h[-1]
146 |         return concat_outputs
147 | 
148 | 
149 | class CNNEncoder(chainer.Chain):
150 | 
151 |     """A CNN encoder with word embedding.
152 | 
153 |     This model encodes a sentence as a set of n-gram chunks
154 |     using convolutional filters.
155 |     Following the convolution, max-pooling is applied over time.
156 |     Finally, the output is fed into a multilayer perceptron.
157 | 
158 |     Args:
159 |         n_layers (int): The number of layers of MLP.
160 |         n_vocab (int): The size of vocabulary.
161 |         n_units (int): The number of units of MLP and word embedding.
162 |         dropout (float): The dropout ratio.
163 | 
164 |     """
165 | 
166 |     def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
167 |         out_units = n_units // 3
168 |         super(CNNEncoder, self).__init__(
169 |             embed=L.EmbedID(n_vocab, n_units, ignore_label=-1,
170 |                             initialW=embed_init),
171 |             cnn_w3=L.Convolution2D(
172 |                 n_units, out_units, ksize=(3, 1), stride=1, pad=(2, 0),
173 |                 nobias=True),
174 |             cnn_w4=L.Convolution2D(
175 |                 n_units, out_units, ksize=(4, 1), stride=1, pad=(3, 0),
176 |                 nobias=True),
177 |             cnn_w5=L.Convolution2D(
178 |                 n_units, out_units, ksize=(5, 1), stride=1, pad=(4, 0),
179 |                 nobias=True),
180 |             mlp=MLP(n_layers, out_units * 3, dropout)
181 |         )
182 |         self.out_units = out_units * 3
183 |         self.dropout = dropout
184 |         self.use_predict_embed = False
185 | 
186 |     def __call__(self, xs, labels=None):
187 |         x_block = chainer.dataset.convert.concat_examples(xs, padding=-1)
188 |         ex_block = block_embed(self.embed, x_block, self.dropout)
189 |         if self.use_predict_embed and chainer.config.train:
190 |             ex_block = self.embed.embed_xs_with_prediction(
191 |                 xs, labels=labels, batch='concat')
192 |         h_w3 = F.max(self.cnn_w3(ex_block), axis=2)
193 |         h_w4 = F.max(self.cnn_w4(ex_block), axis=2)
194 |         h_w5 = F.max(self.cnn_w5(ex_block), axis=2)
195 |         h = F.concat([h_w3, h_w4, h_w5], axis=1)
196 |         h = F.relu(h)
197 |         h = F.dropout(h, ratio=self.dropout)
198 |         h = self.mlp(h)
199 |         return h
200 | 
201 | 
202 | class MLP(chainer.ChainList):
203 | 
204 |     """A multilayer perceptron.
205 | 
206 |     Args:
207 |         n_layers (int): The number of layers of MLP.
208 |         n_units (int): The number of units in a hidden or output layer.
209 |         dropout (float): The dropout ratio.
210 | 
211 |     """
212 | 
213 |     def __init__(self, n_layers, n_units, dropout=0.1):
214 |         super(MLP, self).__init__()
215 |         for i in range(n_layers):
216 |             self.add_link(L.Linear(None, n_units))
217 |         self.dropout = dropout
218 |         self.out_units = n_units
219 | 
220 |     def __call__(self, x):
221 |         for i, link in enumerate(self.children()):
222 |             x = F.dropout(x, ratio=self.dropout)
223 |             x = F.relu(link(x))
224 |         return x
225 | 
226 | 
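# Editorial sketch (not part of the original file; `vocab`, `xs`, and `ys`
# are illustrative only) of how these pieces compose:
#
#     encoder = CNNEncoder(n_layers=1, n_vocab=len(vocab), n_units=300)
#     model = TextClassifier(encoder, n_class=2)
#     loss = model(xs, ys)  # xs: list of int arrays, ys: list of (1,) arrays
#
# Note that CNNEncoder.out_units is (n_units // 3) * 3, the concatenation
# of the tri-, four-, and five-gram max-pooled feature maps.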
227 | class BOWEncoder(chainer.Chain):
228 | 
229 |     """A BoW encoder with word embedding.
230 | 
231 |     This model encodes a sentence as just a set of words by averaging.
232 | 
233 |     Args:
234 |         n_vocab (int): The size of vocabulary.
235 |         n_units (int): The number of units of word embedding.
236 |         dropout (float): The dropout ratio.
237 | 
238 |     """
239 | 
240 |     def __init__(self, n_vocab, n_units, dropout=0.1):
241 |         super(BOWEncoder, self).__init__(
242 |             embed=L.EmbedID(n_vocab, n_units, ignore_label=-1,
243 |                             initialW=embed_init),
244 |         )
245 |         self.out_units = n_units
246 |         self.dropout = dropout
247 | 
248 |     def __call__(self, xs):
249 |         x_block = chainer.dataset.convert.concat_examples(xs, padding=-1)
250 |         ex_block = block_embed(self.embed, x_block)
251 |         x_len = self.xp.array([len(x) for x in xs], 'i')[:, None, None]
252 |         h = F.sum(ex_block, axis=2) / x_len
253 |         return h
254 | 
255 | 
256 | class BOWMLPEncoder(chainer.Chain):
257 | 
258 |     """A BOW encoder with word embedding and MLP.
259 | 
260 |     This model encodes a sentence as just a set of words by averaging.
261 |     Additionally, its output is fed into a multilayer perceptron.
262 | 
263 |     Args:
264 |         n_layers (int): The number of layers of MLP.
265 |         n_vocab (int): The size of vocabulary.
266 |         n_units (int): The number of units of MLP and word embedding.
267 |         dropout (float): The dropout ratio.
268 | 
269 |     """
270 | 
271 |     def __init__(self, n_layers, n_vocab, n_units, dropout=0.1):
272 |         super(BOWMLPEncoder, self).__init__(
273 |             bow_encoder=BOWEncoder(n_vocab, n_units, dropout),
274 |             mlp_encoder=MLP(n_layers, n_units, dropout)
275 |         )
276 |         self.out_units = n_units
277 | 
278 |     def __call__(self, xs):
279 |         h = self.bow_encoder(xs)
280 |         h = self.mlp_encoder(h)
281 |         return h
282 | 
--------------------------------------------------------------------------------
/text_classification/nlp_utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import io
3 | 
4 | import numpy
5 | 
6 | import chainer
7 | from chainer import cuda
8 | 
9 | 
10 | def split_text(text, char_based=False):
11 |     if char_based:
12 |         return list(text)
13 |     else:
14 |         return text.split()
15 | 
16 | 
17 | def normalize_text(text):
18 |     return text.strip().lower()
19 | 
20 | 
21 | def make_vocab(dataset, max_vocab_size=50000, min_freq=1):
22 |     counts = collections.defaultdict(int)
23 |     for tokens, _ in dataset:
24 |         for token in tokens:
25 |             counts[token] += 1
26 | 
27 |     vocab = {'<eos>': 0, '<unk>': 1}
28 |     for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
29 |         if len(vocab) >= max_vocab_size or c < min_freq:
30 |             break
31 |         vocab[w] = len(vocab)
32 |     return vocab
33 | 
34 | 
35 | def read_vocab_list(path, max_vocab_size=20000):
36 |     vocab = {'<eos>': 0, '<unk>': 1}
37 |     with io.open(path, encoding='utf-8', errors='ignore') as f:
38 |         for l in f:
39 |             w = l.strip()
40 |             if w not in vocab and w:
41 |                 vocab[w] = len(vocab)
42 |             if len(vocab) >= max_vocab_size:
43 |                 break
44 |     return vocab
45 | 
46 | 
47 | def make_array(tokens, vocab, add_eos=True, add_bos=True):
48 |     unk_id = vocab['<unk>']
49 |     eos_id = vocab['<eos>']
50 |     ids = [vocab.get(token, unk_id) for token in tokens]
51 |     if add_eos:
52 |         ids.append(eos_id)
53 |     if add_bos:
54 |         ids = [eos_id] + ids
55 |     return numpy.array(ids, 'i')
56 | 
57 | 
58 | def transform_to_array(dataset, vocab, with_label=True):
59 |     if with_label:
60 |         return [(make_array(tokens, vocab), numpy.array([cls], 'i'))
61 |                 for tokens, cls in dataset]
62 |     else:
63 |         return [make_array(tokens, vocab)
64 |                 for tokens in dataset]
65 | 
66 | 
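# Editorial sketch (not part of the original file): with
# vocab = {'<eos>': 0, '<unk>': 1, 'good': 2}, make_array(['good', 'movie'],
# vocab) yields array([0, 2, 1, 0]) -- an <eos> id prepended as BOS, 'movie'
# mapped to <unk>, and a trailing <eos> appended.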
67 | def convert_seq(batch, device=None, with_label=True):
68 |     def to_device_batch(batch):
69 |         if device is None:
70 |             return batch
71 |         elif device < 0:
72 |             return [chainer.dataset.to_device(device, x) for x in batch]
73 |         else:
74 |             xp = cuda.cupy.get_array_module(*batch)
75 |             concat = xp.concatenate(batch, axis=0)
76 |             sections = numpy.cumsum([len(x) for x in batch[:-1]], dtype='i')
77 |             concat_dev = chainer.dataset.to_device(device, concat)
78 |             batch_dev = cuda.cupy.split(concat_dev, sections)
79 |             return batch_dev
80 | 
81 |     if with_label:
82 |         return [to_device_batch([x for x, _ in batch]),
83 |                 to_device_batch([y for _, y in batch])]
84 |         # return {'xs': to_device_batch([x for x, _ in batch]),
85 |         #         'ys': to_device_batch([y for _, y in batch])}
86 |     else:
87 |         return to_device_batch([x for x in batch])
--------------------------------------------------------------------------------
/text_classification/text_datasets.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import glob
3 | import io
4 | import os
5 | import shutil
6 | import tarfile
7 | import tempfile
8 | 
9 | import numpy
10 | 
11 | import chainer
12 | 
13 | from .nlp_utils import make_vocab
14 | from .nlp_utils import normalize_text
15 | from .nlp_utils import split_text
16 | from .nlp_utils import transform_to_array
17 | import json
18 | URL_DBPEDIA = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'  # NOQA
19 | URL_IMDB = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
20 | URL_OTHER_BASE = 'https://raw.githubusercontent.com/harvardnlp/sent-conv-torch/master/data/'  # NOQA
21 | 
22 | 
23 | def download_dbpedia():
24 |     path = chainer.dataset.cached_download(URL_DBPEDIA)
25 |     tf = tarfile.open(path, 'r')
26 |     return tf
27 | 
28 | 
29 | def read_dbpedia(tf, split, shrink=1, char_based=False):
30 |     dataset = []
31 |     f = tf.extractfile('dbpedia_csv/{}.csv'.format(split))
32 |     for i, (label, title, text) in enumerate(csv.reader(f)):
33 |         if i % shrink != 0:
34 |             continue
35 |         label = int(label) - 1  # Index begins from 1
36 |         tokens = split_text(normalize_text(text), char_based)
37 |         dataset.append((tokens, label))
38 |     return dataset
39 | 
40 | 
41 | def get_dbpedia(vocab=None, shrink=1, char_based=False):
42 |     tf = download_dbpedia()
43 | 
44 |     print('read dbpedia')
45 |     train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
46 |     test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)
47 | 
48 |     if vocab is None:
49 |         print('construct vocabulary based on frequency')
50 |         vocab = make_vocab(train)
51 | 
52 |     train = transform_to_array(train, vocab)
53 |     test = transform_to_array(test, vocab)
54 | 
55 |     return train, test, vocab
56 | 
57 | 
58 | def download_imdb():
59 |     path = chainer.dataset.cached_download(URL_IMDB)
60 |     tf = tarfile.open(path, 'r')
61 |     # To read many files fast, the tarfile is untarred
62 |     path = tempfile.mkdtemp()
63 |     tf.extractall(path)
64 |     return path
65 | 
66 | 
67 | def read_imdb(path, split,
68 |               shrink=1, fine_grained=False, char_based=False):
69 |     fg_label_dict = {'1': 0, '2': 0, '3': 1, '4': 1,
70 |                      '7': 2, '8': 2, '9': 3, '10': 3}
71 | 
72 |     def read_and_label(posneg, label):
73 |         dataset = []
74 |         target = os.path.join(path, 'aclImdb', split, posneg, '*')
75 |         for i, f_path in enumerate(glob.glob(target)):
76 |             if i % shrink != 0:
77 |                 continue
78 |             with io.open(f_path, encoding='utf-8', errors='ignore') as f:
79 |                 text = f.read().strip()
80 |             tokens = split_text(normalize_text(text), char_based)
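            # With fine_grained=True, the star rating embedded in the file
            # name (e.g. pos/200_8.txt -> '8') replaces the binary pos/neg
            # label and is bucketed into four classes via fg_label_dict.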
81 |             if fine_grained:
82 |                 # extract from f_path. e.g. /pos/200_8.txt -> 8
83 |                 label = fg_label_dict[f_path.split('_')[-1][:-4]]
84 |                 dataset.append((tokens, label))
85 |             else:
86 |                 dataset.append((tokens, label))
87 |         return dataset
88 | 
89 |     pos_dataset = read_and_label('pos', 0)
90 |     neg_dataset = read_and_label('neg', 1)
91 |     return pos_dataset + neg_dataset
92 | 
93 | 
94 | def get_imdb(vocab=None, shrink=1, fine_grained=False,
95 |              char_based=False):
96 |     tmp_path = download_imdb()
97 | 
98 |     print('read imdb')
99 |     train = read_imdb(tmp_path, 'train',
100 |                       shrink=shrink, fine_grained=fine_grained,
101 |                       char_based=char_based)
102 |     test = read_imdb(tmp_path, 'test',
103 |                      shrink=shrink, fine_grained=fine_grained,
104 |                      char_based=char_based)
105 | 
106 |     shutil.rmtree(tmp_path)
107 | 
108 |     if vocab is None:
109 |         print('construct vocabulary based on frequency')
110 |         vocab = make_vocab(train)
111 | 
112 |     train = transform_to_array(train, vocab)
113 |     test = transform_to_array(test, vocab)
114 | 
115 |     return train, test, vocab
116 | 
117 | 
118 | def download_other_dataset(name):
119 |     if name in ['custrev', 'mpqa', 'rt-polarity', 'subj']:
120 |         files = [name + '.all']
121 |     elif name == 'TREC':
122 |         files = [name + suff for suff in ['.train.all', '.test.all']]
123 |     else:
124 |         files = [name + suff for suff in ['.train', '.test']]
125 |     file_paths = []
126 |     for f_name in files:
127 |         url = os.path.join(URL_OTHER_BASE, f_name)
128 |         path = chainer.dataset.cached_download(url)
129 |         file_paths.append(path)
130 |     return file_paths
131 | 
132 | 
133 | def read_other_dataset(path, shrink=1, char_based=False):
134 |     dataset = []
135 |     with io.open(path, encoding='utf-8', errors='ignore') as f:
136 |         for i, l in enumerate(f):
137 |             if i % shrink != 0 or not len(l.strip()) >= 3:
138 |                 continue
139 |             label, text = l.strip().split(None, 1)
140 |             label = int(label)
141 |             tokens = split_text(normalize_text(text), char_based)
142 |             dataset.append((tokens, label))
143 |     return dataset
144 | 
145 | 
146 | def _read_tsv(input_file, quotechar=None):
147 |     """Reads a tab separated value file."""
148 |     with open(input_file, "r") as f:
149 |         reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
150 |         lines = []
151 |         for line in reader:
152 |             lines.append((split_text(normalize_text(line[0])), line[1]))
153 |         return lines[1:]
154 | 
155 | def read_text_dataset(name, vocab=None, dir="datasets"):
156 |     assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
157 |                     'custrev', 'mpqa', 'rt-polarity', 'subj'])
158 |     train_path = os.path.join(dir, name, "train.tsv")
159 |     eval_path = os.path.join("datasets", name, "dev.tsv")
160 |     test_path = os.path.join("datasets", name, "test.tsv")
161 | 
162 |     train = _read_tsv(train_path)
163 |     eval = _read_tsv(eval_path)
164 |     test = _read_tsv(test_path)
165 | 
166 |     if vocab is None:
167 |         print('construct vocabulary based on frequency')
168 |         all_data = []
169 |         all_data.extend(train)
170 |         all_data.extend(eval)
171 |         vocab = make_vocab(all_data)
172 | 
173 |     train = transform_to_array(train, vocab)
174 |     eval = transform_to_array(eval, vocab)
175 |     test = transform_to_array(test, vocab)
176 | 
177 |     return train, eval, test, vocab
178 | 
179 | def get_other_text_dataset(name, vocab=None, shrink=1,
180 |                            char_based=False, seed=777):
181 |     assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
182 |                     'custrev', 'mpqa', 'rt-polarity', 'subj'])
183 |     datasets = download_other_dataset(name)
184 |     train = read_other_dataset(
185 |         datasets[0], shrink=shrink, char_based=char_based)
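    # Editorial note: datasets shipped as a single '.all' file have no
    # official split; the branch below holds out the last 10% of a seed-777
    # shuffle as the test set.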
186 |     if len(datasets) == 2:
187 |         test = read_other_dataset(
188 |             datasets[1], shrink=shrink, char_based=char_based)
189 |     else:
190 |         numpy.random.seed(seed)
191 |         alldata = numpy.random.permutation(train)
192 |         train = alldata[:-len(alldata) // 10]
193 |         test = alldata[-len(alldata) // 10:]
194 | 
195 |     if vocab is None:
196 |         print('construct vocabulary based on frequency')
197 |         vocab = make_vocab(train)
198 | 
199 |     train = transform_to_array(train, vocab)
200 |     test = transform_to_array(test, vocab)
201 | 
202 |     return train, test, vocab
203 | 
--------------------------------------------------------------------------------
/train_text_classifier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from __future__ import print_function
4 | 
5 | import argparse
6 | import datetime
7 | import json
8 | import os
9 | import numpy
10 | 
11 | import chainer
12 | from chainer import training
13 | from chainer.training import extensions
14 | import cupy
15 | import nets as bilm_nets
16 | 
17 | from text_classification import nets as class_nets
18 | from text_classification.nlp_utils import convert_seq
19 | from text_classification import text_datasets
20 | 
21 | from evaluator import MicroEvaluator
22 | 
23 | import args_of_text_classifier
24 | from utils import UnkDropout, Outer
25 | 
26 | #cupy.cuda.set_allocator(None)
27 | #cupy.cuda.set_pinned_memory_allocator(None)
28 | 
29 | # default local parameters -- no need to change
30 | parser = args_of_text_classifier.get_basic_arg_parser()
31 | args = parser.parse_args()
32 | 
33 | # load global parameters
34 | with open("global.config", 'rb') as f:
35 |     configs_dict = json.load(f)
36 | 
37 | args.dataset = configs_dict.get("dataset")
38 | args.model = configs_dict.get("eval_model")
39 | 
40 | def main():
41 | 
42 |     print(json.dumps(args.__dict__, indent=2))
43 |     train(dir="aug_data", print_log=True)
44 | 
45 | 
46 | def train(dir="datasets", print_log=False):
47 |     chainer.CHAINER_SEED = args.seed
48 |     numpy.random.seed(args.seed)
49 | 
50 |     vocab = None
51 | 
52 |     # Load a dataset
53 |     if args.dataset == 'dbpedia':
54 |         train, test, vocab = text_datasets.get_dbpedia(
55 |             vocab=vocab)
56 |     elif args.dataset.startswith('imdb.'):
57 |         train, test, vocab = text_datasets.get_imdb(
58 |             fine_grained=args.dataset.endswith('.fine'),
59 |             vocab=vocab)
60 |     elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
61 |                           'custrev', 'mpqa', 'rt-polarity', 'subj']:
62 |         train, test, real_test, vocab = text_datasets.read_text_dataset(
63 |             args.dataset, vocab=None, dir=dir)
64 |     #train, test, vocab = text_datasets.get_other_text_dataset(
65 |     #    args.dataset, vocab=vocab)
66 |     #if args.validation:
67 |     #    real_test = test
68 |     #    dataset_pairs = chainer.datasets.get_cross_validation_datasets_random(
69 |     #        train, 10, seed=777)
70 |     #    train, test = dataset_pairs[0]
71 | 
72 |     print('# train data: {}'.format(len(train)))
73 |     print('# test data: {}'.format(len(test)))
74 |     print('# vocab: {}'.format(len(vocab)))
75 |     n_class = len(set([int(d[1]) for d in train]))
76 |     print('# class: {}'.format(n_class))
77 | 
78 |     chainer.CHAINER_SEED = args.seed
79 |     numpy.random.seed(args.seed)
80 |     train = UnkDropout(train, vocab['<unk>'], 0.01)
81 |     train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
82 |     test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
83 |                                                  repeat=False, shuffle=False)
84 | 
85 |     # Setup a model
86 |     chainer.CHAINER_SEED = args.seed
87 |     numpy.random.seed(args.seed)
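    # Editorial note: args.model comes from the "eval_model" field of
    # global.config ('rnn', 'cnn', or 'bow'); any other value leaves
    # Encoder unassigned and the construction below fails.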
88 |     if args.model == 'rnn':
89 |         Encoder = class_nets.RNNEncoder
90 |     elif args.model == 'cnn':
91 |         Encoder = class_nets.CNNEncoder
92 |     elif args.model == 'bow':
93 |         Encoder = class_nets.BOWMLPEncoder
94 |     encoder = Encoder(n_layers=args.layer, n_vocab=len(vocab),
95 |                       n_units=args.unit, dropout=args.dropout)
96 |     model = class_nets.TextClassifier(encoder, n_class)
97 | 
98 |     if args.bilm:
99 |         bilm = bilm_nets.BiLanguageModel(
100 |             len(vocab), args.bilm_unit, args.bilm_layer, args.bilm_dropout)
101 |         n_labels = len(set([int(v[1]) for v in test]))
102 |         print('# labels =', n_labels)
103 |         if not args.no_label:
104 |             print('add label')
105 |             bilm.add_label_condition_nets(n_labels, args.bilm_unit)
106 |         else:
107 |             print('not using label')
108 |         chainer.serializers.load_npz(args.bilm, bilm)
109 |         with model.encoder.init_scope():
110 |             initialW = numpy.array(model.encoder.embed.W.data)
111 |             del model.encoder.embed
112 |             model.encoder.embed = bilm_nets.PredictiveEmbed(
113 |                 len(vocab), args.unit, bilm, args.dropout,
114 |                 initialW=initialW)
115 |             model.encoder.use_predict_embed = True
116 | 
117 |         model.encoder.embed.setup(
118 |             mode=args.bilm_mode,
119 |             temp=args.bilm_temp,
120 |             word_lower_bound=0.,
121 |             gold_lower_bound=0.,
122 |             gumbel=args.bilm_gumbel,
123 |             residual=args.bilm_residual,
124 |             wordwise=args.bilm_wordwise,
125 |             add_original=args.bilm_add_original,
126 |             augment_ratio=args.bilm_ratio,
127 |             ignore_unk=vocab['<unk>'])
128 | 
129 |     if args.gpu >= 0:
130 |         # Make a specified GPU current
131 |         chainer.cuda.get_device_from_id(args.gpu).use()
132 |         model.to_gpu()  # Copy the model to the GPU
133 |         model.xp.random.seed(args.seed)
134 |     chainer.CHAINER_SEED = args.seed
135 |     numpy.random.seed(args.seed)
136 | 
137 |     # Setup an optimizer
138 |     optimizer = chainer.optimizers.Adam(args.learning_rate)
139 |     optimizer.setup(model)
140 | 
141 |     # Set up a trainer
142 |     updater = training.StandardUpdater(
143 |         train_iter, optimizer,
144 |         converter=convert_seq, device=args.gpu)
145 | 
146 |     from triggers import FailMaxValueTrigger
147 |     stop_trigger = FailMaxValueTrigger(
148 |         key='validation/main/accuracy', trigger=(1, 'epoch'),
149 |         n_times=args.stop_epoch, max_trigger=args.epoch)
150 |     trainer = training.Trainer(
151 |         updater, stop_trigger, out=args.out)
152 | 
153 |     # Evaluate the model with the test dataset for each epoch
154 |     # VALIDATION SET
155 |     trainer.extend(MicroEvaluator(
156 |         test_iter, model,
157 |         converter=convert_seq, device=args.gpu))
158 | 
159 |     if args.validation:
160 |         real_test_iter = chainer.iterators.SerialIterator(
161 |             real_test, args.batchsize,
162 |             repeat=False, shuffle=False)
163 |         eval_on_real_test = MicroEvaluator(
164 |             real_test_iter, model,
165 |             converter=convert_seq, device=args.gpu)
166 |         eval_on_real_test.default_name = 'test'
167 |         trainer.extend(eval_on_real_test)
168 | 
169 |     # Take a best snapshot
170 |     record_trigger = training.triggers.MaxValueTrigger(
171 |         'validation/main/accuracy', (1, 'epoch'))
172 |     if args.save_model:
173 |         trainer.extend(extensions.snapshot_object(
174 |             model, 'best_model.npz'),
175 |             trigger=record_trigger)
176 | 
177 |     # Write a log of evaluation statistics for each epoch
178 |     out = Outer()
179 |     trainer.extend(extensions.LogReport())
180 |     if print_log:
181 |         trainer.extend(extensions.PrintReport(
182 |             ['epoch', 'main/loss', 'validation/main/loss',
183 |              'main/accuracy', 'validation/main/accuracy',
184 |              'test/main/loss', 'test/main/accuracy',
185 |              # 'elapsed_time']))
186 |              'elapsed_time']), trigger=record_trigger)
187 |     else:
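        # Quiet mode: route the report into the Outer buffer so the final
        # validation/test accuracies can be read back from `out` at the end.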
188 |         trainer.extend(extensions.PrintReport(
189 |             ['main/accuracy', 'validation/main/accuracy',
190 |              'test/main/accuracy'], out=out), trigger=record_trigger)
191 | 
192 |     # Print a progress bar to stdout
193 |     #trainer.extend(extensions.ProgressBar())
194 | 
195 |     # Run the training
196 |     trainer.run()
197 | 
198 |     # free all unused memory blocks "cached" in the memory pool
199 |     mempool = cupy.get_default_memory_pool()
200 |     mempool.free_all_blocks()
201 |     #print("val_acc:{}, test_acc:{}\n", out[-2], out[-1])
202 |     return float(out[-1])
203 | 
204 | if __name__ == '__main__':
205 |     main()
206 | 
--------------------------------------------------------------------------------
/triggers.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | 
3 | import chainer
4 | from chainer import cuda
5 | from chainer import function
6 | from chainer import reporter
7 | from chainer.training import util
8 | from chainer import utils
9 | from chainer.utils import type_check
10 | from chainer.training import triggers
11 | 
12 | 
13 | class FailBestValueTrigger(triggers.IntervalTrigger):
14 | 
15 |     """Trigger invoked when a specific value fails to become the best.
16 | 
17 |     Args:
18 |         key (str): Key of value.
19 |         compare (function): Compare function which takes current best value and
20 |             new value and returns whether new value is better than current
21 |             best.
22 |         trigger: Trigger that decides the comparison interval between current
23 |             best value and new value. This must be a tuple in the form of
24 |             ``<int>, 'epoch'`` or ``<int>, 'iteration'`` which is passed to
25 |             :class:`~chainer.training.triggers.IntervalTrigger`.
26 | 
27 |     """
28 | 
29 |     def __init__(self, key, compare, trigger=(1, 'epoch'),
30 |                  n_times=5, max_trigger=None, print_trigger=False):
31 |         self.period = max_trigger
32 |         self.unit = 'epoch'
33 | 
34 |         self._key = key
35 |         self._best_value = None
36 |         self._interval_trigger = util.get_trigger(trigger)
37 |         self._init_summary()
38 |         self._compare = compare
39 |         self._print_trigger = print_trigger
40 | 
41 |         self._n_times = n_times
42 |         self._n_fails = 0
43 | 
44 |         self._max_trigger = max_trigger
45 |         self._n_triggers = 0
46 | 
47 |     def __call__(self, trainer):
48 |         """Decides whether the extension should be called on this iteration.
49 | 
50 |         Args:
51 |             trainer (~chainer.training.Trainer): Trainer object that this
52 |                 trigger is associated with. The ``observation`` of this trainer
53 |                 is used to determine if the trigger should fire.
54 | 
55 |         Returns:
56 |             bool: ``True`` if the corresponding extension should be invoked in
57 |                 this iteration.
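                Concretely, ``True`` once the value has failed to improve on
                ``n_times`` consecutive checks, or once ``max_trigger``
                intervals have elapsed.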
58 | 
59 |         """
60 | 
61 |         observation = trainer.observation
62 |         summary = self._summary
63 |         key = self._key
64 |         if key in observation:
65 |             summary.add({key: observation[key]})
66 | 
67 |         if not self._interval_trigger(trainer):
68 |             return False
69 | 
70 |         stats = summary.compute_mean()
71 |         value = float(stats[key])  # copy to CPU
72 |         self._init_summary()
73 | 
74 |         self._n_triggers += 1
75 |         if self._n_triggers == 1:
76 |             return False
77 |         if self._max_trigger is not None \
78 |                 and self._n_triggers >= self._max_trigger:
79 |             return True
80 | 
81 |         if self._best_value is None or self._compare(self._best_value, value):
82 |             self._best_value = value
83 |             self._n_fails = 0
84 |             return False
85 |         self._n_fails += 1
86 |         if self._n_fails >= self._n_times:
87 |             return True
88 |         else:
89 |             return False
90 | 
91 |     def _init_summary(self):
92 |         self._summary = reporter.DictSummary()
93 | 
94 | 
95 | class FailMaxValueTrigger(FailBestValueTrigger):
96 |     def __init__(self, key, trigger=(1, 'epoch'), n_times=5, max_trigger=None):
97 |         super(FailMaxValueTrigger, self).__init__(
98 |             key, lambda max_value, new_value: new_value > max_value,
99 |             trigger, n_times, max_trigger)
100 | 
101 | 
102 | class FailMinValueTrigger(FailBestValueTrigger):
103 |     def __init__(self, key, trigger=(1, 'epoch'), n_times=5, max_trigger=None):
104 |         super(FailMinValueTrigger, self).__init__(
105 |             key, lambda min_value, new_value: new_value < min_value,
106 |             trigger, n_times, max_trigger)
107 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import division
3 | from __future__ import print_function
4 | import collections
5 | import io
6 | import json
7 | import os
8 | 
9 | import numpy as np
10 | import progressbar
11 | 
12 | import chainer
13 | from chainer import cuda
14 | from chainer.dataset import convert
15 | 
16 | class Outer(list):
17 |     def __init__(self):
18 |         list.__init__(self)
19 |     def write(self, item):
20 |         if '\n' not in item:
21 |             self.append(item.strip())
22 | 
23 | class UnkDropout(chainer.dataset.DatasetMixin):
24 |     def __init__(self, dataset, unk, ratio=0.01):
25 |         self.dataset = dataset
26 |         self.unk = unk
27 |         self.ratio = ratio
28 | 
29 |     def __len__(self):
30 |         return len(self.dataset)
31 | 
32 |     def get_example(self, i):
33 |         x, y = self.dataset[i]
34 |         if chainer.config.train:
35 |             rand = np.random.rand(x.size - 2) < self.ratio
36 |             # keep bos and eos
37 |             _x = x.copy()
38 |             _x[1:-1] = np.where(rand, self.unk, _x[1:-1])
39 |             return (_x, y)
40 |         return (x, y)
41 | 
42 | 
43 | def convert_xt_batch_seq(xt_batch_seq, gpu):
44 |     batchsize = len(xt_batch_seq[0])
45 |     seq_len = len(xt_batch_seq)
46 |     xt_batch_seq = np.array(xt_batch_seq, 'i')
47 |     # (bproplen, batch, 2)
48 |     xt_batch_seq = convert.to_device(gpu, xt_batch_seq)
49 |     xp = cuda.get_array_module(xt_batch_seq)
50 |     x_seq_batch = xp.split(
51 |         xt_batch_seq[:, :, 0].T.reshape(batchsize * seq_len),
52 |         batchsize, axis=0)
53 |     t_seq_batch = xp.split(
54 |         xt_batch_seq[:, :, 1].T.reshape(batchsize * seq_len),
55 |         batchsize, axis=0)
56 |     return x_seq_batch, t_seq_batch
57 | 
58 | 
59 | def count_words_from_file(counts, file_path):
60 |     bar = progressbar.ProgressBar()
61 |     for l in bar(io.open(file_path, encoding='utf-8')):
62 |         # TODO: parallel
63 |         if l.strip():
64 |             words = l.strip().split()
65 |             for word in words:
66 |                 counts[word] += 1
67 |     return counts
68 | 
69 | 
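# Editorial note: count_words below normalizes id-indexed token counts into
# frequencies and raises them to the power alpha (0.4 by default), in the
# spirit of the exponent-smoothed unigram distributions used for sampling.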
70 | def count_words(dataset, alpha=0.4):
71 |     counts = collections.defaultdict(int)
72 |     for w in dataset:
73 |         counts[w] += 1
74 |     counts = [counts[i] for i in range(len(counts))]
75 |     counts = np.array(counts, 'f')
76 |     counts /= counts.sum()
77 |     counts = counts ** alpha
78 |     counts = counts.tolist()
79 |     return counts
80 | 
81 | 
82 | def make_chain_dataset(file_path, vocab={}, update_vocab=False,
83 |                        chain_length=2):
84 |     dataset = []
85 |     chain = []
86 |     unk_id = vocab['<unk>']
87 | 
88 |     def make_array(chain):
89 |         array_chain = []
90 |         for words in chain:
91 |             tokens = []
92 |             for word in words:
93 |                 if update_vocab:
94 |                     if word not in vocab:
95 |                         vocab[word] = len(vocab)
96 |                 tokens.append(vocab.get(word, unk_id))
97 |             array_chain.append(np.array(tokens, 'i'))
98 |         return array_chain
99 | 
100 |     for line in io.open(file_path, encoding='utf-8'):
101 |         if not line.strip():
102 |             if len(chain) >= chain_length:
103 |                 dataset.append(make_array(chain))
104 |             chain = []
105 |             continue
106 |         words = line.strip().split() + ['<eos>']
107 |         chain.append(words)
108 |     if len(chain) >= chain_length:
109 |         dataset.append(make_array(chain))
110 |     return dataset, vocab
111 | 
112 | 
113 | def tokenize_text(file_path, vocab={}, update_vocab=False):
114 |     tokens = []
115 |     unk_id = vocab['<unk>']
116 |     with io.open(file_path, encoding='utf-8') as f:
117 |         for line in f:
118 |             words = line.split() + ['<eos>']
119 |             for word in words:
120 |                 if update_vocab:
121 |                     if word not in vocab:
122 |                         vocab[word] = len(vocab)
123 |                 tokens.append(vocab.get(word, unk_id))
124 |     return tokens, vocab
125 | 
126 | 
127 | def get_wikitext_words_and_vocab(
128 |         name='wikitext-2', base_dir='datasets', vocab=None):
129 |     assert(name in ['wikitext-2', 'wikitext-103'])
130 |     base_dir2 = os.path.join(base_dir, name)
131 |     predata_path = os.path.join(base_dir2, 'preprocessed_data.json')
132 |     if os.path.exists(predata_path) and vocab is None:
133 |         train, valid, test, vocab = json.load(open(predata_path))
134 |     else:
135 |         prepared_vocab = (vocab is not None)
136 |         if not prepared_vocab:
137 |             vocab = {'<eos>': 0, '<unk>': 1}
138 |         train, vocab = tokenize_text(
139 |             os.path.join(base_dir2, 'wiki.train.tokens'),
140 |             vocab, update_vocab=not prepared_vocab)
141 |         valid, _ = tokenize_text(
142 |             os.path.join(base_dir2, 'wiki.valid.tokens'),
143 |             vocab, update_vocab=False)
144 |         test, _ = tokenize_text(
145 |             os.path.join(base_dir2, 'wiki.test.tokens'),
146 |             vocab, update_vocab=False)
147 |         json.dump([train, valid, test, vocab], open(predata_path, 'w'))
148 |     return train, valid, test, vocab
149 | 
--------------------------------------------------------------------------------