├── .gitignore ├── src ├── coreset │ ├── __init__.py │ ├── random.py │ ├── herding.py │ ├── coreset_utils.py │ └── rank_dilm.py ├── distillation │ ├── distilled_data.py │ └── __init__.py ├── dataset_attrs.py └── utils.py ├── configs ├── train │ ├── generator │ │ ├── pretrained_qqp.yaml │ │ ├── pretrained_mnli.yaml │ │ └── pretrained_sst2.yaml │ ├── lm.yaml │ └── dc.yaml └── test │ ├── dc.yaml │ ├── lm.yaml │ └── coreset.yaml ├── requirements.txt ├── LICENSE └── DiLM-synthetic-data ├── sst2 └── dilm.dc │ ├── dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95 │ └── dataset │ │ ├── dataset_18.json │ │ ├── dataset_0.json │ │ ├── dataset_17.json │ │ ├── dataset_14.json │ │ ├── dataset_11.json │ │ ├── dataset_9.json │ │ ├── dataset_15.json │ │ ├── dataset_2.json │ │ ├── dataset_5.json │ │ ├── dataset_7.json │ │ ├── dataset_3.json │ │ ├── dataset_6.json │ │ ├── dataset_10.json │ │ ├── dataset_13.json │ │ ├── dataset_12.json │ │ ├── dataset_16.json │ │ ├── dataset_4.json │ │ ├── dataset_8.json │ │ ├── dataset_19.json │ │ └── dataset_1.json │ └── dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95 │ └── dataset │ ├── dataset_4.json │ ├── dataset_11.json │ ├── dataset_7.json │ ├── dataset_14.json │ ├── dataset_19.json │ ├── dataset_10.json │ ├── dataset_9.json │ ├── dataset_12.json │ ├── dataset_6.json │ ├── dataset_1.json │ ├── dataset_8.json │ ├── dataset_15.json │ ├── dataset_16.json │ ├── dataset_3.json │ ├── dataset_2.json │ ├── dataset_0.json │ ├── dataset_5.json │ ├── dataset_13.json │ ├── dataset_18.json │ └── dataset_17.json ├── qqp └── dilm.dc │ ├── 
dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95 │ └── dataset │ │ ├── dataset_18.json │ │ ├── dataset_19.json │ │ ├── dataset_5.json │ │ ├── dataset_10.json │ │ ├── dataset_15.json │ │ ├── dataset_1.json │ │ ├── dataset_11.json │ │ ├── dataset_12.json │ │ ├── dataset_7.json │ │ ├── dataset_9.json │ │ ├── dataset_0.json │ │ ├── dataset_16.json │ │ ├── dataset_13.json │ │ ├── dataset_14.json │ │ ├── dataset_17.json │ │ ├── dataset_2.json │ │ ├── dataset_3.json │ │ ├── dataset_8.json │ │ ├── dataset_6.json │ │ └── dataset_4.json │ └── dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95 │ └── dataset │ ├── dataset_11.json │ ├── dataset_19.json │ ├── dataset_5.json │ ├── dataset_17.json │ ├── dataset_13.json │ ├── dataset_4.json │ ├── dataset_8.json │ ├── dataset_16.json │ ├── dataset_10.json │ ├── dataset_14.json │ ├── dataset_7.json │ ├── dataset_1.json │ ├── dataset_9.json │ ├── dataset_3.json │ ├── dataset_12.json │ ├── dataset_15.json │ ├── dataset_6.json │ ├── dataset_18.json │ ├── dataset_0.json │ └── dataset_2.json └── mnli └── dilm.dc └── dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95 └── dataset └── dataset_4.json /.gitignore: -------------------------------------------------------------------------------- 1 | mlruns/ 2 | save/ 3 | data/ 4 | analysis/ 5 | 6 | .vscode/ 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /src/coreset/__init__.py: -------------------------------------------------------------------------------- 1 | from .coreset_base 
import CoresetConfig, CoresetModule 2 | 3 | __all__ = ["CoresetConfig", "CoresetModule"] 4 | -------------------------------------------------------------------------------- /configs/train/generator/pretrained_qqp.yaml: -------------------------------------------------------------------------------- 1 | pretrained_model_dir: save/train.gpt2.bert-base-uncased.qqp/dilm.lm/step_80000/generator 2 | checkpoint_name: last-ckpt 3 | -------------------------------------------------------------------------------- /configs/train/generator/pretrained_mnli.yaml: -------------------------------------------------------------------------------- 1 | pretrained_model_dir: save/train.gpt2.bert-base-uncased.mnli/dilm.lm/step_80000/generator 2 | checkpoint_name: last-ckpt 3 | -------------------------------------------------------------------------------- /configs/train/generator/pretrained_sst2.yaml: -------------------------------------------------------------------------------- 1 | pretrained_model_dir: save/train.gpt2.bert-base-uncased.sst2/dilm.lm/step_80000/generator 2 | checkpoint_name: last-ckpt 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # python 3.10.0 2 | torch==2.0.0+cu118 3 | transformers==4.30.0 4 | hydra-core==1.3.2 5 | mlflow==2.2.2 6 | scikit-learn==1.2.2 7 | datasets==2.18.0 8 | evaluate==0.4.0 -------------------------------------------------------------------------------- /src/distillation/distilled_data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class DistilledDataConfig: 6 | dpc: int 7 | n_dataset: int 8 | over_sample_ratio: float = 1.0 # if > 1.0, prune samples with k_center 9 | save_dataset_path: str = "path/to/save_dataset_dir" 10 | -------------------------------------------------------------------------------- 
/src/coreset/random.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from datasets import Dataset 4 | 5 | 6 | def random_selection(dataset: Dataset, dpc: int, seed: int) -> Dataset: 7 | random.seed(seed) 8 | 9 | assert len(dataset) >= dpc 10 | selected_sample_ids = random.sample(range(len(dataset)), dpc) 11 | 12 | return dataset.select(selected_sample_ids) 13 | -------------------------------------------------------------------------------- /src/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | from .distilled_data import DistilledDataConfig 2 | from .trainer_base import TrainConfig, TrainerBase 3 | from .trainer_dc import TrainerDC 4 | from .trainer_lm import TrainerLM 5 | 6 | __all__ = ["TrainerConfig", "get_trainer", "DistilledDataConfig"] 7 | 8 | TRAINER_CLASSES = { 9 | "lm": TrainerLM, 10 | "dc": TrainerDC, 11 | } 12 | 13 | 14 | def get_trainer( 15 | config: TrainConfig, distilled_data_config: DistilledDataConfig 16 | ) -> TrainerBase: 17 | assert config.train_type in TRAINER_CLASSES 18 | return TRAINER_CLASSES[config.train_type](config, distilled_data_config) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aru Maekawa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies 
or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_18.json: -------------------------------------------------------------------------------- 1 | {"sentence":"should be tried as a war criminal instead of a human volcano","labels":0} 2 | {"sentence":"can't say it's on par with the first one","labels":0} 3 | {"sentence":"is so downbeat and nearly humorless it doesn't even qualify as a spoof of such.","labels":0} 4 | {"sentence":"this ugly, revolting film does little that is actually funny with the material.","labels":0} 5 | {"sentence":"an inexpressible drudgery in which the only entertainment is a silly fluke that never gets off the ground.","labels":0} 6 | {"sentence":"is endlessly inventive and playful.","labels":1} 7 | {"sentence":"a stunning new young talent in one of chabrol's most intense psychological mysteries","labels":1} 8 | {"sentence":"it is smart and dark - a treat for all audiences","labels":1} 9 | {"sentence":"a fine, focused piece of work that effectively captures the dry wit of dry storytelling","labels":1} 10 | {"sentence":"a terrific date film","labels":1} 11 | 
-------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_0.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is too amateurishly square to work as storytelling, and the ensemble cast lacks depth and resonance.","labels":0} 2 | {"sentence":"is so lousy that you can not enjoy it","labels":0} 3 | {"sentence":"incredibly lifeless, with the lack-of-attention span","labels":0} 4 | {"sentence":"the script's contrived, lame screenplay and listless direction are just the ticket cost.","labels":0} 5 | {"sentence":"a cheap scam that only weak claims to dramatic impact and creepy-crawly humor.","labels":0} 6 | {"sentence":"is a wonderous accomplishment of veracity and narrative grace.","labels":1} 7 | {"sentence":"very best","labels":1} 8 | {"sentence":"a fully realized story with keen insights into parapsychological phenomena and the soulful nuances of the grieving process","labels":1} 9 | {"sentence":"it one of the best-sustained ideas i have ever seen on the screen.","labels":1} 10 | {"sentence":"a surprisingly sweet, tender drama that does a superb job contrasting the sleekness of the film's present with the playful paranoia of the film's past.","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_17.json: -------------------------------------------------------------------------------- 1 | {"sentence":"a shoddy product that does not live up to the 
exalted tagline","labels":0} 2 | {"sentence":"the trifecta of badness that is not even as daring and diverse a filmmaker as he thinks it is.","labels":0} 3 | {"sentence":"that manages to be even worse than its title","labels":0} 4 | {"sentence":"is an unimaginative screenwriter's invention that is completely lacking in execution and stylishness.","labels":0} 5 | {"sentence":"this sloppy, made-for-movie comedy special does no justice to the awfulness of the film and its surrounding pollution","labels":0} 6 | {"sentence":"a well-crafted psychological study of love and power","labels":1} 7 | {"sentence":"an enjoyable blend of dramatic action sequences and a haunting ode to humanity","labels":1} 8 | {"sentence":"it's a spirited film with a witty performance that deftly captures the dry wit that's so prevalent on the rock.","labels":1} 9 | {"sentence":"the funniest motion","labels":1} 10 | {"sentence":"is an intelligent romantic thriller of freshness that excites the imagination and tickles the funny bone.","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_14.json: -------------------------------------------------------------------------------- 1 | {"sentence":"a bad film that comes across as a complete waste of time.","labels":0} 2 | {"sentence":"a clich\u00e9d and shallow cautionary tale about the unsalvageability of being ugly to look at and not a hollywood product","labels":0} 3 | {"sentence":"is disgusting to begin with","labels":0} 4 | {"sentence":"that by the end it looks like some futile concoction that was developed hastily after oedekerk and his fellow moviemakers got through crashing a college keg party.","labels":0} 5 | {"sentence":"is so poorly paced it 
doesn't even qualify as a spoof of such.","labels":0} 6 | {"sentence":"a surprising and rewarding glimpse into the urban heart","labels":1} 7 | {"sentence":"a gripping coming-of-age drama with moments of sly humor and authentic warmth.","labels":1} 8 | {"sentence":"is unusual, food-for-thought cinema that's as entertaining as it is instructive.","labels":1} 9 | {"sentence":"this is a brilliant piece of filmmaking with an intriguing story of maternal instincts and misguided acts of affection for ailing grandmothers characters","labels":1} 10 | {"sentence":"a true delight for all audiences","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_11.json: -------------------------------------------------------------------------------- 1 | {"sentence":"misfiring, undermining the story's emotional thrust and distracting the misconceived final 5 minutes.","labels":0} 2 | {"sentence":"'s not going to be everyone's bag of popcorn","labels":0} 3 | {"sentence":"an unoriginal mess that even the guy from dahmer resorts to using stunt doubles and animal house reunions for laughs sake","labels":0} 4 | {"sentence":"boasts nothing but an unimaginative screenwriter's invention and a bull in a china shop, ensnaring its target audience in the act.","labels":0} 5 | {"sentence":"this clich\u00e9-riddled genre play is landlocked, witless, and devoid of anything resembling humor or even a decent story.","labels":0} 6 | {"sentence":"is a riveting profile of a modern israel in crisis.","labels":1} 7 | {"sentence":"a solid and refined piece of moviemaking","labels":1} 8 | {"sentence":"it actually provides a satisfying complete picture of this unique moment in american history.","labels":1} 9 | 
{"sentence":"a naturally funny, sweetly adventurous film that will keep you guessing at almost every turn.","labels":1} 10 | {"sentence":"a solid payoff for a satisfying evening at the multiplex","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_9.json: -------------------------------------------------------------------------------- 1 | {"sentence":"deteriorates into a terribly obvious melodrama and rough-hewn vanity project for lead actress andie macdowell.","labels":0} 2 | {"sentence":"this is a complete waste of time, money and celluloid.","labels":0} 3 | {"sentence":"a clich\u00e9 left unsaid","labels":0} 4 | {"sentence":"a loud, ugly, irritating comedy that never catches fire","labels":0} 5 | {"sentence":", the movie is essentially devoid of interesting characters or even a halfway intriguing plot.","labels":0} 6 | {"sentence":"it's an ambitious film that will thrill you with its subtly different tones of heart, humor and pathos.","labels":1} 7 | {"sentence":"is still funny, insightfully human and a delightful entree in the tradition of family fare that affirms the nourishing aspects of love and companionship","labels":1} 8 | {"sentence":"a solidly entertaining comedy that excites the imagination and tickles the funny bone","labels":1} 9 | {"sentence":"nifty premise","labels":1} 10 | {"sentence":"a masterpiece with a story that puts old-fashioned values under the microscope, and captures an extraordinary, original talent in one of chabrol's most intense psychological mysteries","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_15.json: -------------------------------------------------------------------------------- 1 | {"sentence":"-- where nothing's happening, everything's happening fast, and michele's personality becomes a caricature -- is a pale imitation.","labels":0} 2 | {"sentence":"a mess that is plainly dull and visually ugly when it isn't incomprehensible.","labels":0} 3 | {"sentence":"simply intrusive to the experience of being forty","labels":0} 4 | {"sentence":"is ultimately rather silly and overwrought, while never sure what its point is.","labels":0} 5 | {"sentence":"the uninspired scripts, acting and direction never rise above the level of an after-school tv special.","labels":0} 6 | {"sentence":"is gorgeously atmospheric meditation on life-changing chance encounters.","labels":1} 7 | {"sentence":"a worthy addition to the cinematic canon, this charming and evoking noir reminded us that the kind of lush, all-enveloping movie experience is possible with a real director's eye","labels":1} 8 | {"sentence":"a fully realized story that puts the dutiful efforts of more disciplined grade-grubbers to shame.","labels":1} 9 | {"sentence":"a refreshingly smart and newfangled variation","labels":1} 10 | {"sentence":"an interesting look at the rapidly changing face of fame","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_2.json: -------------------------------------------------------------------------------- 1 | {"sentence":"that plays 
like an extended dialogue exercise in retard 101 that takes no apparent joy and goes nowhere.","labels":0} 2 | {"sentence":"is as shallow and glib as tom green's pamela before her, just as imperious and sophomoric as a lost cause.","labels":0} 3 | {"sentence":"that if the movie isn't as beautifully shaped and as delicately calibrated in tone as it is, you can't shake the feeling that it was intended to be a different kind of film.","labels":0} 4 | {"sentence":"too short of an attention span","labels":0} 5 | {"sentence":"a terrible movie without any of its satirical salvos hitting.","labels":0} 6 | {"sentence":"'s a work both refreshingly different and reassuringly familiar.","labels":1} 7 | {"sentence":"just adorable","labels":1} 8 | {"sentence":"has inventive moments and an often engaging story","labels":1} 9 | {"sentence":"a winning comedy that excites the imagination and tickles the funny bone","labels":1} 10 | {"sentence":"also an exceptionally moving portrait of an intensely lived time, filled with nervous energy, moral ambiguity and great uncertainties that cross our hearts in the same way that each new movie tells a different story.","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_5.json: -------------------------------------------------------------------------------- 1 | {"sentence":"less funny than it probably should be","labels":0} 2 | {"sentence":"it's not scary in the slightest and a cheap scam that does no justice to either effort in three hours of screen time.","labels":0} 3 | {"sentence":"is not a compelling piece of moviemaking.","labels":0} 4 | {"sentence":"a woefully hackneyed movie that plays like some weird masterpiece theater sketch with 
neither a point of view, nor a compelling reason for being.","labels":0} 5 | {"sentence":"the story is even more ludicrous than you 'd expect from the guy-in-a-dress genre, which also seems to play on a 10-year delay.","labels":0} 6 | {"sentence":"a very sweet, sensitive tale of maternal instincts and misguided acts of affection that is more accurate than anything i have seen in an american film.","labels":1} 7 | {"sentence":"is a provocative piece of work that reopens an interesting controversy and never succumbs to sensationalism","labels":1} 8 | {"sentence":"from a cast that brings them to life, here's a must for genre fans.","labels":1} 9 | {"sentence":"the most thoughtful fictional examination","labels":1} 10 | {"sentence":"a genuine love story that puts old-fashioned values under the microscope","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_7.json: -------------------------------------------------------------------------------- 1 | {"sentence":", it is utterly misplaced in its message and lack of purpose in its stylish trimming of an already overused concept.","labels":0} 2 | {"sentence":"the worst kind of hubristic folly","labels":0} 3 | {"sentence":"the story is so overripe that it doesn't improve upon the experience of staring at a blank screen","labels":0} 4 | {"sentence":"is so poorly paced it doesn't even qualify as a spoof of such.","labels":0} 5 | {"sentence":"an inexcusable dim-witted pairing of teen-speak and animal gibberish that only seems to care about the bottom line.","labels":0} 6 | {"sentence":"is so intimate and sensual and funny and psychologically self-revealing that it makes up for in heart what it lacks in outright 
newness.","labels":1} 7 | {"sentence":"an engrossing entertainment of flavors and emotions","labels":1} 8 | {"sentence":"a true delight for all audiences","labels":1} 9 | {"sentence":"is a gorgeous film to experience, full of the sounds of the world trade center tragedy and the emotional shadow of that tragedy together with a haunting sense of malaise.","labels":1} 10 | {"sentence":"a sophisticated and engaging film about the catalytic effect a feel-good movie can have upon a family.","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_3.json: -------------------------------------------------------------------------------- 1 | {"sentence":"it's not nearly as fresh or enjoyable as its predecessor.","labels":0} 2 | {"sentence":"every plot contrivance that the clich\u00e9-riddled genre couldn't be better suited to a night at the multiplex","labels":0} 3 | {"sentence":"a disaster of a story with zero closure and zero aspirations to entertain or entertain the visually dumbed-down crowd.","labels":0} 4 | {"sentence":"it is not enough to give the film the substance it so desperately needs -- a droll social realism and a hail of bullets that don't miss.","labels":0} 5 | {"sentence":"is essentially juiceless and uncreative without any of its satirical or social message.","labels":0} 6 | {"sentence":", this imaginative director's versatile use of music and images gives a first-class, thoroughly involving b movie that will leave fans clamoring for another ride.","labels":1} 7 | {"sentence":"offers signs of life and small delights thanks to the actors welling up in the revelatory material.","labels":1} 8 | {"sentence":"a directorial tour de force, with an eye on preserving an 
old-fashioned sense of storytelling","labels":1} 9 | {"sentence":"a true cinematic knack","labels":1} 10 | {"sentence":"a beguiling evocation of the univac-like ethos that underlies the best of comedies","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_6.json: -------------------------------------------------------------------------------- 1 | {"sentence":"somewhere inside its fabric is a wishy-washy melodramatic version of `` based on a bad idea just came from a bad movie, starring the mediocre scooby again, just because it seems obligatory.","labels":0} 2 | {"sentence":"with a faulty premise, a film that is impostorless and does not have much eye-catching photography.","labels":0} 3 | {"sentence":"a mere plot pawn for two directors with far less endearing disabilities","labels":0} 4 | {"sentence":"is a negligible work of manipulation, an exploitation piece doing its usual worst to guilt-trip parents.","labels":0} 5 | {"sentence":"some of the actors would be undogmatic about admitting this movie is almost completely lacking in suspense, surprise and consistent emotional conviction, but it is almost entirely lacking in substance and believable subplots.","labels":0} 6 | {"sentence":"this is a winning family film that excites the imagination and tickles the funny bone.","labels":1} 7 | {"sentence":"is a thoroughly entertaining celebration of its sounds and images","labels":1} 8 | {"sentence":"powerful and absorbing look","labels":1} 9 | {"sentence":"a nice treat for all audiences","labels":1} 10 | {"sentence":"a smart, provocative drama that manages to find greatness in the hue of its drastic iconography.","labels":1} 11 | 
-------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_10.json: -------------------------------------------------------------------------------- 1 | {"sentence":"'s not going to be everyone's bag of popcorn","labels":0} 2 | {"sentence":"it's not a brilliant piece of filmmaking, but it is difficult to shrug off the annoyance of that chatty fish after an hour and a half of vaguely interesting but nothing-there routine.","labels":0} 3 | {"sentence":"a weak and ineffective ghost story without any of the pleasure of its lurid fable","labels":0} 4 | {"sentence":"is too goofy to maintain interest during the long build-up of expository material.","labels":0} 5 | {"sentence":"barely distinguish one sci-fi work from another, with cliches that are repeatedly undercut by the flat acting, dull exposition and murky cinematography.","labels":0} 6 | {"sentence":"makes up for in heart what it lacks in outright newness","labels":1} 7 | {"sentence":"( the film ) is fully formed and remarkably assured, delivering an elegant and highly pleasurable experience that is magnetic for its moodiness and quality of delivery.","labels":1} 8 | {"sentence":"a film of intense character study about newcomers in a strange new world.","labels":1} 9 | {"sentence":"the film is impressive for the sights and sounds of the wondrous beats the world has to offer.","labels":1} 10 | {"sentence":"the special qualities that make it worth checking out at theaters","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_13.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is tantamount to insulting the intelligence of anyone who hasn't been living under a rock ( since sept. 11 ) and looking for a return ticket to realism.","labels":0} 2 | {"sentence":"it seems an impossible task to balance all the formulaic equations in the long-winded heist comedy looking for a return ticket to realism.","labels":0} 3 | {"sentence":", the film winds up merely pretentious -- in a grisly sort of way.","labels":0} 4 | {"sentence":"the worst kind of hubristic folly, the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy","labels":0} 5 | {"sentence":"an inexpressible bore whose valuable messages are forgotten 10 minutes after the last trombone","labels":0} 6 | {"sentence":"brilliant combination act","labels":1} 7 | {"sentence":", intriguing and honorable, the film is well worthwhile.","labels":1} 8 | {"sentence":"is a fine, focused piece of work that reopens an interesting controversy and never succumbs to sensationalism.","labels":1} 9 | {"sentence":"a fine, focused piece of work that excites the imagination and tickles the funny bone","labels":1} 10 | {"sentence":"a charming, banter-filled comedy with witty dialogue and a skillful cast that captures the french coming-of-age in a unique and entertaining fashion","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_12.json: -------------------------------------------------------------------------------- 1 | {"sentence":"the sorry state of the entire cast, with subplots involving the various silbersteins that it feels more like the pilot episode of a tv series than a feature film.","labels":0} 2 | {"sentence":"is so overwrought and derivative that the real issues are too simplistic to be interesting in a 90-minute movie that is definitely meaningless, vapid and devoid of substance.","labels":0} 3 | {"sentence":", dumb comedy rarely comes alive as its own fire-breathing entity in this picture.","labels":0} 4 | {"sentence":"the worst sin of attributable to a movie like this","labels":0} 5 | {"sentence":"a stiflingly unfunny, largely unfunny sandlerian stylist's invention, a mishmash that nearly wipes out the possibility of any story reaching any new audience.","labels":0} 6 | {"sentence":"a true cinematic knack","labels":1} 7 | {"sentence":"a refreshingly smart and newfangled variation","labels":1} 8 | {"sentence":"gives a perfect performance that captures the innocence and budding demons within a wallflower.","labels":1} 9 | {"sentence":"at 24 the film is a treat - a charming, banter-filled comedy that proves you can run away from home, but your ego can take you anywhere.","labels":1} 10 | {"sentence":"a gripping documentary that places the good-time shenanigans in welcome perspective.","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_16.json: -------------------------------------------------------------------------------- 1 | {"sentence":"that as a director washington demands and receives excellent performances, but it is really an excuse to get hold of this lame kiddie flick, a questionable kind of under-inspired dimwitsville where the only holes are found in the characters'moves.","labels":0} 2 | {"sentence":", irresponsible, hypocritical work that plays like a loosely-connected string of acting-workshop exercises","labels":0} 3 | {"sentence":"making a shallow rumination on the emptiness of success almost impossible","labels":0} 4 | {"sentence":"an excruciating demonstration of the unsalvageability of a movie saddled with an amateurish screenplay designed to garner the bare bones of byatt's uninspired philosophy.","labels":0} 5 | {"sentence":"is a complete waste of time.","labels":0} 6 | {"sentence":"a clever script and inventive moments","labels":1} 7 | {"sentence":"an interesting slice of history that would have been better off staying on the festival circuit","labels":1} 8 | {"sentence":"is a comic gem that relays the tale's undeniable emotional thrust without stooping to base melodrama.","labels":1} 9 | {"sentence":"a shrewd and effective film for young or old alike.","labels":1} 10 | {"sentence":"though many can aspire but none can equal, this is a film that delivers on the promise of excitement.","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_4.json: -------------------------------------------------------------------------------- 1 | {"sentence":"this short of a spoof, this is the opposite of a truly magical movie","labels":0} 2 | {"sentence":"a movie crammed with movie references that are repeatedly undercut by the brutality of the jokes, most at women's expense.","labels":0} 3 | {"sentence":"the movie's contrived, lame screenplay and listless direction leaves scant place for the viewer to really learn what makes wilco a deeply unpleasant experience.","labels":0} 4 | {"sentence":"the worst sin of attributable to a movie like this","labels":0} 5 | {"sentence":"is far too sentimental and out of place in what could have ( and probably should have ) been a lighthearted comedy","labels":0} 6 | {"sentence":"a complex story","labels":1} 7 | {"sentence":"a vibrant whirlwind of love, family and all that goes with it, is a film full of grace and purpose, one that is filled with humorous observations about the general absurdity of modern life as seen through the eyes outsiders, particularly those in urban south korea.","labels":1} 8 | {"sentence":"pure composition and form with a lyrical metaphor for the modern masculine journey","labels":1} 9 | {"sentence":"is a delightfully unpredictable, hilarious comedy that deserves more than a passing twinkle.","labels":1} 10 | {"sentence":"a good film that deserves recommendation","labels":1} 11 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_8.json: -------------------------------------------------------------------------------- 1 | {"sentence":"a slice of counterculture that might be best forgotten","labels":0} 2 | {"sentence":"is a complete waste of time.","labels":0} 3 | {"sentence":"a selection of scenes that are listless, witless, and devoid of anything resembling humor","labels":0} 4 | {"sentence":"about killing time, only for its visual gags to get in the way and stop telling us anything interesting except that kung pow is a downright hitchcockian -- an excuse to get to the closing bout... by which time it's impossible to care who wins.","labels":0} 5 | {"sentence":"the grey zone won't fly with most intelligent viewers, and cartoonists who would be well to abandon the theater for a more hackneyed movie instead.","labels":0} 6 | {"sentence":"is well-intentioned, and well-crafted film.","labels":1} 7 | {"sentence":"a fine job of updating white's dry wit to a new age","labels":1} 8 | {"sentence":"a solid, anguished performance that effortlessly draws you in without feeling pressure to rush through the intermediary passages, which pop up often in my book selection as an alternative history lesson.","labels":1} 9 | {"sentence":"very compelling coming-of-age drama","labels":1} 10 | {"sentence":"it has rewards, and the magnificent swooping aerial shots make this an exhilarating film for kids and adults alike.","labels":1} 11 | -------------------------------------------------------------------------------- /src/dataset_attrs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset attributes for loading and processing datasets. 
# Static per-task configuration for the three GLUE tasks handled by this
# project. Every entry exposes the same set of keys; the code that consumes
# them lives outside this block.
DATASET_ATTRS = {
    "sst2": {
        "load_args": ("glue", "sst2"),
        "sentence_keys": ("sentence",),
        "label_key": "label",
        "problem_type": "single_label_classification",
        "test_split_key": "validation",
        "num_labels": 2,
        "metric_args": ("glue", "sst2"),
        "max_length": 68,
        "metric_key": "accuracy",
        "label_dict": {0: "negative", 1: "positive"},
    },
    "mnli": {
        "load_args": ("glue", "mnli"),
        "sentence_keys": ("premise", "hypothesis"),
        "label_key": "label",
        "problem_type": "single_label_classification",
        "test_split_key": "validation_matched",
        "num_labels": 3,
        "metric_args": ("glue", "mnli"),
        "max_length": 421,
        "metric_key": "accuracy",
        "label_dict": {0: "entailment", 1: "neutral", 2: "contradiction"},
    },
    "qqp": {
        "load_args": ("glue", "qqp"),
        "sentence_keys": ("question1", "question2"),
        "label_key": "label",
        "problem_type": "single_label_classification",
        "test_split_key": "validation",
        "num_labels": 2,
        "metric_args": ("glue", "qqp"),
        "max_length": 313,
        "metric_key": "combined_score",
        "label_dict": {0: "unequal", 1: "equal"},
    },
}

logger = logging.getLogger(__name__)


@torch.no_grad()
def herding(
    dataset: Dataset,
    dpc: int,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    sentence_keys: list[str],
):
    """Pick ``dpc`` examples from ``dataset`` by greedy herding on embeddings.

    At step ``i`` the not-yet-selected example is chosen whose embedding,
    added to the sum of the already selected embeddings, lies closest (L2) to
    ``(i + 1) * mean`` of all embeddings.

    Args:
        dataset: Examples to choose from; must contain at least ``dpc`` rows.
        dpc: Number of examples to select (presumably "data per class" --
            confirm with the caller).
        model: Encoder handed to ``get_embeddings``.
        tokenizer: Tokenizer handed to ``get_embeddings``.
        sentence_keys: Column names handed to ``get_embeddings``.

    Returns:
        ``dataset.select(...)`` over the chosen row indices, listed in
        ascending index order (not in the order they were picked).

    Raises:
        AssertionError: If ``dataset`` has fewer than ``dpc`` rows or an
            internal bookkeeping invariant is violated.

    Note:
        Requires a CUDA device: embeddings and bookkeeping tensors are placed
        on ``"cuda"`` explicitly.
    """
    assert len(dataset) >= dpc

    embeddings = get_embeddings(dataset, model, tokenizer, sentence_keys)
    embeddings = embeddings.cuda()

    indices = torch.arange(embeddings.size(0), device="cuda")
    select_results = torch.zeros(len(dataset), dtype=torch.bool, device="cuda")

    mean_original = embeddings.mean(0)

    logger.info("Selecting samples with herding")
    for i in range(dpc):
        # Reduce on the tensor itself: the builtin ``sum`` would walk the mask
        # one element at a time in Python on every step.
        assert int(select_results.sum()) == i
        if i == 0:
            sum_selected = torch.zeros_like(mean_original).unsqueeze(0)
        else:
            sum_selected = embeddings[indices[select_results]].sum(0, keepdim=True)

        # Row ids of the candidates still available at this step; computed
        # once and reused for both the distance and the final lookup.
        remaining = indices[~select_results]
        dists = l2_dist(
            sum_selected + embeddings[remaining],
            mean_original * (i + 1),
        )
        select_results[remaining[dists.argmin().item()]] = True

    selected_indices = indices[select_results].tolist()
    assert len(selected_indices) == dpc
    return dataset.select(selected_indices)
treat","labels":1} 7 | {"sentence":"the film has a terrific look and salma hayek is a welcome relief from hollywood fluff to show us a slice of life that puts the kibosh on a seasonal holiday in unexpected places.","labels":1} 8 | {"sentence":"is a visual treat for all audiences -- a delightful comedy with charming dialogue and funny performances.","labels":1} 9 | {"sentence":"a remarkable film that reveals the ways in which a sultry evening or a beer-fueled afternoon in the sun can inspire even the most retiring heart to venture forth.","labels":1} 10 | {"sentence":"a beguiling freshness, imagination and insight into a subculture whose vibrant creative energy, history and","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_1.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is so amateurish, incoherent, and downright goofy that few will bother thinking it all through.","labels":0} 2 | {"sentence":"the worst sin of attributable to a movie like this","labels":0} 3 | {"sentence":"the worst kind of hollywood heart-string plucking, the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy about its execution and lack of narrative discipline.","labels":0} 4 | {"sentence":"an awkward hybrid of the sophomoric and the gross-out comedy.","labels":0} 5 | {"sentence":"a film so insecure about its capacity to excite that it churns up not one but two flagrantly fake thunderstorms to underscore the action.","labels":0} 6 | {"sentence":"a first-class, thoroughly involving b movie that effectively combines two surefire, beloved genres -- the prison flick and the fight film.","labels":1} 7 | 
def batch_to_cuda(batch: dict[str, torch.Tensor] | BatchEncoding):
    """Return a plain dict holding every value of ``batch`` moved to CUDA."""
    return {k: v.cuda() for k, v in batch.items()}


def l2_dist(src: torch.Tensor, tgt: torch.Tensor):
    """Compute the L2 distance between each row of ``src`` and ``tgt``.

    Args:
        src (torch.Tensor): Source tensor of shape (n, d)
        tgt (torch.Tensor): Target tensor of shape (d,)
    Returns:
        dists (torch.Tensor): L2 distance of shape (n,)
    """
    return (src - tgt.unsqueeze(0)).pow(2).sum(1).pow(0.5)


def get_embeddings(
    dataset: Dataset,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    sentence_keys: list[str],
    batch_size: int = 256,
) -> torch.Tensor:
    """Compute embeddings of training examples with encoder model.

    Side effects: ``model`` is moved to CUDA and switched to eval mode.

    Args:
        dataset: Examples to embed; must provide every column in
            ``sentence_keys``.
        model: Encoder called with ``output_hidden_states=True``.
        tokenizer: Tokenizer applied to the selected columns (padding and
            truncation enabled).
        sentence_keys: Column names passed positionally to the tokenizer.
        batch_size: Number of examples per ``dataset.map`` batch.

    Returns:
        A CPU tensor built from the ``"embedding"`` column, one row per
        example. Each row is position 0 along the second axis of the last
        hidden state (the first token, assuming the usual
        (batch, seq, hidden) layout -- confirm for non-standard encoders).
    """

    model.cuda()
    model.eval()

    def _get_embedding(batch):
        sentences = tuple(batch[key] for key in sentence_keys)
        inputs = tokenizer(
            *sentences, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.inference_mode():
            # Device-generic autocast entry point; replaces the deprecated
            # ``torch.cuda.amp.autocast``.
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(**batch_to_cuda(inputs), output_hidden_states=True)
            # NOTE(review): cast defensively to float32 so the stored column
            # does not depend on the dtype autocast leaves the last layer in.
            embeddings = outputs.hidden_states[-1][:, 0].float().cpu()
        return {"embedding": embeddings}

    embed_dataset = dataset.map(_get_embedding, batched=True, batch_size=batch_size)

    return torch.tensor(embed_dataset["embedding"])
If yes, what will it make?","labels":1} 8 | {"question1":"How do I stop excessive masturbation?","question2":"How can I stop masturbating?","labels":1} 9 | {"question1":"How do I know if my college of pharmacy in India is good for me?","question2":"How do I know whether my college of pharmacy in India is good for me?","labels":1} 10 | {"question1":"How do I find who started a Quora account?","question2":"How can I find out who started a Quora account?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_19.json: -------------------------------------------------------------------------------- 1 | {"question1":"How should I change my laptop web browser style?","question2":"How should I change the style of a laptop?","labels":0} 2 | {"question1":"Can the lobes from a dragonfly\/hoax incubate into the foetal state?","question2":"Why did Michael Van Skooge fail in Babs?","labels":0} 3 | {"question1":"How is electronic circuit design supported?","question2":"What do circuit breakers and electronic components do?","labels":0} 4 | {"question1":"\"How do I find out what the \"\"part time\" hours are in my colleges \"\"students at summertime\"\" hours?\"","question2":"What type of numbers (frac5-frac2-frac5-frac5) are present on the periodic table?","labels":0} 5 | {"question1":"How many people per day (non-professionals) in the city of Mill Hall in London, ON make it their home?","question2":"What is the number of people a single person need to visit in a day?","labels":0} 6 | {"question1":"What is the password for a homemade live wallpaper or any of the original wallpaper\/papers?","question2":"What is the password to a homemade live wallpaper or any of the original 
wallpaper\/papers?","labels":1} 7 | {"question1":"How do you lose weight in a short time?","question2":"How do I lose weight in a short time?","labels":1} 8 | {"question1":"How can I start online trading?","question2":"How can I start online trading?","labels":1} 9 | {"question1":"Why do some people think Earth is flat when it is not?","question2":"Why do some people think Earth is flat when we have never seen it?","labels":1} 10 | {"question1":"How do I read real time?","question2":"How do I read real time?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_5.json: -------------------------------------------------------------------------------- 1 | {"question1":"How important is long distance relationship in urology?","question2":"What are the qualities of a positive long distance relationship?","labels":0} 2 | {"question1":"What is the relationship between thermal load and electromotive force?","question2":"How can I do CS in zoology with better academic advantages?","labels":0} 3 | {"question1":"Which brand of radios are best to buy from abroad or in Asia?","question2":"Which brand of watches is best to buy in the market as a renter?","labels":0} 4 | {"question1":"Are isometric bases considered wrong in core stability?","question2":"What are the references I need to remember in campus environment in hotel after audit?","labels":0} 5 | {"question1":"How do I make WhatsApp faster when using Redmi Note 3 phones?","question2":"How can I have a separate phone and to restore a Redmi Note 3 when I don't have a Redmi Note 3 network?","labels":0} 6 | {"question1":"Why does our mind keep thinking about things that we already know?","question2":"Why does our mind keep 
thinking about things that we already know?","labels":1} 7 | {"question1":"How do I get my tattoos fixed?","question2":"How do I get tattoos fixed?","labels":1} 8 | {"question1":"How can I get traffic for my website?","question2":"How can I get traffic for my website?","labels":1} 9 | {"question1":"How do I learn a new language quickly?","question2":"How can I learn to learn a new language faster?","labels":1} 10 | {"question1":"What is the best mind-blowing smart phone under INR 15,000?","question2":"What is the best mind blowing smartphone under INR 15,000?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_10.json: -------------------------------------------------------------------------------- 1 | {"question1":"How long does it take to get an ice cube?","question2":"Does the ice cube takes time to arrive?","labels":0} 2 | {"question1":"What is the first software and programming language to learn in Palo Alto, California?","question2":"What is the cheapest software to learn in Palo Alto, California?","labels":0} 3 | {"question1":"What is an electrical testing technique used for?","question2":"How do I hack a live electric test simulation?","labels":0} 4 | {"question1":"I want to review a book at the end of the book. 
Can someone tell me the review now?","question2":"How can I store the old notes of an album in a classical keyboard book?","labels":0} 5 | {"question1":"What are the primary\/primary symptoms of a pithopus perch?","question2":"How do I start a grocery store with high selling goods of different types?","labels":0} 6 | {"question1":"What is it like to live alone?","question2":"What is it like to live alone in person?","labels":1} 7 | {"question1":"How do you learn to love a stranger?","question2":"How do I love or learn to love a stranger?","labels":1} 8 | {"question1":"Why do we feel pain when somebody touches our vagina?","question2":"Why do we feel pain when someone touches our vagina?","labels":1} 9 | {"question1":"What is the difference between Gmail and Yahoo Mail?","question2":"What is the difference between Gmail and Yahoo Mail?","labels":1} 10 | {"question1":"What are your views on the 500 & 1000 rupee notes ban in India? What are the pros and cons of it?","question2":"What are your views on the ban on 500 and 1000 rupee notes in India? 
Pros and cons?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_15.json: -------------------------------------------------------------------------------- 1 | {"question1":"Which is the most important thing in life?","question2":"Is it necessary to have good working memory to make a C# website for a very long time?","labels":0} 2 | {"question1":"Why do we need to put iPad (or iPod nano) in a phone?","question2":"Why do I need to put my earphones in front of my eyes?","labels":0} 3 | {"question1":"What is something you should never say to an older colleague when you discuss financial sales?","question2":"What is something you should never say to an older colleague when discussing business process?","labels":0} 4 | {"question1":"How long should a thief's eye get for using a scintillating knife?","question2":"Can I take bleach along with the toothpaste for a toothpaste overdose?","labels":0} 5 | {"question1":"Which countries don't have any WhatsApp?","question2":"Why doesn't WhatsApp have any website where we can easily download videos or pictures? Is it a waste of space?","labels":0} 6 | {"question1":"What will happen if Donald Trump wins?","question2":"What will happen if Donald Trump is elected president?","labels":1} 7 | {"question1":"How do you know when you're in love with someone?","question2":"How do you know when you are in love with someone?","labels":1} 8 | {"question1":"How do I contact Google in India?","question2":"How do I contact Google in India?","labels":1} 9 | {"question1":"Does cellulose help to keep hair longer? If yes, how?","question2":"Does cellulose help to keep hair longer? 
If yes how?","labels":1} 10 | {"question1":"What is the best way to avoid procrastinating?","question2":"How can I avoid procrastination?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_1.json: -------------------------------------------------------------------------------- 1 | {"question1":"I want to connect the mouse in an HDFC dock to my HDMI port in laptops without using the HDMI port. How do I do it?","question2":"Can I use the laptop's HDMI port on a TV connected to HDMI cable without changing the software?","labels":0} 2 | {"question1":"What is \"\"right the choice to live\"\" and what is it about it that can make us choose to live?","question2":"Where can I get mid pune brand road touring wallets in france?","labels":0} 3 | {"question1":"How can I make 2D scenes of animals in 3D without modifying the image?","question2":"Do interest groups work?","labels":0} 4 | {"question1":"How is a family changing in North Dakota?","question2":"How is a family changing in Arizona?","labels":0} 5 | {"question1":"What is it like to develop intelligence (without significant schooling)?","question2":"Why do people care so much about IQ?","labels":0} 6 | {"question1":"What do you do when you feel like sleeping a full night?","question2":"What do you do when you feel like sleeping full nights?","labels":1} 7 | {"question1":"How can I earn money online?","question2":"How do I earn money online from home?","labels":1} 8 | {"question1":"How can I get a database for my apps?","question2":"How do I get a database of my apps?","labels":1} 9 | {"question1":"Why is cricket always a team sport? Is it because of population or talent?","question2":"Why cricket always a team sport? 
Is it because of population or talent?","labels":1} 10 | {"question1":"What is the most important business in the world and why?","question2":"What is the most important business business in the world? And why?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_11.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is the cost of traditional metals?","question2":"What is the cost of someone wearing a traditional polo shirt?","labels":0} 2 | {"question1":"What is Code fashion with kbeauty?","question2":"Is it true that the 10 x 10 flag used in Hebrew is a geometric symbol rather than a geometric symbol?","labels":0} 3 | {"question1":"What type of people does the police have?","question2":"I am getting better grades in school. But I still don't feel confident. 
How should I change it?","labels":0} 4 | {"question1":"What are some mind-blowing gadgets tools that most people don't know about?","question2":"What are some mind-blowing personal computers tools that exist that most people don't know about?","labels":0} 5 | {"question1":"How does Quora handle a misspelled username, date, etc.?","question2":"How do I reset my Quora account password by email?","labels":0} 6 | {"question1":"What does one learn in college?","question2":"What are the ways one must learn from one's college?","labels":1} 7 | {"question1":"Why do some people live in the sky after sunset?","question2":"Why do some people live in the sky after sunset?","labels":1} 8 | {"question1":"What are some tips on making it through the job interview process at American Resources?","question2":"What are some tips on making it through the job interview process at American Resources?","labels":1} 9 | {"question1":"How do you get rid of a greasy tongue?","question2":"How do I get rid of a greasy tongue?","labels":1} 10 | {"question1":"How do you get your e-golden account back on PayPal?","question2":"How do I get my PayPal account back?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_12.json: -------------------------------------------------------------------------------- 1 | {"question1":"How do I connect IR system on a Poweradvisor project to my Raspberry Pi?","question2":"How can I configure a Raspberry Pi and connect it to my 3G router using Ethernet?","labels":0} 2 | {"question1":"How do I give treatment for radiostasis in rats?","question2":"What are some examples of inorganic compounds? 
How do they are used?","labels":0} 3 | {"question1":"What is the first programming language I learn if I don't have a computer?","question2":"Which programming language do people start their own after doing a degree?","labels":0} 4 | {"question1":"What is the difference between plasticizers and ovens?","question2":"What's the difference between a tent and a microwave oven?","labels":0} 5 | {"question1":"How do you produce a picture of two moving masses due to a moving object (such as wind)?","question2":"How can I use a wine bottle in a news source?","labels":0} 6 | {"question1":"How do I lose weight without stopping?","question2":"How do I lose weight without quitting?","labels":1} 7 | {"question1":"Which is the best book on machine learning?","question2":"What is the best book on machine learning?","labels":1} 8 | {"question1":"What is the best way to learn things in a day?","question2":"How do I learn things in a day?","labels":1} 9 | {"question1":"Where can I find best place to rent an apartment in Vicksburg, West Bengal?","question2":"Where can I find best place to rent an apartment in Vicksburg, West Bengal?","labels":1} 10 | {"question1":"Where is the bank that holds the biggest shares in Google Google+?","question2":"Where is the bank that holds the largest share in Google+?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_7.json: -------------------------------------------------------------------------------- 1 | {"question1":"How does one create a workbook?","question2":"How can I create a working book\/weekly workbook for Reliance Jio?","labels":0} 2 | {"question1":"In the train, where was the alligator penis observed?","question2":"My pants keep getting 
buckled around my waist. Why does it happen?","labels":0} 3 | {"question1":"What is happening in office politics in India?","question2":"What is happening in office politics in China?","labels":0} 4 | {"question1":"My questions about Uber seem to have some incidents. But, no one is answering the questions. Is there a way to control my questions?","question2":"Is it possible to remove questions from Quora?","labels":0} 5 | {"question1":"How do you describe Flutter Bros. Forever To Reject Real Life Life Existence?","question2":"How can I let the computer crash for me if I remember everything in my head?","labels":0} 6 | {"question1":"How do I increase the traffic to a blog?","question2":"How can I increase the traffic to my blog page?","labels":1} 7 | {"question1":"What are some good things you can do with a love life?","question2":"What are some good things do I can do with a love life?","labels":1} 8 | {"question1":"How are car networks promoted online in India? How is online promotion organised?","question2":"How are car networks promoted online in India? 
How is online promotion organised?","labels":1} 9 | {"question1":"How can I learn math better?","question2":"How can I learn math for real?","labels":1} 10 | {"question1":"What is it like to go through the OyoDoor test at Algorithms?","question2":"What it is like to go through the oyodoor test at Algorithms?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_9.json: -------------------------------------------------------------------------------- 1 | {"question1":"Can I sell my bookmarks in Neocube to bookmarked publishers?","question2":"What is the difference between bookmarks and marks on Neocube and my bookmarked edits?","labels":0} 2 | {"question1":"How common is it for some Indian men to have boyfriends after marriage?","question2":"How common is it for some Indian men to have a girlfriend after marriage?","labels":0} 3 | {"question1":"What is America's political correctness about?","question2":"What are some good ideas for jokes to impress a cute girl?","labels":0} 4 | {"question1":"How is it possible to mumble effectively with no usage or chatting in chat on PC?","question2":"What is the difference between a call book and data management program?","labels":0} 5 | {"question1":"Why do you trust people?","question2":"Why do we trust people who don't trust others?","labels":0} 6 | {"question1":"How do I hack someone else\u2019s WhatsApp account without having access to his\/her phone?","question2":"How do I hack WhatsApp?","labels":1} 7 | {"question1":"How do I build a brain map of my mind?","question2":"How do I build my brain map?","labels":1} 8 | {"question1":"What causes a body to break down?","question2":"What causes a body to break down? 
How can it be stopped? How do I stop it?","labels":1} 9 | {"question1":"Why is it so hard to get a job in the government if you don't have any roots?","question2":"Why it is so hard to get a job in government if you don't have roots?","labels":1} 10 | {"question1":"What is average salary for a software engineer with a B.Tech degree in India?","question2":"What is average salary for a software engineer with a B.Tech degree in India?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_0.json: -------------------------------------------------------------------------------- 1 | {"question1":"How is the culture of Yannis Baratheon (character) different from the culture of Arya Stark (character)?","question2":"\"Is it possible to get \"\"of course\"\" marks at Yannis Baratheon in Fallout 4?\"","labels":0} 2 | {"question1":"What is it like to work in a corporate office?","question2":"What is it like to work at Hubspot.com?","labels":0} 3 | {"question1":"What is your review of Facepalm?","question2":"Does coffee have a caffeine profile? If not, what is its caffeine content?","labels":0} 4 | {"question1":"What is the corporate culture like at Electronic Arts? How is the culture different than other companies?","question2":"What is the corporate culture like at Amazon? 
How is the culture different than other companies?","labels":0} 5 | {"question1":"How does one use Compasein Finder?","question2":"How is the Internet linked with BIND and which one should we use?","labels":0} 6 | {"question1":"What is the one year plan of life that a person must follow when he is depressed?","question2":"What is the one year life plan of life that a person must follow when he is depressed?","labels":1} 7 | {"question1":"What will happen if India leaves Pakistan?","question2":"What will happen if India & Pakistan end their relationship now or later?","labels":1} 8 | {"question1":"How do I get messages from Facebook messenger without using phone number?","question2":"How do I get messages from Facebook messenger without using any phone number?","labels":1} 9 | {"question1":"How can I make good friends with people?","question2":"How do I make good friends with people?","labels":1} 10 | {"question1":"How do I get rid of hair loss?","question2":"How do I get rid of hair loss?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_16.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is personification? 
What are examples of this?","question2":"What does saying 'I want to know you' mean?","labels":0} 2 | {"question1":"How safe are the gyros in Indian malls that offer views from all sides of the mall, and there's no screen?","question2":"Are Gyro Stations safe for working women in India?","labels":0} 3 | {"question1":"Can I stop clicking button and clicking through the buttons of Google Plus all the time?","question2":"How do I control the various\/visible clicks on an e-commerce website?","labels":0} 4 | {"question1":"If you're a stranger, does anyone else have anyone you know set a good example?","question2":"Can I wear a full volume shirt for casual showroom showroom and designer showroom showroom in Delhi\/NCR to choose what I wear?","labels":0} 5 | {"question1":"What are some tips on making it through the job interview process at Honeywell Automation?","question2":"What are some tips on making it through the job interview process at AT&T?","labels":0} 6 | {"question1":"How do you learn to love something?","question2":"How can I learn to love something?","labels":1} 7 | {"question1":"How can I hack WiFi and get a list of my connected devices?","question2":"How can I hack wifi using my WiFi?","labels":1} 8 | {"question1":"How can I get a Reliance Jio SIM card by a Paytm associate from India?","question2":"How can I get a Reliance Jio SIM card by payingtm associate from India?","labels":1} 9 | {"question1":"How do I get rid of belly fat?","question2":"How can I get rid of belly fat naturally?","labels":1} 10 | {"question1":"What is your favorite country in the world and why?","question2":"What are your favorite countries in the world? 
Why?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_13.json: -------------------------------------------------------------------------------- 1 | {"question1":"What are the best headphones under Rs. 5000 for sound quality and connectivity?","question2":"What is the most amazing earphone under 2500 dollars and why?","labels":0} 2 | {"question1":"What are some great private parties in Berlin?","question2":"How much money are you earning by donating your kidney donation when you have not yet donated it?","labels":0} 3 | {"question1":"What is the corporate culture like at Ray-Ban? How is the culture different than other companies?","question2":"What is the corporate culture like at Myriad? 
How is the culture different than other companies?","labels":0} 4 | {"question1":"What is the global strength in terms of placements, awards, etc.?","question2":"Why does heat always pull me toward warm area?","labels":0} 5 | {"question1":"What's the difference between a car airbag and an aerodynamic deflection deflection?","question2":"What are some tactics for driving smart vehicles and preventing others?","labels":0} 6 | {"question1":"How much does a fresher get paid in Dubai?","question2":"How much can a fresher get paid in Dubai for the first year?","labels":1} 7 | {"question1":"How can I hack my partner phone through WhatsApp?","question2":"How can I hack WhatsApp by touching my phone?","labels":1} 8 | {"question1":"What is the best way to develop long distance relationships?","question2":"How can I develop a long distance relationship?","labels":1} 9 | {"question1":"Why do some people still believe that the earth is flat?","question2":"Why do people still believe the earth is flat?","labels":1} 10 | {"question1":"What is it like working at McKinsey for your first job?","question2":"What is it like to work at McKinsey for your first job?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_14.json: -------------------------------------------------------------------------------- 1 | {"question1":"Does skipping increase power output of a car battery?","question2":"Is it normal to have a low root resistance car battery?","labels":0} 2 | {"question1":"Which is the best introduction of Shabak(a bedroom)?","question2":"What does it feel like to be a doctor as a doctor?","labels":0} 3 | {"question1":"How do I get free mobile data sim and text messages?","question2":"How 
can I gain free data from a text message to my brother on Skype?","labels":0} 4 | {"question1":"What's the difference between frmat and mmat?","question2":"How do I calculate my GPA?","labels":0} 5 | {"question1":"How much does it cost to send a product from India to Singapore?","question2":"How much does it cost to send a product from India to the US?","labels":0} 6 | {"question1":"What is the hardest thing(s) about raising a child in a remote area and how does it differ from raising a child in a city or town?","question2":"What is the hardest thing(s) about raising a child in a remote area and how does it differ from raising a child in a city or town?","labels":1} 7 | {"question1":"How do I learn English in a short time?","question2":"How can I learn English with a short time?","labels":1} 8 | {"question1":"Are people on Quora pedophiles? If yes, why?","question2":"Are people on Quora pedophiles? If yes, why?","labels":1} 9 | {"question1":"How can I increase traffic to my website from a blog post on Quora?","question2":"How do I increase the traffic on my website from a blog post on Quora?","labels":1} 10 | {"question1":"What will be the impact of demonetization of Rs. 
500\/1000 notes on Indian stock market?","question2":"What is the impact of demonetization of 500 and 1000 rupee notes on Indian stock market?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_17.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is the best iPod Touch and why?","question2":"What was the best unit sold in the USA?","labels":0} 2 | {"question1":"Do U&J companies use Java's ORM?","question2":"What is the difference between human\/virginal\/legal rights and natural, non-malicious or seriously harmful injurious effects of death effects for a non-human, non-paranoid species?","labels":0} 3 | {"question1":"What is a good website for CNC machining?","question2":"What is a good website to download some CNC machining books?","labels":0} 4 | {"question1":"Do astronauts like their legs being turned into furniture for their waist? 
If so, why?","question2":"What are the excuses a woman might give while having sex for men?","labels":0} 5 | {"question1":"How does one get bandh numbers from CM addressbook?","question2":"How can one make the bandh numbers as an index into their driver's license with no valid addressbook?","labels":0} 6 | {"question1":"Why do some people fall in love with only one person?","question2":"Why do people fall in love with one person?","labels":1} 7 | {"question1":"How do I have bad breath while driving?","question2":"How do I have bad breath while driving?","labels":1} 8 | {"question1":"How much real estate can I get in india per month with open house in bangalore?","question2":"How much real estate can I get in india per month with open house in bangalore?","labels":1} 9 | {"question1":"What is the best way to get better grades in school?","question2":"How do I get better grades in school?","labels":1} 10 | {"question1":"What are some mind blowing technology gadgets that most people don't know about?","question2":"What are some mind blowing technology gadgets that most people don't know about?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_2.json: -------------------------------------------------------------------------------- 1 | {"question1":"Why should I write a good backmatter for an international conference?","question2":"Where can I study internationally on business logic?","labels":0} 2 | {"question1":"How long does it take you to learn the German language?","question2":"How long does it take to learn the English language?","labels":0} 3 | {"question1":"What are some unexpected things first-time visitors to Colombia notice?","question2":"What are some 
unexpected things first-time visitors to Canada notice?","labels":0} 4 | {"question1":"Why is red in PFUS something I can't see when I tap PFUS?","question2":"Did one have a chance to see one of the real masterpieces being played by Richard Bachardo in MS Dhoni Cricket: Live Streaming, in the Permanent XI Test Center at Mumbai?","labels":0} 5 | {"question1":"How does digital gatekeeper disable ads on a WiFi band?","question2":"How can I enable\/disable my WiFi network on my HTC phone?","labels":0} 6 | {"question1":"How do I recover my Gmail account after recovery?","question2":"How do I recover my Gmail account from recovery?","labels":1} 7 | {"question1":"How do you prevent hair loss without touching hair?","question2":"How do I prevent hair loss without touching hair?","labels":1} 8 | {"question1":"How do I get successful in C.E.?","question2":"How can I get successful in C.E.?","labels":1} 9 | {"question1":"What is the best word or link you use to explain the meaning of a certain book to a friend?","question2":"What is the best word or link you use to explain the meaning of a certain book to a friend?","labels":1} 10 | {"question1":"How will the ban of Rs 500 and Rs 1000 notes affect Indian economy?","question2":"How will the 500 and 1000 rupee notes ban affect the Indian economy?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_3.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is an advantage of a vacuum?","question2":"What is the advantage of a flat magnet and how is it used?","labels":0} 2 | {"question1":"How do I get a Reliance Jio SIM card with the USB port?","question2":"How do I get the Maruti 
Jio SIM card with the USB port?","labels":0} 3 | {"question1":"What are some of the most amazing, practical situations where you could have gone insane?","question2":"What was the best Locker room or VST? Why?","labels":0} 4 | {"question1":"How do I write a well written, and entertaining How To pass the Test exam?","question2":"What is the way to know if I'm underrated in a field of study?","labels":0} 5 | {"question1":"How is the life of an outcast get a job in big IT companies?","question2":"How much capital do IT companies use to hire out people?","labels":0} 6 | {"question1":"How do I recover my Gmail account information if it is no longer online?","question2":"How can I recover my Gmail account information if it is no longer online?","labels":1} 7 | {"question1":"How do I know when I'm in love?","question2":"How do I know if I am in love?","labels":1} 8 | {"question1":"Can I get pregnant one day after my periods?","question2":"Can I get pregnant the day after periods? How can I get pregnant so that I can get my period a week after I have my period?","labels":1} 9 | {"question1":"How do I earn money online without investment?","question2":"How can I make money online without any investment?","labels":1} 10 | {"question1":"What is the most profitable business to start in India and how is the business model different than other businesses?","question2":"What is the most profitable business to start in India and how is the business model different than other businesses?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_8.json: -------------------------------------------------------------------------------- 1 | {"question1":"What are the best hotels for unmarried 
couples in Kiritimari, Mumbai?","question2":"Can unmarried couples stay at a hotel in city of Kiritimari, Mumbai, India for a long time in a stable way?","labels":0} 2 | {"question1":"Which programming language or programming language I should learn if I want to contribute in field of computer science?","question2":"Which is a good programming language to start as a software engineer?","labels":0} 3 | {"question1":"What is it like to work at Cognizant Group for your first job?","question2":"What is it like to work at Cognizant Group for your first job in India?","labels":0} 4 | {"question1":"How can I find the money back of a cheque which was withdrawn from ATM but later is back in the bank?","question2":"Why is the anti-recycling program difficult?","labels":0} 5 | {"question1":"Can I paint my car as a turbocharger for a Subaru?","question2":"What are the best stories to tell you about thinking out your self, as a different type of thinking is used as a means of understanding what's really worthiness of your self?","labels":0} 6 | {"question1":"How do I get traffic for my websites?","question2":"How do I get traffic for my website?","labels":1} 7 | {"question1":"If I eat a lot of pot salt, should I still be gaining weight?","question2":"If I eat lots of pot salt, should I still be gaining weight?","labels":1} 8 | {"question1":"What are some mind blowing gadgets that most people don't know about?","question2":"What are some mind blowing gadgets that most people don't know?","labels":1} 9 | {"question1":"How do I make friends easily?","question2":"How can I make friends easily?","labels":1} 10 | {"question1":"Which is the best cell phone under 25000?","question2":"Which is the best phone under 25000 in India?","labels":1} 11 | -------------------------------------------------------------------------------- /src/coreset/rank_dilm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from 
datasets import Dataset 5 | from transformers import DataCollatorForLanguageModeling 6 | 7 | from generator import GeneratorModel 8 | 9 | from .coreset_utils import batch_to_cuda 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def rank_with_dilm( 15 | dataset: Dataset, dpc: int, generator: GeneratorModel, sentence_keys: list[str] 16 | ): 17 | generator.cuda() 18 | generator.eval() 19 | 20 | data_collator = DataCollatorForLanguageModeling( 21 | tokenizer=generator.tokenizer, mlm=False, pad_to_multiple_of=8 22 | ) 23 | 24 | def compute_loss(batch: dict): 25 | batch_size = len(batch[list(batch.keys())[0]]) 26 | if -1 in batch["labels"]: 27 | bos_tokens = [generator.tokenizer.bos_token] * len(batch["labels"]) 28 | else: 29 | bos_tokens = [generator.bos_tokens_map[i] for i in batch["labels"]] 30 | 31 | # sentences 32 | batch_sentences = [[s.strip() for s in batch[key]] for key in sentence_keys] 33 | concat_sentences = [ 34 | f" {generator.sep_token} ".join(sents) for sents in zip(*batch_sentences) 35 | ] 36 | batch_sentences = [ 37 | f"{bos_token} {sent} {generator.tokenizer.eos_token}" 38 | for bos_token, sent in zip(bos_tokens, concat_sentences) 39 | ] 40 | batch = generator.tokenizer(batch_sentences) 41 | inputs = data_collator( 42 | [{k: v[i] for k, v in batch.items()} for i in range(batch_size)] 43 | ) 44 | with torch.inference_mode(): 45 | losses = generator.compute_loss(**batch_to_cuda(inputs)) 46 | assert losses.size() == (batch_size,) 47 | return {"loss": losses.tolist()} 48 | 49 | logger.info("Computing losses with generator") 50 | losses = dataset.map(compute_loss, batched=True, batch_size=256)["loss"] 51 | logger.info("Done!!") 52 | 53 | topk_indices = torch.topk(torch.tensor(losses), k=dpc, largest=False)[1] 54 | return dataset.select(topk_indices) 55 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_6.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is the name of the best American service center for sipping e-votas?","question2":"What is the name of the best American service center for drinking tea?","labels":0} 2 | {"question1":"Why does water always freeze in ice and freeze on the surface and not liquid at night?","question2":"Will you agree that she's going to do anything?","labels":0} 3 | {"question1":"I am looking for review material for a building exhibition that has a logo. Suggest ideas. If the logo does not need to identify itself, what is the best way to identify that design?","question2":"How do I play Logic through the PS2's built in clock?","labels":0} 4 | {"question1":"How do you find out whether you have a needle in your blood in the middle of your nose?","question2":"Is it normal to be a mom with a needle in your nose? If not, what is the meaning? 
How common is it?","labels":0} 5 | {"question1":"How difficult is it to find female partner in India?","question2":"What is it like to have a female Indian partner?","labels":0} 6 | {"question1":"What can I do to get better grades in school?","question2":"How do I get better grades in school?","labels":1} 7 | {"question1":"Why do some humans think they are born weak and can't adapt to their environment?","question2":"Why are some humans born weak and can't adapt to their environment?","labels":1} 8 | {"question1":"What are the views of the Indian media that demonetized 500 and 1000 rupee notes will help curb black money?","question2":"What are the views of Indian media that demonetized 500 and 1000 rupee notes will help curb black money?","labels":1} 9 | {"question1":"Why do some people still believe the Earth is flat?","question2":"Why are there people who still believe that the earth is flat?","labels":1} 10 | {"question1":"How do I increase the traffic on my blog?","question2":"How do I increase the traffic of my blog?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_4.json: -------------------------------------------------------------------------------- 1 | {"question1":"How is almagia Eco Eco Eco cell 1.6m?","question2":"How good is Eco Eco Eco Eco cell 1.6m in terms of electricity? What is your review of Eco Eco Eco Eco Cells in hyderabad?","labels":0} 2 | {"question1":"Why do girls cry when they find that their boyfriend throws up immediately after their separation?","question2":"I was with my boyfriend a couple of years back and I lost count of how many days he was already dating me. He needs to spend more time with me. 
If I don't let him go, does the rest of my boyfriend hate me and will hurt me if he spends more time with me?","labels":0} 3 | {"question1":"Why was Krishna exempt from oppression of men on Karna alone?","question2":"Who is the father of Java engine?","labels":0} 4 | {"question1":"What's the point of monthly mileage with the iPad battery when you don't want a fully charged device when it is almost two-and-a-half hours?","question2":"How do you calculate the fuel consumption of your car battery? What about the battery and outer shell?","labels":0} 5 | {"question1":"How do I find creative applications of calculus?","question2":"How does a powerful car piloting power compare to a muslim piloting power?","labels":0} 6 | {"question1":"What are some mind blowing phone inventions that most people don't know about?","question2":"What are the mind blowing mobile inventions that most people don't know about?","labels":1} 7 | {"question1":"How do I get good grades in school?","question2":"How can I get good grades in school?","labels":1} 8 | {"question1":"How do I get freedom in life?","question2":"How do I get free freedom in life?","labels":1} 9 | {"question1":"How do I lose weight without doing exercise?","question2":"How do you lose fat without doing exercise?","labels":1} 10 | {"question1":"How do I easily get a good result from Amazon Express?","question2":"How do I get a good result from Amazon Express?","labels":1} 11 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_4.json: -------------------------------------------------------------------------------- 1 | {"sentence":"the only thing remotely resembling humor","labels":0} 2 | {"sentence":"this is a complete waste of time, money and 
celluloid.","labels":0} 3 | {"sentence":"in a poorly cast, poorly written, murky, amateurish screenplay, it's really little more than a particularly slanted, gay s\/m fantasy, enervating and deadeningly drawn-out.","labels":0} 4 | {"sentence":"ends up falling short as a whole","labels":0} 5 | {"sentence":"is simply not enough of interest onscreen to sustain its seventy-minute running time about as sharp as a samurai sword.","labels":0} 6 | {"sentence":"single-mindedly biased, hypocritical and a dim-witted pairing of teen-speak and animal gibberish","labels":0} 7 | {"sentence":"a banal bore that plays like a loosely-connected string of acting-workshop exercises.","labels":0} 8 | {"sentence":"the dullest tangents","labels":0} 9 | {"sentence":"the story line is just too bland and obvious to be interesting.","labels":0} 10 | {"sentence":"puts too much emphasis on the disappointingly generic nature of the entire effort.","labels":0} 11 | {"sentence":"is both gripping and compelling.","labels":1} 12 | {"sentence":"a strong erotic spark to the most crucial lip-reading sequence.","labels":1} 13 | {"sentence":"a fascinating social tale that captures the innocence and budding demons within a wallflower","labels":1} 14 | {"sentence":"is a visual treat for all audiences","labels":1} 15 | {"sentence":"of clever concept","labels":1} 16 | {"sentence":"it has rewards, and the magnificent swooping aerial shots make this an exhilarating film for kids and adults alike.","labels":1} 17 | {"sentence":"a masterpiece and a must for genre fans.","labels":1} 18 | {"sentence":"is a winning comedy that excites the imagination and tickles the funny bone.","labels":1} 19 | {"sentence":"a core of decency and respect","labels":1} 20 | {"sentence":"change is a film that refreshes the mind and spirit along with the body, so transporting is its bold presentation.","labels":1} 21 | -------------------------------------------------------------------------------- /configs/test/dc.yaml: 
-------------------------------------------------------------------------------- 1 | base: 2 | experiment_name: test.${learner.model_name}.${data.task_name} 3 | method: dilm 4 | run_name: ${base.method}.dc.${base.sub_run_name} 5 | sub_run_name: dpc_${distilled_data.dpc} 6 | save_dir_root: ./save 7 | save_method_dir: ${base.save_dir_root}/${base.experiment_name}/${base.method}.dc 8 | save_dir: ${base.save_method_dir}/${base.sub_run_name} 9 | data_dir_root: ./data 10 | seed: 42 11 | device: null 12 | 13 | data: 14 | task_name: sst2 15 | datasets_path: ${base.data_dir_root}/${data.task_name}/datasets 16 | preprocessed_datasets_path: ${base.data_dir_root}/${data.task_name}/datasets_${generator.model_name}_${learner.model_name} 17 | train_batch_size: 64 18 | valid_batch_size: 256 19 | test_batch_size: 256 20 | num_proc: 1 21 | force_preprocess: False 22 | 23 | coreset: 24 | coreset_type: k_centers # {random, k_centers, herding, rank_dilm} 25 | model_name: bert-base-uncased 26 | save_dir: ${base.data_dir_root}/${data.task_name}/coresets/${coreset.coreset_type} 27 | 28 | generator: 29 | model_name: gpt2 30 | pretrained_model_dir: null 31 | checkpoint_name: last-ckpt 32 | top_p: 0.95 33 | top_k: null 34 | repetition_penalty: 1.0 35 | generate_batch_size: 512 36 | generate_max_length: null 37 | generate_fp16: False 38 | generate_bf16: True 39 | gradient_checkpointing: True 40 | 41 | learner: 42 | model_name: bert-base-uncased 43 | few_shot: ${evaluate.few_shot} 44 | use_pretrained_model: True 45 | disable_dropout: False 46 | freeze_bert: False 47 | gradient_checkpointing: True 48 | 49 | distilled_data: 50 | dpc: 20 51 | n_dataset: 20 52 | save_dataset_path: ${base.save_dir}/dataset 53 | over_sample_ratio: 1.0 54 | 55 | evaluate: 56 | task_name: ${data.task_name} 57 | n_eval_per_dataset: 5 58 | fp16: False 59 | bf16: True 60 | save_result_dir: ${base.save_dir}/final_results 61 | 62 | # training config 63 | few_shot: False 64 | 65 | optimizer_type: adamw # ["sgd", "momentum", 
"adam", "adamw"] 66 | scheduler_type: cosine 67 | lr: 1.0e-4 68 | max_grad_norm: 1.0 69 | weight_decay: 0.01 70 | warmup_ratio: 0.5 71 | 72 | train_step: 200 73 | batch_size: 64 74 | 75 | hydra: 76 | run: 77 | dir: ${base.save_dir} 78 | sweep: 79 | dir: ${base.save_method_dir} 80 | subdir: ${base.sub_run_name} 81 | -------------------------------------------------------------------------------- /configs/test/lm.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | experiment_name: test.${learner.model_name}.${data.task_name} 3 | method: dilm 4 | run_name: ${base.method}.lm.${base.sub_run_name} 5 | sub_run_name: dpc_${distilled_data.dpc} 6 | save_dir_root: ./save 7 | save_method_dir: ${base.save_dir_root}/${base.experiment_name}/${base.method}.lm 8 | save_dir: ${base.save_method_dir}/${base.sub_run_name} 9 | data_dir_root: ./data 10 | seed: 42 11 | device: null 12 | 13 | data: 14 | task_name: sst2 15 | datasets_path: ${base.data_dir_root}/${data.task_name}/datasets 16 | preprocessed_datasets_path: ${base.data_dir_root}/${data.task_name}/datasets_${generator.model_name}_${learner.model_name} 17 | train_batch_size: 64 18 | valid_batch_size: 256 19 | test_batch_size: 256 20 | num_proc: 1 21 | force_preprocess: False 22 | 23 | coreset: 24 | coreset_type: k_centers # {random, k_centers, herding, rank_dilm} 25 | model_name: bert-base-uncased 26 | save_dir: ${base.data_dir_root}/${data.task_name}/coresets/${coreset.coreset_type} 27 | 28 | generator: 29 | model_name: gpt2 30 | pretrained_model_dir: null 31 | checkpoint_name: last-ckpt 32 | top_p: 0.95 33 | top_k: null 34 | repetition_penalty: 1.0 35 | generate_batch_size: 512 36 | generate_max_length: null 37 | generate_fp16: False 38 | generate_bf16: True 39 | gradient_checkpointing: True 40 | 41 | learner: 42 | model_name: bert-base-uncased 43 | few_shot: ${evaluate.few_shot} 44 | use_pretrained_model: True 45 | disable_dropout: False 46 | freeze_bert: False 47 | 
gradient_checkpointing: True 48 | 49 | distilled_data: 50 | dpc: 20 51 | n_dataset: 20 52 | save_dataset_path: ${base.save_dir}/dataset 53 | over_sample_ratio: 100.0 54 | 55 | evaluate: 56 | task_name: ${data.task_name} 57 | n_eval_per_dataset: 5 58 | fp16: False 59 | bf16: True 60 | save_result_dir: ${base.save_dir}/final_results 61 | 62 | # training config 63 | few_shot: False 64 | 65 | optimizer_type: adamw # ["sgd", "momentum", "adam", "adamw"] 66 | scheduler_type: cosine 67 | lr: 1.0e-4 68 | max_grad_norm: 1.0 69 | weight_decay: 0.01 70 | warmup_ratio: 0.5 71 | 72 | train_step: 200 73 | batch_size: 64 74 | 75 | hydra: 76 | run: 77 | dir: ${base.save_dir} 78 | sweep: 79 | dir: ${base.save_method_dir} 80 | subdir: ${base.sub_run_name} 81 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_11.json: -------------------------------------------------------------------------------- 1 | {"sentence":"a hokey piece of nonsense that lacks the spirit of the previous two, and fails to fully exploit its social and political potential.","labels":0} 2 | {"sentence":"elitist and lazy... 
that it makes your least favorite james bond movie seem as cleverly plotted as the usual suspects","labels":0} 3 | {"sentence":"is built on a faulty premise, one it follows into melodrama and silliness and is capped with pointless extremes.","labels":0} 4 | {"sentence":"the only thing this film fails on are the jokes, which never reach satisfying conclusions.","labels":0} 5 | {"sentence":"on the hype machine that gives movies about ordinary folk almost nothing to shout about.","labels":0} 6 | {"sentence":"this is a wretched movie with a single ill-timed explosion and no end of dumb humor.","labels":0} 7 | {"sentence":"was as bad at it was cruel","labels":0} 8 | {"sentence":"the lazy material and the finished product's unshapely look","labels":0} 9 | {"sentence":"a cold, calculated exercise in politics that gives political prisoners a bad name.","labels":0} 10 | {"sentence":"the worst of tragedies can seem like a collection of the worst of irresponsible screenwriting orders","labels":0} 11 | {"sentence":"a sophisticated and engaging film about entrapment in the maze of modern life","labels":1} 12 | {"sentence":"the result is worth seeing.","labels":1} 13 | {"sentence":"a universal human impulse","labels":1} 14 | {"sentence":"smartly written, well-acted and in a superb ensemble cast, `` xxx '' is a must for genre fans.","labels":1} 15 | {"sentence":"a worthy entry","labels":1} 16 | {"sentence":"a perfect family film to bring to imax in the future with a vision both painterly and literary.","labels":1} 17 | {"sentence":"brilliantly written, visually graceful","labels":1} 18 | {"sentence":"is every bit as distinctive as his visuals.","labels":1} 19 | {"sentence":"from its uncanny tale of love and communal discord to the inspired music of spielberg, the movie is a must for genre fans and a must for genre-watchers.","labels":1} 20 | {"sentence":"of the most genuinely sweet films of the year.","labels":1} 21 | 
-------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_7.json: -------------------------------------------------------------------------------- 1 | {"sentence":"a bad film that comes across as a complete waste of time.","labels":0} 2 | {"sentence":"more unoriginal crap","labels":0} 3 | {"sentence":", obnoxious 88-minute travesty that does no justice to the story itself.","labels":0} 4 | {"sentence":"is ultimately rather silly and overwrought, while never sure what its point is.","labels":0} 5 | {"sentence":"a cinematic disaster that never bothers to question why somebody might devote time to see it","labels":0} 6 | {"sentence":"the biting satire is ultimately rather short of both a good movie and a bad one.","labels":0} 7 | {"sentence":"is something of a stiff -- an extra-dry office comedy that seems twice as long as its 83 minutes.","labels":0} 8 | {"sentence":"this chicken is all bluster -- a self-parody that never catches fire.","labels":0} 9 | {"sentence":"it's not that funny ; that's a waste of de niro, mcdormand and the other good actors in the cast.","labels":0} 10 | {"sentence":"the immature provocations of its actors, which drains it of the dramatic substance that would shake us in our boots ( or cinema seats ) for a second.","labels":0} 11 | {"sentence":"is well worthwhile as a thoughtful history lesson.","labels":1} 12 | {"sentence":"this is a solidly entertaining family film that elevates the experience to a more mythic level.","labels":1} 13 | {"sentence":"a surprising and rewarding glimpse into the urban heart","labels":1} 14 | {"sentence":"an interesting look at the rapidly changing face of fame","labels":1} 15 | {"sentence":"a beautifully read, deeply absorbing piece of 
cinema that deftly captures the dry wit that's so prevalent on the rock.","labels":1} 16 | {"sentence":"about the best arnold machtner movies in years","labels":1} 17 | {"sentence":"is unusual, food-for-thought cinema that's as entertaining as it is instructive.","labels":1} 18 | {"sentence":"is as consistently engaging as it is revealing.","labels":1} 19 | {"sentence":"a jaw-droppingly beautiful work that upends nearly every clich\u00e9 of japanese animation for a satisfying entertainment destination.","labels":1} 20 | {"sentence":"vivid performances and originality","labels":1} 21 | -------------------------------------------------------------------------------- /configs/test/coreset.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | experiment_name: test.${learner.model_name}.${data.task_name} 3 | method: coreset 4 | run_name: ${base.method}.${coreset.coreset_type}.${base.sub_run_name} 5 | sub_run_name: dpc_${distilled_data.dpc} 6 | save_dir_root: ./save 7 | save_method_dir: ${base.save_dir_root}/${base.experiment_name}/${base.method}.${coreset.coreset_type} 8 | save_dir: ${base.save_method_dir}/${base.sub_run_name} 9 | data_dir_root: ./data 10 | seed: 42 11 | device: null 12 | 13 | data: 14 | task_name: sst2 15 | datasets_path: ${base.data_dir_root}/${data.task_name}/datasets 16 | preprocessed_datasets_path: ${base.data_dir_root}/${data.task_name}/datasets_${generator.model_name}_${learner.model_name} 17 | train_batch_size: 64 18 | valid_batch_size: 256 19 | test_batch_size: 256 20 | num_proc: 1 21 | force_preprocess: False 22 | 23 | coreset: 24 | coreset_type: random # {random, k_centers, herding, rank_dilm} 25 | model_name: bert-base-uncased 26 | save_dir: ${base.data_dir_root}/${data.task_name}/coresets/${coreset.coreset_type} 27 | 28 | generator: 29 | model_name: gpt2 30 | pretrained_model_dir: null 31 | checkpoint_name: null 32 | top_p: 0.95 33 | top_k: null 34 | repetition_penalty: 1.0 35 | 
generate_batch_size: 512 36 | generate_max_length: null 37 | generate_fp16: False 38 | generate_bf16: True 39 | gradient_checkpointing: True 40 | 41 | learner: 42 | model_name: bert-base-uncased 43 | few_shot: ${evaluate.few_shot} 44 | use_pretrained_model: True 45 | disable_dropout: False 46 | freeze_bert: False 47 | gradient_checkpointing: True 48 | 49 | distilled_data: 50 | dpc: 20 51 | n_dataset: 20 52 | save_dataset_path: ${base.save_dir}/dataset 53 | over_sample_ratio: 1.0 54 | 55 | evaluate: 56 | task_name: ${data.task_name} 57 | n_eval_per_dataset: 5 58 | fp16: False 59 | bf16: True 60 | save_result_dir: ${base.save_dir}/final_results 61 | 62 | # training config 63 | few_shot: False 64 | 65 | optimizer_type: adamw # ["sgd", "momentum", "adam", "adamw"] 66 | scheduler_type: cosine 67 | lr: 1.0e-4 68 | max_grad_norm: 1.0 69 | weight_decay: 0.01 70 | warmup_ratio: 0.5 71 | 72 | train_step: 200 73 | batch_size: 64 74 | 75 | hydra: 76 | run: 77 | dir: ${base.save_dir} 78 | sweep: 79 | dir: ${base.save_method_dir} 80 | subdir: ${base.sub_run_name} 81 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_14.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is not as fresh or enjoyable as it ought to be.","labels":0} 2 | {"sentence":"the movie is less the product of loving, well integrated homage and more like a mere excuse for the wan, thinly sketched story.","labels":0} 3 | {"sentence":"a painfully flat gross-out comedy that tries too hard to be emotional instead of real.","labels":0} 4 | {"sentence":", the picture feels less like bad cinema than like being stuck in a dark pit having a nightmare about bad cinema.","labels":0} 5 | 
{"sentence":"is time bombs and verges on the amateurish.","labels":0} 6 | {"sentence":"a puppy dog story that never springs to life","labels":0} 7 | {"sentence":"the worst sense of the expression","labels":0} 8 | {"sentence":"'s no excuse for following up a delightful, well-crafted family film with a computer-generated cold fish.","labels":0} 9 | {"sentence":"as pretentious as allen's pink slip, auto focus bears out as your typical junkie opera... dull, lifeless, and lifeless.","labels":0} 10 | {"sentence":"sketchy characters and immature provocations","labels":0} 11 | {"sentence":"bring an irresistible blend of warmth and humor and a consistent embracing humanity in the face of life's harshness","labels":1} 12 | {"sentence":"may be one of the best-sustained ideas i have ever seen on the screen.","labels":1} 13 | {"sentence":"is outstanding for both originality and bite","labels":1} 14 | {"sentence":"the film is in many ways the perfect festival film : a calm, self-assured portrait of small town regret, love, duty and friendship that appeals to the storytelling instincts of a slightly more literate filmgoing audience.","labels":1} 15 | {"sentence":"a gorgeously strange movie that examines many different ideas from happiness to guilt in an intriguing bit of storytelling.","labels":1} 16 | {"sentence":"more than a worthwhile effort","labels":1} 17 | {"sentence":"the funniest jokes of any movie this year","labels":1} 18 | {"sentence":"with originality, humor and pathos","labels":1} 19 | {"sentence":"a magnificent documentary.","labels":1} 20 | {"sentence":"is a stunning film that captures the innocence and budding demons within a wallflower.","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_19.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is painfully formulaic and stilted in its attempts to be more, more weirdly beautiful than it actually is.","labels":0} 2 | {"sentence":", unwieldy cast is painful to watch, offensive and redundant, waste of time that could have ( and probably should have ) been better spent on an animated sequel.","labels":0} 3 | {"sentence":"filled with flatly, uncreative nonsense.","labels":0} 4 | {"sentence":"this is a complete waste of time, money and celluloid.","labels":0} 5 | {"sentence":"rather the mediocre end of the pool","labels":0} 6 | {"sentence":"the mall movie is built on a faulty premise, one it follows into melodrama and silliness that is wholly unconvincing and stilted.","labels":0} 7 | {"sentence":"is too busy hitting all of its assigned marks to find a rhythm or develop a sense of narrative urgency","labels":0} 8 | {"sentence":"a loud, low-budget and tired formula film that is repeatedly undercut by the brutality of the jokes, most at women's expense.","labels":0} 9 | {"sentence":"needlessly confusing in ways you hope it will not","labels":0} 10 | {"sentence":"with an overly convenient plot and stop-and-start pacing that stalls in its lackluster gear of emotional blandness.","labels":0} 11 | {"sentence":"this odd, poetic road movie, spiked by jolts of pop music, is for you.","labels":1} 12 | {"sentence":"a powerful emotional wallop","labels":1} 13 | {"sentence":"is a masterpiece, offering an original, nonjudgmental kind of storytelling.","labels":1} 14 | {"sentence":"the pulse of discovery, imagination and insight","labels":1} 15 | {"sentence":"a naturally funny script.","labels":1} 16 | 
{"sentence":"this clever look at changing times offers a disturbing glimpse into the insecure class life in contemporary china.","labels":1} 17 | {"sentence":"a powerful, naturally dramatic piece that places the good-time shenanigans in welcome perspective","labels":1} 18 | {"sentence":", schaeffer's film has a true cinematic knack, and is always watchable.","labels":1} 19 | {"sentence":"the best date movie in years","labels":1} 20 | {"sentence":"is a visionary with an infectious enthusiasm that goes a long way toward keeping the picture compelling","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_10.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is painfully bad, an insultingly unbelievable trifle that fails to give an authentic feel.","labels":0} 2 | {"sentence":"a soulless hunk of pseudo-intellectual garbage that manages to be even worse than its title.","labels":0} 3 | {"sentence":"is clumsily sentimental and ineptly directed.","labels":0} 4 | {"sentence":"just an unhappy situation","labels":0} 5 | {"sentence":"the one-sided theme makes no sense, and the story is thin and obvious propaganda from a director who has no clue about making any movies.","labels":0} 6 | {"sentence":"would be as unoriginal as this long running piffle that doesn't give a damn.","labels":0} 7 | {"sentence":"never reach satisfying conclusions","labels":0} 8 | {"sentence":"this hackneyed movie, directed by scott kalvert, is even less capable of charming the masses with star power, a pop-induced score and sentimental moments that have no bearing on the story.","labels":0} 9 | {"sentence":"'s a boring, pretentious muddle that 
encourages you to skip it and go see an alternative.","labels":0} 10 | {"sentence":"the bombastic self-glorification of other feel-good fiascos like antwone fisher or the emperor's club any time","labels":0} 11 | {"sentence":"provide the perfect starting point for a national conversation about gun culture and diversity.","labels":1} 12 | {"sentence":"a witty, whimsical feature debut that excites the imagination and tickles the funny bone.","labels":1} 13 | {"sentence":"the best date movie in ages","labels":1} 14 | {"sentence":"the heart and the funnybone wonderfully captures the dry wit that's so prevalent on the rock.","labels":1} 15 | {"sentence":"a thoughtful and rewarding glimpse into the sort of heartache everyone","labels":1} 16 | {"sentence":"gives a real performance that captures the innocence and budding demons within a wallflower.","labels":1} 17 | {"sentence":"is a finely written, superbly acted offbeat thriller that excites the imagination and tickles the funny bone.","labels":1} 18 | {"sentence":"a funny little film with a bouncy score and a clutch of lively songs for deft punctuation -- there's plenty of entertainment value to be had here.","labels":1} 19 | {"sentence":"a delightful catch","labels":1} 20 | {"sentence":"vivid characters and a warm, moving message","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_9.json: -------------------------------------------------------------------------------- 1 | {"sentence":"miss it altogether.","labels":0} 2 | {"sentence":"the lazy material and the finished product's unshapely look","labels":0} 3 | {"sentence":"a bad sound system, slow pacing, and general boorishness... 
winds up as the kind of film that should be the target of something bigger than themselves.","labels":0} 4 | {"sentence":"on its empty head, searching for a missing link somewhere inside its fabric, the film is murder by numbers, a genre-busting cold turkey that doesn't offer much in terms of plot or acting.","labels":0} 5 | {"sentence":"an overexposed waste of film that, half an hour in, starts making water torture seem appealing rather than plausible.","labels":0} 6 | {"sentence":"the plot is just sooooo tired.","labels":0} 7 | {"sentence":"is a cinematic corpse, replete with the pubescent scandalous innuendo and the high-strung but flaccid drama.","labels":0} 8 | {"sentence":"like a lazy ripoff of the downright lame ryan seagal movie","labels":0} 9 | {"sentence":"a negligible work-in-progress that lacks both a purpose and a strong pulse.","labels":0} 10 | {"sentence":"is simply not smart or barbed enough for older viewers.","labels":0} 11 | {"sentence":"is thought-provoking without stooping to base melodrama.","labels":1} 12 | {"sentence":"the best date movie in years","labels":1} 13 | {"sentence":"the film is a brilliantly played, deeply unsettling experience that offers a thoughtful investigation of faith versus intellect.","labels":1} 14 | {"sentence":"a fascinating and intimate study of bourgeois adolescence","labels":1} 15 | {"sentence":"there is that rare family movie that accomplishes so much that one viewing can't possibly be enough.","labels":1} 16 | {"sentence":"that lift the movie above its playwriting 101 premise is a tribute to the actress, and to her inventive director, that the journey is such a mesmerizing one.","labels":1} 17 | {"sentence":"a naturally funny ensemble comedy with a variety of quirky characters and an engaging story about the crucial role of imagination in the soulful development of two rowdy teenagers.","labels":1} 18 | {"sentence":"be very informative","labels":1} 19 | {"sentence":"is an extraordinary film that puts the sting 
back into the con.","labels":1} 20 | {"sentence":"a terrific date film","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_12.json: -------------------------------------------------------------------------------- 1 | {"sentence":"... a dim-witted pairing of teen-speak and animal gibberish that doesn't give a damn about the plight of american indians","labels":0} 2 | {"sentence":"the argentine retread of gosling or tarantino would be lost on anyone who hasn't been living under a rock.","labels":0} 3 | {"sentence":"through this mess, no one will be able to find a movie character more unattractive or odorous ( than leon ), and that's because the film is built on a faulty premise, one it follows into melodrama and silliness.","labels":0} 4 | {"sentence":"a terrible movie that is painfully bad rather than painfully awful","labels":0} 5 | {"sentence":"an otherwise appalling, shamelessly manipulative deception carried out by men of marginal intelligence, with reactionary ideas about women and a total lack of empathy.","labels":0} 6 | {"sentence":"the waste of a good cast","labels":0} 7 | {"sentence":"is never quite able to overcome the cultural moat surrounding its ludicrous and contrived plot.","labels":0} 8 | {"sentence":"but it isn't worth sitting through because the plot simply isn't funny.","labels":0} 9 | {"sentence":"the empty stud knockabout of equilibrium, with all its flaws, is almost completely lacking in charm and charisma.","labels":0} 10 | {"sentence":"foul up shum's good intentions.","labels":0} 11 | {"sentence":", the film has a freshness and modesty that transcends its agenda to entertain and inspire.","labels":1} 12 | {"sentence":"'s to say the film has a 
genuine and singular voice and an engaging story that keeps you guessing at almost every turn.","labels":1} 13 | {"sentence":"a delightful study of quiet power.","labels":1} 14 | {"sentence":"a cool visual treat","labels":1} 15 | {"sentence":"a rich and intelligent film that manages to find greatness in the hue of its drastic iconography.","labels":1} 16 | {"sentence":"the best behind the music documentary since graffiti bridge.","labels":1} 17 | {"sentence":"is well worthwhile.","labels":1} 18 | {"sentence":"a gem of a movie that will stimulate hours of post viewing discussion, if only to be reminded of who did what to whom and why.","labels":1} 19 | {"sentence":"a beautiful study of flower-power liberation","labels":1} 20 | {"sentence":"is a high-spirited buddy movie that interweaves individual stories and is well worth seeing on the big screen.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_6.json: -------------------------------------------------------------------------------- 1 | {"sentence":"look elsewhere, seizing on george's haplessness and lucy's personality tics.","labels":0} 2 | {"sentence":"an overcooked, ham-fisted sermon on the needlessly opaque visual style of freddy gets fingered.","labels":0} 3 | {"sentence":"a bad improvisation exercise that fails on so many levels","labels":0} 4 | {"sentence":"for a mere plot pawn for two directors with far less endearing disabilities","labels":0} 5 | {"sentence":"that should be relegated to a dark video store corner is more silly than scary.","labels":0} 6 | {"sentence":"the hot chick is nothing more than a tepid exercise in manipulation and idiotic, threadbare comic setups that allow the mediocre to make 
boring bedfellows.","labels":0} 7 | {"sentence":"what's most offensive isn't the waste of a good cast, but the idea that the problem with wendigo, apart from the fact it is a shoddy imitation, isn't a terrible movie at all.","labels":0} 8 | {"sentence":"has no bearing on the story, which is relentlessly folksy and monty pryor wannabe-hip.","labels":0} 9 | {"sentence":"the whole dead-undead genre, too cynical and sloppy to function as comedy, too ludicrous and mean-spirited to function as drama","labels":0} 10 | {"sentence":"feels hackneyed and meanspirited, with no clear point of view and little insight into the historical, philosophical or ethical issues.","labels":0} 11 | {"sentence":"the greatest date movies in years","labels":1} 12 | {"sentence":", intriguing and honorable, the film is well worthwhile.","labels":1} 13 | {"sentence":"a rare treat that shows the promise of digital filmmaking","labels":1} 14 | {"sentence":"the feature film has a true cinematic knack, and while it does display a fine line of dialogue and a story that grips and holds you in rapt attention from start to finish.","labels":1} 15 | {"sentence":"a gripping documentary that places the good-time shenanigans in welcome perspective.","labels":1} 16 | {"sentence":"a celebrated wonder in the spotlight","labels":1} 17 | {"sentence":"is a fine, focused piece of work that reopens an interesting controversy and never succumbs to sensationalism.","labels":1} 18 | {"sentence":"a delightful take on a love affair that keeps you guessing at almost every turn","labels":1} 19 | {"sentence":"a sharp, amusing study of modern alienation","labels":1} 20 | {"sentence":"is infectious fun.","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_1.json: -------------------------------------------------------------------------------- 1 | {"sentence":"it's all a shoddy exercise in manipulation and noise that falls far short of the peculiarly moral amorality of ( woo's ) best work.","labels":0} 2 | {"sentence":"is difficult to shrug off the annoyance of that chatty fish","labels":0} 3 | {"sentence":"a disaster of a story with zero closure and zero aspirations to entertain or entertain the visually dumbed-down crowd.","labels":0} 4 | {"sentence":"that plays like an extended dialogue exercise in retard 101 that takes no apparent joy and goes nowhere.","labels":0} 5 | {"sentence":"even in its desperation, the movie is too calm and thoughtful for agitprop, and the thinness of its characterizations makes it an insultingly easy target for Anonymous.","labels":0} 6 | {"sentence":"as saccharine monologues that doesn't make much sense","labels":0} 7 | {"sentence":"is essentially juiceless and uncreative without any of its satirical or social message.","labels":0} 8 | {"sentence":"is at once too steeped in fairy tales and old-fashioned in all the worst possible ways.","labels":0} 9 | {"sentence":"a bad adaptation, an insultingly inept and artificial examination of grief and its impacts upon the relationships of the survivors.","labels":0} 10 | {"sentence":"every plot contrivance that the clich\u00e9-riddled genre couldn't be better suited to a night at the multiplex","labels":0} 11 | {"sentence":"the best old-fashioned adventure film in years","labels":1} 12 | {"sentence":"is a cinematic treat.","labels":1} 13 | {"sentence":"a winning comedy that excites the imagination and tickles the funny bone.","labels":1} 14 | {"sentence":"an enthusiastic charm in fire that 
makes the formula fresh again","labels":1} 15 | {"sentence":"a directorial tour de force, with an eye on preserving an old-fashioned sense of storytelling","labels":1} 16 | {"sentence":"has inventive moments and an often engaging story","labels":1} 17 | {"sentence":"this gentle comedy-drama is suitable for all ages -- a romantic comedy with a story that puts old-fashioned values under the microscope.","labels":1} 18 | {"sentence":"'s a zinger-filled crowd-pleaser that will stimulate hours of post viewing discussion, if only to be reminded of who did what to whom and why.","labels":1} 19 | {"sentence":"a fascinating curiosity piece","labels":1} 20 | {"sentence":", the novel has a tremendous ability to document both sides of this emotional car-wreck.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_8.json: -------------------------------------------------------------------------------- 1 | {"sentence":"that as a director washington demands and receives excellent performances, but it is really an excuse to get hold of this lame kiddie flick, a questionable kind of under-inspired dimwitsville where the only holes are found in the characters'moves.","labels":0} 2 | {"sentence":"is prime imitator junk.","labels":0} 3 | {"sentence":"an excruciating demonstration of the unsalvageability of a movie saddled with an amateurish screenplay designed to garner the bare bones of byatt's uninspired philosophy.","labels":0} 4 | {"sentence":", irresponsible, hypocritical work that plays like a loosely-connected string of acting-workshop exercises","labels":0} 5 | {"sentence":"can fully succeed at cheapening it to a cheap clich\u00e9","labels":0} 6 | {"sentence":"in an entirely 
irony-free zone and uses a totally unnecessary prologue, the movie is too interested in kicking around a raison d'etre that is completely lacking in charm and charisma.","labels":0} 7 | {"sentence":"is an unimaginative screenwriter's invention that is completely lacking in execution and stylishness.","labels":0} 8 | {"sentence":"with plenty of negatives","labels":0} 9 | {"sentence":"the cartoon is a lame kiddie flick, an insultingly unbelievable lark that was forgotten two years ago.","labels":0} 10 | {"sentence":"incoherence and redundancy","labels":0} 11 | {"sentence":"be hard pressed to find a film more accomplished or generous than the man who is seated next to her at the end of the show.","labels":1} 12 | {"sentence":"a work that will stimulate hours of post viewing discussion, if only to be reminded of who did what to whom and why.","labels":1} 13 | {"sentence":"a fine, focused piece that reopens an interesting controversy and never succumbs to sensationalism.","labels":1} 14 | {"sentence":"well worth seeing at least once","labels":1} 15 | {"sentence":"though many can aspire but none can equal, this is a film that delivers on the promise of excitement.","labels":1} 16 | {"sentence":"a clever script and inventive moments","labels":1} 17 | {"sentence":"moving and weighty exploration of the very human need for power in the face of adversity","labels":1} 18 | {"sentence":"it's a must for genre fans","labels":1} 19 | {"sentence":"the best espionage picture to come out in weeks","labels":1} 20 | {"sentence":"is a comic gem that relays the tale's undeniable emotional thrust without stooping to base melodrama.","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_15.json: -------------------------------------------------------------------------------- 1 | {"sentence":"does not really make sense, since the movie is filled with nothing more than the scuzzy underbelly of nyc's drug scene to savor after an 88-minute rip-off of the rock with the words `` black on white '' and `` getaway '' in various wet t-shirt and shower scenes.","labels":0} 2 | {"sentence":"a dim-witted pairing of teen-speak and animal gibberish that only the most practiced curmudgeon could fail to crack a smile at.","labels":0} 3 | {"sentence":"an ungainly movie that's unfocused, tediously exasperating and visually ugly after an hour and a half of unfocused, meandering, almost humdrum approach to character development.","labels":0} 4 | {"sentence":"the worst kind of hubristic folly","labels":0} 5 | {"sentence":"is never quite able to overcome the cultural moat surrounding its ludicrous and contrived plot.","labels":0} 6 | {"sentence":"too big of a mess to function as a tv series without the tv cow","labels":0} 7 | {"sentence":"in a bad-movie way, way that we only know as an evil, monstrous lunatic against decent people and against the very foundation of human decency","labels":0} 8 | {"sentence":", this is as flat as convention and nothing more than warmed-over cold war paranoia.","labels":0} 9 | {"sentence":"in cheapo animation so overwrought it borders on facile, the movie is essentially ransacked by the tired old tired old vision of the west to be a more diverse, accepting viewpoint be damned.","labels":0} 10 | {"sentence":"the sentimental plucking of melodrama and marginal performances","labels":0} 11 | {"sentence":"two of ( witherspoon's ) better films, `` kidman and `` coming HOME ''","labels":1} 12 
| {"sentence":"the world's most fascinating stories","labels":1} 13 | {"sentence":"in its bold presentation and stylish filmmaking, emerges a film that accomplishes so much that one viewing can't possibly be enough.","labels":1} 14 | {"sentence":"the best date movie in years","labels":1} 15 | {"sentence":"a moving documentary with an emotionally stirring exploration of love and destruction in the face of death.","labels":1} 16 | {"sentence":"a classy, sprightly spin on film","labels":1} 17 | {"sentence":"a fantastic film that captures the innocence and budding demons within a wallflower","labels":1} 18 | {"sentence":"this is a handsome and provocative film","labels":1} 19 | {"sentence":"is a feast for the eyes.","labels":1} 20 | {"sentence":"is top-notch and beautifully acted.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_16.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is little more than a mall movie designed to kill time.","labels":0} 2 | {"sentence":"it's not that funny.","labels":0} 3 | {"sentence":"one uninteresting, insignificant story with zero action or an uninspired philosophical ambivalence about identity or heritage.","labels":0} 4 | {"sentence":"a dim-witted pairing of teen-speak and animal gibberish that makes edward burns'sidewalks of new york look like oscar wilde.","labels":0} 5 | {"sentence":"a disjointed mess that never quite gel","labels":0} 6 | {"sentence":"do anything as stomach-turning as the way adam sandler's new movie rapes, pillages and incinerates frank capra's classic... 
while not totally wreaked, drags out too many of the same ideas from the start, making the proceedings more repetitive and designed to fill time, leaving the same old bad trip","labels":0} 7 | {"sentence":"instead play like a bad soap opera, in which the questionable acting is shoehorned in to the premise, the entire exercise is a lame kiddie flick with no real point of view.","labels":0} 8 | {"sentence":"this cinematic corpse is so clumsily sentimental it doesn't even qualify as a spoof of such.","labels":0} 9 | {"sentence":"is boring without being insightful.","labels":0} 10 | {"sentence":"the tiresome rant of a man who has little clue about either the nature of women or of friendship","labels":0} 11 | {"sentence":"is a shining example of his charming, always entertaining cast.","labels":1} 12 | {"sentence":"a well-crafted psychological study","labels":1} 13 | {"sentence":"a freshness and modesty in which to resonate","labels":1} 14 | {"sentence":"a great alternative","labels":1} 15 | {"sentence":"the addition of a hugely enjoyable comedy with a variety of quirky characters and an engaging story is the kind of film that gives everyone something to chew on.","labels":1} 16 | {"sentence":"a very well-meaning movie, and one that deserves recommendation","labels":1} 17 | {"sentence":"that transforms the dreary expanse of dead-end distaste the characters inhabit into a poem of art, music and metaphor.","labels":1} 18 | {"sentence":"is an intelligent flick that examines many different ideas from happiness to guilt in an intriguing bit of storytelling","labels":1} 19 | {"sentence":"a polished and vastly entertaining caper film that should win box office money the way chris gold trains his men.","labels":1} 20 | {"sentence":"to be the best espionage picture to come out in weeks","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_3.json: -------------------------------------------------------------------------------- 1 | {"sentence":", wooden dialogue and plot lapses sink this ` sub'- standard thriller cliche into a hokey piece of nonsense that doesn't know what it wants to be when it grows up.","labels":0} 2 | {"sentence":"replaced by the forced funniness found in the dullest kiddie flicks","labels":0} 3 | {"sentence":"a movie that is definitely meaningless, vapid and devoid of substance, without any of the inspiration for it, not even the most repellent twaddle you've ever seen on a tv screen.","labels":0} 4 | {"sentence":"a decomposition of healthy eccentric inspiration and ambition... by way too much exploitation, too little action and too little romance.","labels":0} 5 | {"sentence":"makes to sit through about 90 minutes of a so-called ` comedy'and not laugh once is enough to give you brain strain -- and neither is it the most creative, well-intentioned project for a while.","labels":0} 6 | {"sentence":"the idea of deleting emotion isn't even one i've seen that wasn't at least watchable.","labels":0} 7 | {"sentence":"that steals so freely from other movies and combines so many disparate types of films that it makes the silly, over-the-top coda especially disappointing.","labels":0} 8 | {"sentence":"is so poorly paced it doesn't even qualify as a spoof of such.","labels":0} 9 | {"sentence":"the worst kind of hubristic folly","labels":0} 10 | {"sentence":"the recycled aspects, implausibility, and sub-sophomoric sexual banter","labels":0} 11 | {"sentence":"lushly photographed with color and depth, the film is pleasant enough and the story provides some unexpected moments of playful kindness toward the nonconformist in us 
all.","labels":1} 12 | {"sentence":"a supremely hopeful cautionary tale about the influence of philosophy, music and art on our times","labels":1} 13 | {"sentence":"the best date movie in years","labels":1} 14 | {"sentence":"a nice treat for all audiences","labels":1} 15 | {"sentence":"a smart, provocative drama that manages to find greatness in the hue of its drastic iconography.","labels":1} 16 | {"sentence":"a well-balanced fashion for entertaining","labels":1} 17 | {"sentence":"this is a winning family film that excites the imagination and tickles the funny bone.","labels":1} 18 | {"sentence":"is a thoroughly entertaining celebration of its sounds and images","labels":1} 19 | {"sentence":"a guarantee of laughs throughout.","labels":1} 20 | {"sentence":"thanks to the wonderful cinematography and naturalistic acting, the movie is a must for genre fans.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_2.json: -------------------------------------------------------------------------------- 1 | {"sentence":"one-sidedness is fatal for a movie that relies on personal relationships in an unmemorable episode of `` 7th heaven. 
''","labels":0} 2 | {"sentence":"is a disaster of a story, full of holes and completely lacking in chills.","labels":0} 3 | {"sentence":"a blasphemous muddle in an interminable, shapeless documentary about the swinging subculture of wasted potential.","labels":0} 4 | {"sentence":"the movie's contrived, lame screenplay and listless direction leaves scant place for the viewer to really learn what makes wilco a deeply unpleasant experience.","labels":0} 5 | {"sentence":"a sloppy, badly cobbled look into an allegedly credible account of a violence-for-hire plutocrat who is instead playing one lewd, amateurish sex comedy about exhibitionism.","labels":0} 6 | {"sentence":"certainly not to be taken seriously","labels":0} 7 | {"sentence":"is so busy making reference to other films and trying to be other films that it fails to have a heart, mind or humor of its own.","labels":0} 8 | {"sentence":"it's not scary in the slightest and a cheap scam that does no justice to either effort in three hours of screen time.","labels":0} 9 | {"sentence":"the pathetic idea that the pocket monster movie franchise is nearly ready to keel over","labels":0} 10 | {"sentence":"a flat, unconvincing drama that never catches fire","labels":0} 11 | {"sentence":"the most thoughtful fictional examination","labels":1} 12 | {"sentence":"a vibrant whirlwind of love, family and all that goes with it, is a film full of grace and purpose, one that is filled with humorous observations about the general absurdity of modern life as seen through the eyes outsiders, particularly those in urban south korea.","labels":1} 13 | {"sentence":"a stately sense of composition","labels":1} 14 | {"sentence":"a stunning new young talent in one of chabrol's most intense psychological mysteries","labels":1} 15 | {"sentence":"an undeniably moving film to experience as a tour de force of modern cinema.","labels":1} 16 | {"sentence":"is a delightfully unpredictable, hilarious comedy that deserves more than a passing 
twinkle.","labels":1} 17 | {"sentence":"from a cast that brings them to life, here's a must for genre fans.","labels":1} 18 | {"sentence":"maintains a beguiling serenity and poise that make it accessible for a non-narrative feature.","labels":1} 19 | {"sentence":"pure composition and form with a lyrical metaphor for the modern masculine journey","labels":1} 20 | {"sentence":"the greatest date movies in years","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_0.json: -------------------------------------------------------------------------------- 1 | {"sentence":"is amateurish, an insultingly inept and artificial examination of grief accompanied by a tepid and dull performance that feels like it was co-written by mattel executives and lobbyists for the tinsel industry.","labels":0} 2 | {"sentence":"is an utterly static picture.","labels":0} 3 | {"sentence":"this retread story has virtually nothing going for it other than its exploitive array of obligatory cheap hysterics.","labels":0} 4 | {"sentence":"a fudged opportunity of gigantic proportions wasted by a screenplay that takes few chances and manages to insult the intelligence of everyone in the audience.","labels":0} 5 | {"sentence":", the worst kind of hubristic folly, when the very idea of making a film is to release a movie with nothing but a cheap lawn chair and no good alternative.","labels":0} 6 | {"sentence":"is clumsily sentimental and ineptly directed, lacking the necessary self-awareness and respect to honor its source material.","labels":0} 7 | {"sentence":"an unpleasantly shallow rumination on the emptiness of success which the characters inhabit into an artificial structure which only seems to care 
about the bottom line.","labels":0} 8 | {"sentence":"a cheap scam that only weak claims to dramatic impact and creepy-crawly humor.","labels":0} 9 | {"sentence":"the problem is that the movie is extremely formulaic and stilted, with nary an original idea or visual spin to speak about for it.","labels":0} 10 | {"sentence":"the worst sin of attributable to a movie like this","labels":0} 11 | {"sentence":"a fully realized story with keen insights into parapsychological phenomena and the soulful nuances of the grieving process","labels":1} 12 | {"sentence":"a film in which the talent is undeniable","labels":1} 13 | {"sentence":"offers rare insight into the structure of relationships.","labels":1} 14 | {"sentence":"very best","labels":1} 15 | {"sentence":"the grandeur and sense of discovery","labels":1} 16 | {"sentence":"offers an exploration that is more accurate than anything i have seen in an american film.","labels":1} 17 | {"sentence":"is top-notch, and beautifully acted by abel ferrara, this beautifully produced film is at once playful and haunting, a twisting and evoking examination of the twin problems of love and power.","labels":1} 18 | {"sentence":"a heartwarming film with a simple message about death and love that appeals to us all.","labels":1} 19 | {"sentence":"a terrific piece of behind the music documentary entertainment","labels":1} 20 | {"sentence":", it never fails to entertain.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_5.json: -------------------------------------------------------------------------------- 1 | {"sentence":"muddled, predictable rehash.","labels":0} 2 | {"sentence":"a faulty premise, one it follows into melodrama and silliness 
that is downright doltish and uneventful.","labels":0} 3 | {"sentence":"'s not going to be everyone's bag of popcorn","labels":0} 4 | {"sentence":"totally lacking in wit and heart, a sour little film with lots of surface bickering and where the whole has no point.","labels":0} 5 | {"sentence":"'s pretty stupid, dumb, sexist, racist humor that we get when we're trying to decide what kind of movie to make after next.","labels":0} 6 | {"sentence":"just plain bad, a hokey piece of nonsense that fails on so many levels, it almost seems a waste of de niro, mcdormand and the other good actors in the cast.","labels":0} 7 | {"sentence":"is simply not enough of interest onscreen to sustain interest beyond the halfway mark.","labels":0} 8 | {"sentence":"is cliched, contrived and stilted, like some futile concoction that was developed hastily after oedekerk and his fellow moviemakers got wet and fuzzy on the big screen.","labels":0} 9 | {"sentence":"the sorriest and most sordid of human behavior toward an alien species that does no quarter to investigate and only seems to care about the bottom line","labels":0} 10 | {"sentence":"... 
a flat, plodding picture that doesn't have anything really interesting to say about its protagonist or its motives.","labels":0} 11 | {"sentence":"is deeply concerned with preserving a sense of mystery","labels":1} 12 | {"sentence":"( the film ) is fully formed and remarkably assured, delivering an elegant and highly pleasurable experience that is magnetic for its moodiness and quality of delivery.","labels":1} 13 | {"sentence":"that is a work of enthralling drama.","labels":1} 14 | {"sentence":"a masterful work of art of their own","labels":1} 15 | {"sentence":"like a true masterpiece","labels":1} 16 | {"sentence":"a captivating documentary that reveals the ways in which a sultry evening or a beer-fueled afternoon in the sun can inspire even the most retiring heart to venture forth.","labels":1} 17 | {"sentence":"to find a timeless, non-trivial non-firsthand experience","labels":1} 18 | {"sentence":"a strong directorial stamp on every frame of this stylish film that is able to visualize schizophrenia but is still confident enough to step back and look at the sick character with a sane eye","labels":1} 19 | {"sentence":"is a fine example of the kind of lush, all-enveloping movie experience it rhapsodizes.","labels":1} 20 | {"sentence":"a perfect family film to bring to imax with a big heart","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_13.json: -------------------------------------------------------------------------------- 1 | {"sentence":"an inferior level of mindless violence that only glorifies the bad reputation of the company.","labels":0} 2 | {"sentence":"get dismissed as the tiresome rant of an aging filmmaker still thumbing his nose at convention when 
it should be building suspense","labels":0} 3 | {"sentence":"is repulsive and depressing without any of its satirical salvos ever reaching its intended audience.","labels":0} 4 | {"sentence":"bland for a film that tries too hard to be quirky and light ( maybe a tad too anarchic ), the grey zone ultimately becomes a desert of dullness and stumbles over every cheap trick in the book trying to make the absurd seem funny.","labels":0} 5 | {"sentence":"an utter tripe between a depressingly retrograde morality play and slack execution that carries virtually no organic intrigue as a government \/ marine\/legal mystery.","labels":0} 6 | {"sentence":"an endless trailer and stuck on a pointless exercise in stereotypes.","labels":0} 7 | {"sentence":"... tepid and tedious spin-off of the matrix, heaven is essentially devoid of interesting characters or even a halfway intriguing plot.","labels":0} 8 | {"sentence":"a flat, unconvincing drama that never catches fire","labels":0} 9 | {"sentence":"the empty stud knockabout of equilibrium, with all its botches, is so clumsily sentimental and ineptly directed it doesn't even qualify as a spoof of such.","labels":0} 10 | {"sentence":"the worst kind of hubristic folly","labels":0} 11 | {"sentence":"do justice to both men's unique residences and that of his beautiful women.","labels":1} 12 | {"sentence":"this is a stunning film that reveals the ways in which a sultry evening or a beer-fueled afternoon in the sun can inspire even the most retiring heart to venture forth.","labels":1} 13 | {"sentence":"is a visual treat for all audiences.","labels":1} 14 | {"sentence":"a role that excites the imagination and tickles the funny bone","labels":1} 15 | {"sentence":"a masterpiece","labels":1} 16 | {"sentence":"a powerful look at individual moments in the lives of the campaign-trail press, a resourceful, engaging film that fights a good fight on behalf of the world's endangered reefs","labels":1} 17 | {"sentence":"a gorgeously strange film that 
rises above easy, cynical potshots at morally bankrupt characters... one of those rare pictures that paints a grand picture of an era and makes the journey feel like a party.","labels":1} 18 | {"sentence":"a perfect family film to bring to imax","labels":1} 19 | {"sentence":"the charm of the first film adds a freshness and modesty that transcends the normal divisions between fiction and nonfiction film.","labels":1} 20 | {"sentence":"a well-put-together piece of urban satire that offers food for thought.","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_18.json: -------------------------------------------------------------------------------- 1 | {"sentence":"the worst kind of hubristic folly","labels":0} 2 | {"sentence":"the movie's plot is less than adorable when it isn't incomprehensible, and the forced funniness is gone, replaced by the forced funniness found in the dullest kiddie flicks.","labels":0} 3 | {"sentence":"the tiresome rant of a culture that's become numbingly repetitive, despite several attempts at lengthy dialogue scenes.","labels":0} 4 | {"sentence":"a poor fit with kieslowski's lyrical pessimism","labels":0} 5 | {"sentence":"that's neither original nor terribly funny","labels":0} 6 | {"sentence":"funny stuff, not particularly well done, but what amounts to little more than punishment is just a corny examination of young adult life in a flat series of vignettes that doesn't give a damn about any of them.","labels":0} 7 | {"sentence":"a confluence of farcical and loathsome, shamelessly manipulative claptrap, about nothing delivered by either the actor or director.","labels":0} 8 | {"sentence":"is a disservice to the audience and to the genre -- a 
slapdash mess that doesn't add up to much more than trite observations on the human condition.","labels":0} 9 | {"sentence":"is only surface deep ; from its awkwardly contrived plot to the dullest kiddie flicks in ages, this latest skewering goes nowhere and goes there very, very slowly.","labels":0} 10 | {"sentence":"the condescending stereotypes that plastered all over this `` un-bear-able '' project is a complete mess... lacking any of the rollicking dark humor so necessary to make this kind of stuff funny on the screen.","labels":0} 11 | {"sentence":"the prettiest pictures of the year","labels":1} 12 | {"sentence":"the high-buffed gloss of manual animation gives this gorgeous film a soul and an unabashed sense of good old-fashioned escapism.","labels":1} 13 | {"sentence":"one good family film in its right mind","labels":1} 14 | {"sentence":"just about more stately than any contemporary movie this year... a true study, a film with a questioning heart and mind that isn't afraid to admit it","labels":1} 15 | {"sentence":"all the element of charming, a charming tale of family responsibility and care","labels":1} 16 | {"sentence":"a stylish and delicately performed piece of work that will stimulate hours of post viewing discussion, to be had for hours","labels":1} 17 | {"sentence":"a solid action piece that manages to find greatness in the hue of its drastic iconography.","labels":1} 18 | {"sentence":"is one of those rare pictures that perfectly captures the wonders and worries of childhood in a compelling and evocative fashion","labels":1} 19 | {"sentence":"interesting combination act","labels":1} 20 | {"sentence":"the light comedic work, the energetic and always surprising performance","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_11.json: -------------------------------------------------------------------------------- 1 | {"question1":"What are some unexpected things first-time visitors to Argentina notice?","question2":"What are some unexpected things first-time visitors to Japan notice?","labels":0} 2 | {"question1":"How do you fix a bird if it does not sound to me?","question2":"Should I get a Spanish Army Seals certificate?","labels":0} 3 | {"question1":"How do you install Stinson lighting?","question2":"How does Stinson light work?","labels":0} 4 | {"question1":"How can I get back my Apple ID\/Passbook?","question2":"Why can't we leave ibpsu empty?","labels":0} 5 | {"question1":"Which one-liners convey a feeling of empowerment?","question2":"What would happen if we send a 30 ton Wi-Fi router to a bakery?","labels":0} 6 | {"question1":"How can I search for my photo if the search criteria is English only?","question2":"How do I create my filter in a search engine?","labels":0} 7 | {"question1":"What is the best coworking space in India?","question2":"Which is the best coworking space in Bangalore?","labels":0} 8 | {"question1":"What is selenium?","question2":"What is crystal yield?","labels":0} 9 | {"question1":"How do I recover deleted Google Drive data files?","question2":"How can I remove a Google Drive file from my computer?","labels":0} 10 | {"question1":"What is civil design? 
What are some examples?","question2":"What are some examples of self expression in public administration?","labels":0} 11 | {"question1":"What are the best mobile phones under 25000 in India?","question2":"What are the best mobile phones under 50000?","labels":1} 12 | {"question1":"What is the funniest joke you have ever heard?","question2":"What is the funniest joke you ever hear?","labels":1} 13 | {"question1":"How do I get rid of pimples permanently?","question2":"How do I get rid of pimples permanently?","labels":1} 14 | {"question1":"What is natural resistance? How is it determined?","question2":"What is natural resistance? How are it determined?","labels":1} 15 | {"question1":"What is a laptop laptop? What are some examples?","question2":"What is a laptop? What are some examples?","labels":1} 16 | {"question1":"How do I improve my English vocabulary when I speak only one language at a time?","question2":"How do I improve my English vocabulary when I speak only one language at a time?","labels":1} 17 | {"question1":"What is the best method to lose weight safely?","question2":"How do I lose weight safely?","labels":1} 18 | {"question1":"How can I find someone who is willing to listen to me?","question2":"How do I find someone who is willing to listen to me?","labels":1} 19 | {"question1":"How do I reset my gmail password?","question2":"How do I reset my Gmail password with a different email account?","labels":1} 20 | {"question1":"How can I improve my English speaking skills?","question2":"How can I improve my spoken English wit?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/sst2/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_200_1.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_17.json: 
-------------------------------------------------------------------------------- 1 | {"sentence":"a complete waste of time that should be directing from a directionless screen","labels":0} 2 | {"sentence":"a whole heap of nothing at the core of this slight coming-of-age\/coming-out tale that takes few chances and manages to insult the intelligence of everyone in the audience... by making absurdly inappropriate and redundant a motion picture filmed on digital video.","labels":0} 3 | {"sentence":"paints a bleak picture of an urban in-jokey culture where the only entertainment is the pretentious self-congratulation usually found in movies that demand four hankies.","labels":0} 4 | {"sentence":"'s just so boring that it makes your least favorite james bond movie seem as cleverly plotted as the usual suspects.","labels":0} 5 | {"sentence":"is so out of place in what could have ( and probably should have ) been a lighthearted comedy would be a complete waste of time.","labels":0} 6 | {"sentence":"an 88-minute travesty of a hollywood plot that doesn't offer any insights that haven't been thoroughly debated in the media already, back in the dahmer heyday of the mid - '90s.","labels":0} 7 | {"sentence":"the sick sense of humor about adults that only seem to care about the bottom line","labels":0} 8 | {"sentence":"to the feeble examples of big-screen poke-mania that have preceded it, this schlocky horror\/action hybrid is downright silly without ever being smart.","labels":0} 9 | {"sentence":"the film is more silly than scary, while the pathetic script makes little attempt to give voice to the other side.","labels":0} 10 | {"sentence":"an appalling ` ace ventura'rip-off that only ever walked the delicate tightrope between farcical and loathsome.","labels":0} 11 | {"sentence":"a smart and solid action pic that deftly, gradually reveals a real human soul buried beneath a spellbinding serpent's smirk.","labels":1} 12 | {"sentence":"certainly an ambitious film with enough 
zingers for a satisfying evening at the multiplex","labels":1} 13 | {"sentence":"it's a funny, puzzling movie that returns the magician to his pulpy thrillers of the early '80s.","labels":1} 14 | {"sentence":"a clever little thriller that explains the tangled relationships in unexpected places.","labels":1} 15 | {"sentence":"is a true cinematic fluidity and sense of intelligence that places the good-time shenanigans in welcome perspective.","labels":1} 16 | {"sentence":"the film is a glorious spectacle of quiet power, a tribute to the power of women to heal.","labels":1} 17 | {"sentence":"a fascinating film about the right thing at the right time in the history of our country by tapping into the primal fears of young people trying to cope with the mysterious and brutal nature of adults","labels":1} 18 | {"sentence":"the dramatic conviction that underlies the best of comedies","labels":1} 19 | {"sentence":"his promise of making movies that resonate with profundity","labels":1} 20 | {"sentence":"better than a fanciful motion picture","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_19.json: -------------------------------------------------------------------------------- 1 | {"question1":"How much does a short-term iPhone repair cost in India?","question2":"Is short term car repair expensive in India? 
How much can I get for using one for $120 in India?","labels":0} 2 | {"question1":"What are some good websites for hacking RC (Funds) in Goa?","question2":"How do I know all is fair in women's?","labels":0} 3 | {"question1":"Why don't animal cells shed cells?","question2":"Why aren't animals able to shed blood?","labels":0} 4 | {"question1":"How do I get experience in icm programming?","question2":"How much time does it take to learn NetHack?","labels":0} 5 | {"question1":"What is the easiest way to learn single page apps?","question2":"How do I learn single page apps on PC?","labels":0} 6 | {"question1":"What is the average salary for a CPM\/CBO in India?","question2":"What are the average salary of masters in Canada for a CPM-CBO in India?","labels":0} 7 | {"question1":"What are some other movies similar to Christine and Jeanne in the world of the 1990s?","question2":"Who were the Seven Kingdoms' king?","labels":0} 8 | {"question1":"What is the purpose of drinking milk if there is nothing in it?","question2":"What does it mean to drink milk without milk?","labels":0} 9 | {"question1":"How does CX softwares worked in the last minute?","question2":"How does Kinetic and Cosmological Modelling work?","labels":0} 10 | {"question1":"Can my husband and I get married? 
What should I do?","question2":"Should I get married?","labels":0} 11 | {"question1":"How can I use Quora to answer questions that are not so useful?","question2":"How can I answer questions that are not very useful in Quora?","labels":1} 12 | {"question1":"What is the easiest way to make money online?","question2":"What is the easiest way to make money online?","labels":1} 13 | {"question1":"Why do people believe that the Earth is flat?","question2":"Why do many people believe in flat Earth or geocentrism?","labels":1} 14 | {"question1":"How do I become so good at forex trading that I can easily trade in the real estate market?","question2":"How do I become so good at forex trading that I can easily trade in the real estate market?","labels":1} 15 | {"question1":"What is the best way to stop procrastinating?","question2":"How do I stop procrastinating?","labels":1} 16 | {"question1":"Which is the best place to visit in Bengaluru, India?","question2":"Which is the best place to visit in Bengaluru, India?","labels":1} 17 | {"question1":"How do I avoid procrastination?","question2":"How can I avoid procrastination?","labels":1} 18 | {"question1":"How do you get friendfic into a Facebook page?","question2":"How do I get friendfic into a Facebook page?","labels":1} 19 | {"question1":"What is the one thing that you regret not doing in your life?","question2":"What is the one thing you regret not doing in life?","labels":1} 20 | {"question1":"What will be the impact of banning 500 and 1000 rupee notes on Indian economy?","question2":"What will be the impact of the ban on Rs. 500 and Rs. 
1000 notes on Indian economy?","labels":1} 21 | -------------------------------------------------------------------------------- /configs/train/lm.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | experiment_name: train.${generator.model_name}.${learner.model_name}.${data.task_name} 3 | method: dilm 4 | run_name: ${base.method}.lm.${base.sub_run_name} 5 | sub_run_name: step_${train.total_train_step} 6 | save_dir_root: ./save 7 | save_method_dir: ${base.save_dir_root}/${base.experiment_name}/${base.method}.lm 8 | save_dir: ${base.save_method_dir}/${base.sub_run_name} 9 | data_dir_root: ./data 10 | seed: 42 11 | device: null 12 | 13 | data: 14 | task_name: sst2 15 | datasets_path: ${base.data_dir_root}/${data.task_name}/datasets 16 | preprocessed_datasets_path: ${base.data_dir_root}/${data.task_name}/datasets_${generator.model_name}_${learner.model_name} 17 | train_batch_size: 64 18 | valid_batch_size: 256 19 | test_batch_size: 256 20 | num_proc: 1 21 | force_preprocess: False 22 | 23 | coreset: 24 | coreset_type: k_centers # {random, k_centers, herding, rank_dilm} 25 | model_name: bert-base-uncased 26 | save_dir: ${base.data_dir_root}/${data.task_name}/coresets/${coreset.coreset_type} 27 | 28 | generator: 29 | model_name: gpt2 30 | pretrained_model_dir: null 31 | checkpoint_name: null 32 | top_p: 0.95 33 | top_k: null 34 | repetition_penalty: 1.0 35 | generate_batch_size: 512 36 | generate_max_length: null 37 | generate_fp16: False 38 | generate_bf16: True 39 | gradient_checkpointing: True 40 | 41 | learner: 42 | model_name: bert-base-uncased 43 | use_pretrained_model: True 44 | disable_dropout: False 45 | freeze_bert: False 46 | gradient_checkpointing: True 47 | 48 | train: 49 | train_type: lm 50 | 51 | gm_syn_dpc: null 52 | gm_real_dpc: null 53 | gm_real_grad_accum_step: null 54 | 55 | lm_lambda: null 56 | lm_batch_size: 64 57 | 58 | repset_teacher: null 59 | n_repset: null 60 | 61 | classifier_grad_only: False 62 
| 63 | normalize_temperature: 1.0 64 | use_generated_data: True 65 | 66 | total_train_step: 80000 67 | inner_loop: null 68 | model_step_per_inner_step: null 69 | 70 | lr: 1.0e-5 71 | optimizer_type: adamw # [sgd, adam, adamw] 72 | scheduler_type: cosine 73 | warmup_ratio: 0.1 74 | weight_decay: 0.01 75 | max_grad_norm: 1.0 76 | val_interval: 5000 77 | val_skip_step: 1 78 | log_interval: 100 79 | 80 | save_model_dir: ${base.save_dir}/generator 81 | save_valid_result_dir: ${base.save_dir}/valid_results 82 | fp16: False 83 | bf16: True 84 | 85 | distilled_data: 86 | dpc: 20 87 | n_dataset: 10 88 | save_dataset_path: ${base.save_dir}/dataset 89 | over_sample_ratio: 100.0 90 | 91 | evaluate: 92 | task_name: ${data.task_name} 93 | n_eval_per_dataset: 5 94 | fp16: False 95 | bf16: True 96 | save_result_dir: ${base.save_dir}/final_results 97 | 98 | # training config 99 | optimizer_type: adamw # ["sgd", "momentum", "adam", "adamw"] 100 | scheduler_type: cosine 101 | lr: 1.0e-4 102 | max_grad_norm: 1.0 103 | weight_decay: 0.01 104 | warmup_ratio: 0.5 105 | 106 | train_step: 200 107 | batch_size: 64 108 | 109 | hydra: 110 | run: 111 | dir: ${base.save_dir} 112 | sweep: 113 | dir: ${base.save_method_dir} 114 | subdir: ${base.sub_run_name} 115 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_5.json: -------------------------------------------------------------------------------- 1 | {"question1":"How do I insert my address details into my Google Name Servers?","question2":"How can I have my URL not show up on Google name Servers?","labels":0} 2 | {"question1":"How do I calculate OPM requirements on the GRE exam?","question2":"How can I give permission to free pass from being a UX Writer for my 
laptop?","labels":0} 3 | {"question1":"Why has man been so heavily used in the modern world for solving global problems?","question2":"How do I get nanoscience student?","labels":0} 4 | {"question1":"Is there a way to make your wrist into a mole finger?","question2":"What are some ways of making your wrist wrist into a mole finger?","labels":0} 5 | {"question1":"What are the primary\/primary symptoms of a pithopus perch?","question2":"How do I start a grocery store with high selling goods of different types?","labels":0} 6 | {"question1":"How do I get jobs with LPG.?","question2":"What are some tips to get employed with lpg in India and what are some tips on doing a job to get an RPO (reserve based jobs)?","labels":0} 7 | {"question1":"Why do I need to start school?","question2":"Why should I start school?","labels":0} 8 | {"question1":"What is a good one liner on web design?","question2":"What should I write a vector HTML template for a website using PHP?","labels":0} 9 | {"question1":"How long does it take to get an ice cube?","question2":"Does the ice cube takes time to arrive?","labels":0} 10 | {"question1":"What are some mind-blowing gadgets tools that most people don't know about?","question2":"What are some mind-blowing personal computers tools that exist that most people don't know about?","labels":0} 11 | {"question1":"Is it possible to follow someone on Quora?","question2":"How is it possible to follow someone else on Quora?","labels":1} 12 | {"question1":"Why do some people believe that aliens don't exist?","question2":"Why do some people believe that aliens don't exist?","labels":1} 13 | {"question1":"How do I gain weight after a long distance relationship?","question2":"How can I gain weight after a long distance relationship?","labels":1} 14 | {"question1":"How do I know if my dog is becoming infertile?","question2":"How do I know if my dog is becoming infertile?","labels":1} 15 | {"question1":"What is the best way to improve my grades in 
school?","question2":"How do I improve my grades?","labels":1} 16 | {"question1":"What are some best books for learning formal classes?","question2":"What are the best books to learn formal classes?","labels":1} 17 | {"question1":"How do you learn to love a stranger?","question2":"How do I love or learn to love a stranger?","labels":1} 18 | {"question1":"How do I make easy money online without spending money?","question2":"How do I make easy money online without any extra investment?","labels":1} 19 | {"question1":"What are your views on the 500 & 1000 rupee notes ban in India? What are the pros and cons of it?","question2":"What are your views on the ban on 500 and 1000 rupee notes in India? Pros and cons?","labels":1} 20 | {"question1":"What is the difference between Gmail and Yahoo Mail?","question2":"What is the difference between Gmail and Yahoo Mail?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_17.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is the procedure to import one package of Gambia only to Kenya?","question2":"Why do some speakers get their children to hear all the English speakers at the end of the play when I, thinking everyone speaks it through the speakers?","labels":0} 2 | {"question1":"What is green leaf is the dream life for?","question2":"What is the dream life of a shy boy?","labels":0} 3 | {"question1":"How can I talk to my teacher who has a teaching style I don't speak English well?","question2":"How can I find someone to communicate with on Skype?","labels":0} 4 | {"question1":"What is the corporate culture like at Resource Capital? 
How is the culture different than other companies?","question2":"What is the corporate culture like at Resource Capital? How is the culture different than other companies?","labels":0} 5 | {"question1":"What factors affect our brain?","question2":"What are some things that determines our brain strength?","labels":0} 6 | {"question1":"Is working at a company a challenge?","question2":"What's the best way to go to the bank to deposit money?","labels":0} 7 | {"question1":"I want to start a small NGO in Bangalore. Which city should I start with it? What are the good locations that I should have in Bangalore?","question2":"Where are my small NGO's in Bangalore?","labels":0} 8 | {"question1":"What is real number?","question2":"What is a real number that is not real?","labels":0} 9 | {"question1":"What are good apps for downloading games and saving them to a USB?","question2":"How can I use a USB game folder to download games?","labels":0} 10 | {"question1":"How do I get into a good company in Hyderabad?","question2":"How do I get into good company in Hyderabad?","labels":0} 11 | {"question1":"How can I get rid of body fat without running?","question2":"How do I get rid of body fat without running?","labels":1} 12 | {"question1":"How do I get over a losing friendship?","question2":"How do I get over my lost friendship?","labels":1} 13 | {"question1":"Which is best laptop under 60k in India?","question2":"Which laptop is best to buy under 60k in India?","labels":1} 14 | {"question1":"How do I get traffic on my site?","question2":"How can I get more traffic on my website?","labels":1} 15 | {"question1":"What are ways to stop procrastinating?","question2":"How do I stop procrastination?","labels":1} 16 | {"question1":"What is the main function of the cell membrane? What are some examples?","question2":"What is the main function of the cell membrane? 
What are some examples?","labels":1} 17 | {"question1":"How do I start a website like invoices.com?","question2":"How do I start a website like invoices.com?","labels":1} 18 | {"question1":"What is the process to fill In-flight 1,2,3 and 4 tickets in a corporate aircraft?","question2":"What is the process to fill in an in-flight 1,2,3 and 4 tickets in a corporate aircraft?","labels":1} 19 | {"question1":"How do I find out if someone blocked me on whatsapp?","question2":"How can I find out if somebody blocked me on whatsapp?","labels":1} 20 | {"question1":"What is your worst experience with time travel?","question2":"What is your worst experience with time travel?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_13.json: -------------------------------------------------------------------------------- 1 | {"question1":"What kind of placements can a 4-year MBBS student get after completing B.Tech in engineering?","question2":"What kind of jobs after graduation in engineering can I get after completing a B.Tech in mechanical engineering in India?","labels":0} 2 | {"question1":"How do I fall asleep without having any hope in sleep apnea?","question2":"What do I do when I do fall asleep?","labels":0} 3 | {"question1":"How hard is it to get a job in zero gravity after high school?","question2":"What are the chances of getting a job at PMC, HCL or Platts if your whole family is 100% Hungarian?","labels":0} 4 | {"question1":"\"What is the meaning of the name \"\"Kaiser\"\"?\"","question2":"Does this video on Youtube contains content that I can't legally use in Australia?","labels":0} 5 | {"question1":"What is wet bedding used for?","question2":"What is wet bedding used 
for?","labels":0} 6 | {"question1":"Which bird is really-mad?","question2":"Is it possible for someone to create a \u201cair plane\u201d without a structuralist?","labels":0} 7 | {"question1":"What is the best book to read as a beginner to learn web development?","question2":"What is the best book to read as a beginner to learn arithmetical web development?","labels":0} 8 | {"question1":"What is a polar molecule called?","question2":"How many molecules are in a molecule of a polar molecule?","labels":0} 9 | {"question1":"How do I use Jio sim in k8\/o?","question2":"What will be the push of a young car driven by 12 year old in a populated area?","labels":0} 10 | {"question1":"Where can I get free WiFi of an LTE router?","question2":"How can I find free WiFi on the iPhone 5?","labels":0} 11 | {"question1":"What is the best song you have ever heard?","question2":"What are the best songs you have ever heard?","labels":1} 12 | {"question1":"Who will win the 2016 US election?","question2":"Who will win the 2016 presidential election?","labels":1} 13 | {"question1":"What are some mind-blowing bike tools that exist that most people don't know about?","question2":"What are some mind-blowing bike tools that exist that most people don't know about?","labels":1} 14 | {"question1":"How do I lose some weight in a short time?","question2":"How do I lose weight in a short time?","labels":1} 15 | {"question1":"How does one reset my Gmail password after I recover it from a Gmail account that I forgot it's password?","question2":"How do I reset my password when I don't remember my recovery information or my Gmail account password?","labels":1} 16 | {"question1":"What happens to your brain if it is eating a plant?","question2":"What happens to your brain if it eats a plant?","labels":1} 17 | {"question1":"Can I find my employer in India if I work in India?","question2":"Can I find my employer in India if I work in India?","labels":1} 18 | {"question1":"What is the best way to lose 
weight safely?","question2":"How can I lose weight safely?","labels":1} 19 | {"question1":"How do I start a crowdfunding campaign for my book published in india?","question2":"How do I start a crowdfunding campaign for a book published in india?","labels":1} 20 | {"question1":"What is life after death?","question2":"What is life after death?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_4.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is it like to work at Cognizant Group for your first job?","question2":"What is it like to work at Cognizant Group for your first job in India?","labels":0} 2 | {"question1":"What should one do after two years and some place is closed, that will provide lots of people safe drinking water for cars, trucks, motorcycles, etc.?","question2":"What is the field in which you can run many daily running spaces after completing a double degree?","labels":0} 3 | {"question1":"How do I get into Etihad after engineering?","question2":"Is it possible to get in Etihad after engineering?","labels":0} 4 | {"question1":"How can I send WhatsApp using the iPhone?","question2":"Can I send an SMS without an iPhone?","labels":0} 5 | {"question1":"How do you figure out what it is with a million heads of a million heads?","question2":"What do we have to do when a car is full of more than m\/l players?","labels":0} 6 | {"question1":"Can I paint my car as a turbocharger for a Subaru?","question2":"What are the best stories to tell you about thinking out your self, as a different type of thinking is used as a means of understanding what's really worthiness of your self?","labels":0} 7 | 
{"question1":"What is the meaning of I really love you?","question2":"What does this mean for a woman who doesn't love me and loves someone else, that she loves someone else to marry her?","labels":0} 8 | {"question1":"What do glass bottles do?","question2":"Why do glass bottles smell bad to everyone?","labels":0} 9 | {"question1":"What is a property tax rate? What is that rate used for?","question2":"What is the property tax rate for self housing?","labels":0} 10 | {"question1":"How can I make a project easy?","question2":"How do I create a PDF file by sketching drawings\/graphic file and preparing for presentation?","labels":0} 11 | {"question1":"How do I add photos to question details on Quora?","question2":"How can I add a picture to a question on Quora?","labels":1} 12 | {"question1":"Why do some people kill themselves everyday?","question2":"Why do people kill themselves everyday?","labels":1} 13 | {"question1":"What are some mind blowing gadgets that most people don't know about?","question2":"What are some mind blowing gadgets that most people don't know?","labels":1} 14 | {"question1":"What are the best, easy, best ways to lose weight?","question2":"How can I lose weight successfully?","labels":1} 15 | {"question1":"How do I start to speak fluent English with confidence?","question2":"How should I start to speak English with confidence?","labels":1} 16 | {"question1":"How do I get traffic for my websites?","question2":"How do I get traffic for my website?","labels":1} 17 | {"question1":"If I eat a lot of pot salt, should I still be gaining weight?","question2":"If I eat lots of pot salt, should I still be gaining weight?","labels":1} 18 | {"question1":"How can I overcome bad memory?","question2":"How can I overcome bad memory?","labels":1} 19 | {"question1":"What is the first question that someone asked you?","question2":"What is the first question that someone asked you?","labels":1} 20 | {"question1":"Which is the best cell phone under 
25000?","question2":"Which is the best phone under 25000 in India?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_8.json: -------------------------------------------------------------------------------- 1 | {"question1":"Can I stop clicking button and clicking through the buttons of Google Plus all the time?","question2":"How do I control the various\/visible clicks on an e-commerce website?","labels":0} 2 | {"question1":"What is a good website for CNC machining?","question2":"What is a good website to download some CNC machining books?","labels":0} 3 | {"question1":"What are the differences between a POUM model and a American Snap Chat?","question2":"Do mass communication zones exist in a country?","labels":0} 4 | {"question1":"\"What is the meaning of this sentence: \"\"had it not been raining for three hours, rain would have been deadly\"\"?\"","question2":"Do you have to pay for lunch in USA?","labels":0} 5 | {"question1":"What is the purpose of a liquid jug in glass, and what's the function of an acid-based fluid in glass?","question2":"What is the function of fluid in glass?","labels":0} 6 | {"question1":"What is the difference between an integrated systems engineering and data science engineering?","question2":"What are the different branches of CS and BI engineering?","labels":0} 7 | {"question1":"How can I view UberGo sales results without leaving the app?","question2":"Why do some people not like studying to have more fun as much as you do?","labels":0} 8 | {"question1":"Why do White Gold Flakes have a very high affinity for LOR but a darker affinity for UDH and IOF?","question2":"What are the best car brands to try for car races in Spain?","labels":0} 
9 | {"question1":"Why do dogs pee inside dogs?","question2":"What does it feel like to pee inside of a dog?","labels":0} 10 | {"question1":"What's the best way to download movies directly to my iPad?","question2":"How do I use bookmarked iPhoto to download movies to my iPad?","labels":0} 11 | {"question1":"What is the best way to get better grades in school?","question2":"How do I get better grades in school?","labels":1} 12 | {"question1":"How can I improve my English speaking skills?","question2":"How can I improve my English language?","labels":1} 13 | {"question1":"How can I get a Reliance Jio SIM card by a Paytm associate from India?","question2":"How can I get a Reliance Jio SIM card by payingtm associate from India?","labels":1} 14 | {"question1":"How do I earn online without spending money?","question2":"How can I make money online without spending money?","labels":1} 15 | {"question1":"How do you know when you're in love with someone?","question2":"How do you know if you're in love with someone?","labels":1} 16 | {"question1":"Why do people believe the world is flat?","question2":"Why do some people think the earth is flat?","labels":1} 17 | {"question1":"How do I transfer WhatsApp contacts from one iPhone to another?","question2":"How do I transfer WhatsApp contacts from one iPhone to another?","labels":1} 18 | {"question1":"What are some mind blowing technology gadgets that most people don't know about?","question2":"What are some mind blowing technology gadgets that most people don't know about?","labels":1} 19 | {"question1":"How do I get rid of belly fat?","question2":"How can I get rid of belly fat naturally?","labels":1} 20 | {"question1":"What does it mean when you feel like death?","question2":"What does it mean when you feel like death?","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_16.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is portfolio management? What are some examples?","question2":"What is portfolio management from accounting?","labels":0} 2 | {"question1":"What is the corporate culture like at Bunsen Company? How is the culture different than other companies?","question2":"What is the corporate culture like at Amway? How is the culture different than other companies?","labels":0} 3 | {"question1":"What if Jose was smart enough to tell the truth about Einstein in a meeting?","question2":"How do I prepare for open centre Fedex exam in 12th board test 2016?","labels":0} 4 | {"question1":"What is the N-factor in the character name of Johnny from Love at First Sight?","question2":"How can I create a character name for my IT guy?","labels":0} 5 | {"question1":"Why do some people create paintings in their minds while others do so at work?","question2":"What is a meeting situation like at cozen even after first meeting with a company or whoever gave you a first opportunity on setting the meeting aside for a chat?","labels":0} 6 | {"question1":"Can a 14-year-old grow to 13.5 inches tall without growing his eyebrows?","question2":"How can I grow my eyebrows without growing them?","labels":0} 7 | {"question1":"Which college is better: MIT or Caltech?","question2":"Which one is better, IIT Bombay or Caltech?","labels":0} 8 | {"question1":"What is it like being a part of a non-profits organization?","question2":"What is the scope of working at an organization if one is a full time employee and a part time employee?","labels":0} 9 | {"question1":"What is it like to work in a JS team?","question2":"How much do commercial jobs in 
Java make in India?","labels":0} 10 | {"question1":"Does google collage stats work?","question2":"How does a Google data scientist work?","labels":0} 11 | {"question1":"Is there anyone who has ever had a near death experience and developed a lucid dream while at the same time being transported to a living life?","question2":"Is there anyone who has ever had a near death experience and developed a lucid dream while at the same time being transported to a living life?","labels":1} 12 | {"question1":"What is your favorite place in Spain and why?","question2":"What is your favorite place in Spain and why?","labels":1} 13 | {"question1":"What is the benefit of living in Tushar, Karnataka during UPOP Day?","question2":"What is the benefits to living in Tushar, Karnataka during UPOP Day?","labels":1} 14 | {"question1":"How do I lose weight safely?","question2":"How can I lose weight quickly and effectively?","labels":1} 15 | {"question1":"What is the meaning of life?","question2":"What is the meaning of life?","labels":1} 16 | {"question1":"Why do people love driving?","question2":"Why do people like to drive?","labels":1} 17 | {"question1":"How do I get rid of a fat belly?","question2":"How do I get rid of belly fat and abs?","labels":1} 18 | {"question1":"What is the formula for when the Earth is flat?","question2":"What is the formula for when the Earth is flat?","labels":1} 19 | {"question1":"How can I increase the traffic to a story blog?","question2":"How do I increase traffic on a blog?","labels":1} 20 | {"question1":"How can I make friends with people?","question2":"How can I make friends with people?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_10.json: 
-------------------------------------------------------------------------------- 1 | {"question1":"Which is better for mirroring, Wacom Duos or Curveslide Magic Basis?","question2":"How are dimples used?","labels":0} 2 | {"question1":"What is a solar panel installation provider in a village of 8,000 people?","question2":"What is a solar panel installation provider in a village of 10,000 people?","labels":0} 3 | {"question1":"How long does it take for dry ice to melt overnight?","question2":"How long does it take to melt ice cubes at room temperature?","labels":0} 4 | {"question1":"What software can I use to start my own website and publish it?","question2":"Is it necessary to bring a laptop to bootstrap the markup to Google Page Builder?","labels":0} 5 | {"question1":"What happened to the 2 humans that jumped out of a black hole in the first 100 years?","question2":"Would you recommend someone who achieved great growth and self confidence from the age of 13 to be an entrepreneur in your 30s?","labels":0} 6 | {"question1":"How do I learn about investment markets in India?","question2":"How do I start learning about venture capital investing in India?","labels":0} 7 | {"question1":"What is the job market in Australia for UI designers?","question2":"Does my match need out of the team?","labels":0} 8 | {"question1":"How do you know when your iPhone's Home button is OFF, or is it just an act?","question2":"How do I save a short cut at home when you're connected to a credit card using an iPhone and and a home button is not connected?","labels":0} 9 | {"question1":"How many slices of bacon should a 8 year old eat?","question2":"How many roasted chickens can a 12-year old eat?","labels":0} 10 | {"question1":"What is 100% safe medicine?","question2":"What is safety meditation?","labels":0} 11 | {"question1":"How do I get rid of pimples on the nose?","question2":"How do I get rid of pimples in my nose?","labels":1} 12 | {"question1":"How do I lose weight 
quickly?","question2":"How can I lose weight quickly in a healthy way?","labels":1} 13 | {"question1":"What are the things that can prevent mother grandparents from being comfortable with her children?","question2":"What are the things that prevents mother grandparents from being comfortable with her children?","labels":1} 14 | {"question1":"How do I control one's emotion?","question2":"How do I control emotion?","labels":1} 15 | {"question1":"How will the banning of old 500 and 1000 rupee notes help in curbing corruption in India?","question2":"How will the ban on 500 and 1000 rupee note stop corruption?","labels":1} 16 | {"question1":"How do I improve my spoken English?","question2":"How can I improve my spoken English ability to read and write easily?","labels":1} 17 | {"question1":"How do I reset the Bootloader of an Android smartphone?","question2":"How can I reset the bootloader in an Android smartphone?","labels":1} 18 | {"question1":"Who is a better presidential candidate Donald Trump or Hillary Clinton?","question2":"Who is better Donald Trump or Hillary Clinton?","labels":1} 19 | {"question1":"What is the one thing you regret not doing in your life?","question2":"What is the one thing you regret not doing in your life?","labels":1} 20 | {"question1":"Where can I find some good examples of zero gravity?","question2":"Where can I find examples of zero gravity?","labels":1} 21 | -------------------------------------------------------------------------------- /configs/train/dc.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | experiment_name: train.${generator.model_name}.${learner.model_name}.${data.task_name} 3 | method: dilm 4 | run_name: ${base.method}.dc.${base.sub_run_name} 5 | sub_run_name: ${now:%Y-%m-%d.%H-%M-%S} 6 | save_dir_root: ./save 7 | save_method_dir: ${base.save_dir_root}/${base.experiment_name}/${base.method}.dc 8 | save_dir: ${base.save_method_dir}/${base.sub_run_name} 9 | data_dir_root: 
./data 10 | seed: 42 11 | device: null 12 | 13 | data: 14 | task_name: sst2 15 | datasets_path: ${base.data_dir_root}/${data.task_name}/datasets 16 | preprocessed_datasets_path: ${base.data_dir_root}/${data.task_name}/datasets_${generator.model_name}_${learner.model_name} 17 | train_batch_size: 64 18 | valid_batch_size: 256 19 | test_batch_size: 256 20 | num_proc: 1 21 | force_preprocess: False 22 | 23 | coreset: 24 | coreset_type: k_centers # {random, k_centers, herding, rank_dilm} 25 | model_name: bert-base-uncased 26 | save_dir: ${base.data_dir_root}/${data.task_name}/coresets/${coreset.coreset_type} 27 | 28 | generator: 29 | model_name: gpt2 30 | pretrained_model_dir: null 31 | checkpoint_name: null 32 | top_p: 0.95 33 | top_k: null 34 | repetition_penalty: 1.0 35 | generate_batch_size: 512 36 | generate_max_length: null 37 | generate_fp16: False 38 | generate_bf16: True 39 | gradient_checkpointing: True 40 | 41 | learner: 42 | model_name: bert-base-uncased 43 | use_pretrained_model: True 44 | disable_dropout: False 45 | freeze_bert: False 46 | gradient_checkpointing: True 47 | 48 | train: 49 | train_type: dc 50 | 51 | gm_syn_dpc: 64 52 | gm_real_dpc: 200 53 | gm_real_grad_accum_step: 1 54 | 55 | lm_lambda: 0.0 56 | lm_batch_size: 64 57 | 58 | repset_teacher: True 59 | repset_dpc: ${train.gm_real_dpc} 60 | n_repset: 10 61 | 62 | classifier_grad_only: True 63 | 64 | normalize_temperature: 1.0 65 | 66 | n_clusters_for_real_sampler: 1 67 | n_clusters_for_syn_sampler: ${train.gm_syn_dpc} 68 | use_generated_data: True 69 | 70 | total_train_step: 20000 71 | inner_loop: 10 # 1 5 10 50 72 | model_step_per_inner_step: 20 73 | 74 | generate_dataset_interval: 20 # number of outer_loop 75 | 76 | lr: 3.0e-7 77 | optimizer_type: adamw # [sgd, adam, adamw] 78 | scheduler_type: cosine 79 | warmup_ratio: 0.05 80 | weight_decay: 0.01 81 | max_grad_norm: 1.0 82 | 83 | val_interval: 2000 # number of train_step 84 | val_skip_step: 0 85 | log_interval: 100 # number of train_step 86 
| 87 | save_model_dir: ${base.save_dir}/generator 88 | save_valid_result_dir: ${base.save_dir}/valid_results 89 | fp16: False 90 | bf16: True 91 | 92 | distilled_data: 93 | dpc: 20 94 | n_dataset: 10 95 | save_dataset_path: ${base.save_dir}/dataset 96 | over_sample_ratio: 100.0 97 | 98 | evaluate: 99 | task_name: ${data.task_name} 100 | n_eval_per_dataset: 5 101 | fp16: False 102 | bf16: True 103 | save_result_dir: ${base.save_dir}/final_results 104 | 105 | # training config 106 | optimizer_type: adamw # ["sgd", "momentum", "adam", "adamw"] 107 | scheduler_type: cosine 108 | lr: 1.0e-4 109 | max_grad_norm: 1.0 110 | weight_decay: 0.01 111 | warmup_ratio: 0.5 112 | 113 | train_step: 200 114 | batch_size: 64 115 | 116 | hydra: 117 | run: 118 | dir: ${base.save_dir} 119 | sweep: 120 | dir: ${base.save_method_dir} 121 | subdir: ${base.sub_run_name} 122 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_14.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is it like to get a bachelor's degree in philosophy?","question2":"Why do people like math?","labels":0} 2 | {"question1":"What does 'quick power' mean?","question2":"What does a rapid power mean?","labels":0} 3 | {"question1":"What does a coder do in c programing?","question2":"What is a coder's basic programming knowledge?","labels":0} 4 | {"question1":"Is there a directory of redundant and unrelated UI design designs that most people don't know about on a building basis?","question2":"How do I feel losing someone else's emotional connection in an Instagram group?","labels":0} 5 | {"question1":"What is the corporate culture like at Johnson & Johnson? 
How is the culture different than other companies?","question2":"What is the corporate culture like at Forrester? How is the culture different than other companies?","labels":0} 6 | {"question1":"How should I connect my Chromebook to my internal WiFi?","question2":"How do I activate a Chromebook on external WiFi?","labels":0} 7 | {"question1":"What does red meat taste like?","question2":"How does red meat taste different from whites?","labels":0} 8 | {"question1":"What is the CAW? How much does it cost in hand?","question2":"How much does it cost to build a basic folding cross wheel?","labels":0} 9 | {"question1":"Is Scotland in coda cloriod or coda delhi?","question2":"Why is it so difficult for a person to get a tattoo on his shoulder? Is it by chance or was it more recently?","labels":0} 10 | {"question1":"My time machine is about a month behind. What are best practices?","question2":"How can I install gci application in a laptop with time machine?","labels":0} 11 | {"question1":"Which is the best mobile phone between Rs.60000 and 60000?","question2":"Which is the best phone under 60000 in India right now?","labels":1} 12 | {"question1":"What is the best way to teach yourself programming?","question2":"What is the best way to teach yourself programming?","labels":1} 13 | {"question1":"What is the scientific evidence of flat earth? What are some of the logical inconsistencies in it?","question2":"What is the scientific evidence of flat earth? 
What are some of the logical inconsistencies in it?","labels":1} 14 | {"question1":"How can I earn free online money?","question2":"How do I earn money with free online money?","labels":1} 15 | {"question1":"What is the best way to stop masturbation?","question2":"How do I stop masturbating?","labels":1} 16 | {"question1":"How do I get rid of my dog drooling all over his stomach?","question2":"How do I get rid of a dog drooling all over my stomach?","labels":1} 17 | {"question1":"How do you start a new life in a foreign country?","question2":"How do I start new life in a foreign country?","labels":1} 18 | {"question1":"How do I delete my Kik account from Facebook if I want to re-activate it in WhatsApp?","question2":"How do I delete my Kik account from Facebook if I want to reactivate it in WhatsApp?","labels":1} 19 | {"question1":"Why does the reset button in my laptop sometimes not turn on? What could be a connection problem?","question2":"Why does the reset button in my laptop sometimes not turn on? 
What could be a connection problem?","labels":1} 20 | {"question1":"What is your new year resolution for 2017?","question2":"What are your New Year resolutions in 2017?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_7.json: -------------------------------------------------------------------------------- 1 | {"question1":"Which is the best exercise to burn fat during a bad period?","question2":"What are some healthy workouts to burn fat during a bad period?","labels":0} 2 | {"question1":"What's the difference between frmat and mmat?","question2":"How do I calculate my GPA?","labels":0} 3 | {"question1":"Which is the best introduction of Shabak(a bedroom)?","question2":"What does it feel like to be a doctor as a doctor?","labels":0} 4 | {"question1":"What are some things new employees should know going into their first day at Tata Industries?","question2":"What are some things new employees should know going into their first day at ICICI?","labels":0} 5 | {"question1":"What is a corrupt citizen is an example of?","question2":"Who is a corrupt citizen?","labels":0} 6 | {"question1":"I have a $50,000 annual salary, benefits, and best friends of my family. 
How can I invest in my startup?","question2":"Is it good to start a business with personal wealth?","labels":0} 7 | {"question1":"What's the history of the UCMJ, basically?","question2":"Is it possible to gain 7 wps in average without studying?","labels":0} 8 | {"question1":"\"What is the difference between \"\"afrina\"\" and \"\"naccia\"\"?\"","question2":"What are all the ways to brush my feet?","labels":0} 9 | {"question1":"How do I get free mobile data sim and text messages?","question2":"How can I gain free data from a text message to my brother on Skype?","labels":0} 10 | {"question1":"Are Indians more intelligent than other people?","question2":"What is the most intelligent person alive today?","labels":0} 11 | {"question1":"What is the difference between physics, astronomy and mathematics? How can we understand the differences?","question2":"What is the difference between physics, astronomy and mathematics? How can we understand the differences?","labels":1} 12 | {"question1":"What does it mean to make people happy?","question2":"What does it mean to make people happy?","labels":1} 13 | {"question1":"How will the ban of old 500 and 1000 rupee notes help in curbing black money in India?","question2":"How will the ban of Rs 500 and Rs 1000 notes affect black money owners in India?","labels":1} 14 | {"question1":"How can I easily hack WhatsApp?","question2":"How can I hack WhatsApp chat?","labels":1} 15 | {"question1":"How do I get rid of belly fat?","question2":"How do I get rid of belly fat in one month?","labels":1} 16 | {"question1":"How do I change a Gmail password for my account when I don't remember my recovery information?","question2":"How do I reset my Gmail password when I don't remember my recovery information?","labels":1} 17 | {"question1":"How can I get a conversation started with someone?","question2":"How do I get good conversation started in person?","labels":1} 18 | {"question1":"How do I learn English in a short time?","question2":"How can 
I learn English with a short time?","labels":1} 19 | {"question1":"What is the hardest thing(s) about raising a child in a remote area and how does it differ from raising a child in a city or town?","question2":"What is the hardest thing(s) about raising a child in a remote area and how does it differ from raising a child in a city or town?","labels":1} 20 | {"question1":"Which is the best football team in the world?","question2":"Which is the best football team in the world?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_1.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is a character study?","question2":"Why do I have to study\/study for CBA?","labels":0} 2 | {"question1":"What are the engineering qualities required of a software developer who becomes a top programmer in a tech start-up business in South Korea?","question2":"What are the engineering qualities required of a software developer who becomes a top programmer in a tech start-up business in Thailand?","labels":0} 3 | {"question1":"How do I download a movie torrent from bms torrentz?","question2":"Where can I download movies from BMS?","labels":0} 4 | {"question1":"What's good about plusLength?","question2":"What is good about Testbed?","labels":0} 5 | {"question1":"What is the role of the programmer and the data scientist?","question2":"What is the role of the data scientist?","labels":0} 6 | {"question1":"I love working on sports for school a lot, but I'm not good at soccer in sport, What should I do?","question2":"Why would Jesus say the Way is the Only Way when he looked at Satan?","labels":0} 7 | {"question1":"What's the best 
documentation about node.js?","question2":"Which web development frameworks are best for modern web development?","labels":0} 8 | {"question1":"What type of secondary function does conciliation serve?","question2":"How do I build a multimedia protocol like Spanish or Russian on an Arduino if a computer isn't capable of its programming at all?","labels":0} 9 | {"question1":"What are some interesting places to visit on a 2-week trip in India?","question2":"What are some interesting places to visit in hyderabad?","labels":0} 10 | {"question1":"How long does it take you to learn the German language?","question2":"How long does it take to learn the English language?","labels":0} 11 | {"question1":"How do I learn web programming from scratch?","question2":"How do I learn web programming from scratch?","labels":1} 12 | {"question1":"Do we know if there is life after death? If yes, what is the scientific explanation of what happens after death?","question2":"Do we know whether there is life after death? 
If yes, what is the scientific explanation of what happens after death?","labels":1} 13 | {"question1":"How do I know when I'm in love?","question2":"How do I know if I am in love?","labels":1} 14 | {"question1":"How do I lose weight without hurting my body?","question2":"How can I lose weight without hurting my body?","labels":1} 15 | {"question1":"How do I stop masturbation everyday?","question2":"How do I stop masturbation everyday?","labels":1} 16 | {"question1":"Does anybody have proof of alien life and the laws of physics can we suggest how to give the answers?","question2":"Does anyone have proof of alien life and the laws of physics can we suggest how to give the answers?","labels":1} 17 | {"question1":"How do I earn money online without investment?","question2":"How can I make money online without any investment?","labels":1} 18 | {"question1":"What is it like to work in the United Nations?","question2":"What is it like to work in the United Nations?","labels":1} 19 | {"question1":"How can I get back my lost Gmail account?","question2":"How do I get back my Gmail account without an old password?","labels":1} 20 | {"question1":"How will the ban of Rs 500 and Rs 1000 notes affect Indian economy?","question2":"How will the 500 and 1000 rupee notes ban affect the Indian economy?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_9.json: -------------------------------------------------------------------------------- 1 | {"question1":"How do I get MY domain name registration number for my site?","question2":"How do I get domain names for free?","labels":0} 2 | {"question1":"Why do some people ask questions they could look up online?","question2":"What is the best 
way to ask a question on Quora?","labels":0} 3 | {"question1":"\"How do I remove \"\"@\"\" from UrlCsv?\"","question2":"Where can I get level 43 knight coins?","labels":0} 4 | {"question1":"How big is a house if the gazetted knight was moving the castle upon a castle door?","question2":"What are some lesser-known sights to see when visiting Camorra, Chile?","labels":0} 5 | {"question1":"What are the requirements to set up a soda dispenser in Naroda-Hindustan (16th floor) Bangalore?","question2":"What is the job structure of Naroda Nagar, Bangalore?","labels":0} 6 | {"question1":"What is the corporate culture like at Franklin Group? How is the culture different than other companies?","question2":"What is the corporate culture like at Advantage Group? How is the culture different than other companies?","labels":0} 7 | {"question1":"Is Pristine a non-psychedelic hallucinogen? What are some examples?","question2":"What is the creamy layer formed by the unspent H2?","labels":0} 8 | {"question1":"What happens if you brush your teeth continuously for 30 minutes?","question2":"Why do some people brush their teeth a lot? Does it actually hurt?","labels":0} 9 | {"question1":"I am turning 20 in January. Should I start training until then and what should I do?","question2":"Is it healthy to masturbate or abstain from masturbation for 20-30 days? I was told it was too late in 30 days and it only lasted a week.","labels":0} 10 | {"question1":"How long does dental cement stay in the Earth, and what are the health effects of cement usage?","question2":"How long does dry ice stay in the earth? 
And what are the health effects of dry ice usage?","labels":0} 11 | {"question1":"Why do some people think Earth is flat when it is not?","question2":"Why do some people think Earth is flat when we have never seen it?","labels":1} 12 | {"question1":"How do you start an online clothing store?","question2":"How do I start online clothing store?","labels":1} 13 | {"question1":"How do I gain healthy weight without doing exercise?","question2":"How can I gain weight without exercise?","labels":1} 14 | {"question1":"What are some mind blowing cars gadgets that most people don't know?","question2":"What are some mind blowing Car gadgets that most people don't know?","labels":1} 15 | {"question1":"What is your favorite ten dollar bill?","question2":"What are your favorite ten dollar bill?","labels":1} 16 | {"question1":"How can the 500 and 1000 rupee notes ban be effectively implemented in India?","question2":"How can the 500 and 1000 rupee notes ban be implemented in India?","labels":1} 17 | {"question1":"How do you know if you have a high blood pressure?","question2":"How do I know if I have a high blood pressure?","labels":1} 18 | {"question1":"How do I delete my Quora account if I don't have a password to retrieve it?","question2":"How do I delete my Quora account if I don't have a password to retrieve it?","labels":1} 19 | {"question1":"What is the best way to become smarter?","question2":"How can I become smarter?","labels":1} 20 | {"question1":"Why do people believe in God?","question2":"Why do some people believe in a God?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_3.json: -------------------------------------------------------------------------------- 1 | 
{"question1":"What is CGPA in electronics college?","question2":"What is the CGPA of civil engineering in a college?","labels":0} 2 | {"question1":"How do you find out whether you have a needle in your blood in the middle of your nose?","question2":"Is it normal to be a mom with a needle in your nose? If not, what is the meaning? How common is it?","labels":0} 3 | {"question1":"What are the effects of Titan's Zeno generators on sulphuric acid?","question2":"How can I enter Sri Lanka for a paid trip?","labels":0} 4 | {"question1":"Is there a way to bypass a porn reset on a Samsung Smart TV?","question2":"How do you reset your Smart TV?","labels":0} 5 | {"question1":"What is the name of the best American service center for sipping e-votas?","question2":"What is the name of the best American service center for drinking tea?","labels":0} 6 | {"question1":"What is the different between using a calculator, drawing and a computer, for both two ways of knowing speed and time?","question2":"What is the difference between intelement and intelement in java? How are they used in programming?","labels":0} 7 | {"question1":"How do you make a senior female friend in the community?","question2":"What are good LinkedIn internships in Delhi\/NCR?","labels":0} 8 | {"question1":"What does it feel like to have sex with a man from your own country?","question2":"What is the G\u00fcdischen Elise? What does it feel like to have sex with a man from your own country?","labels":0} 9 | {"question1":"What are the best sleeping pill creams to be received in NDA health symbol?","question2":"What does it feel like to become a religious fanatic?","labels":0} 10 | {"question1":"What does it mean when a dog steps on water? How can this be treated?","question2":"What does it mean when a dog walks on water? 
How can this be treated?","labels":0} 11 | {"question1":"How do I stop thinking about someone when they are with me?","question2":"How can I stop thinking about someone when I am with them?","labels":1} 12 | {"question1":"What are the easy ways to earn money online?","question2":"What are easy ways to make money online?","labels":1} 13 | {"question1":"How do I get rid of a painful scalp problem?","question2":"How do I get rid of my scalp pain?","labels":1} 14 | {"question1":"What are your views on Modi Government banning 500 and 1000 notes? In what way it will affect economy of India?","question2":"What are your views on banning 500 and 1000 rupees notes in India? What will be the effect on currency in India?","labels":1} 15 | {"question1":"What are some places one can visit in London without spending too much money?","question2":"What are some of the places one can visit in London without spending money?","labels":1} 16 | {"question1":"How do I increase the traffic to a blog?","question2":"How can I increase the traffic to my blog page?","labels":1} 17 | {"question1":"Where can I find a cheap, anonymous site to hire VPNs for business or medical services?","question2":"Where can I find cheap, anonymous or anonymous website to hire VPNs for business or medical services?","labels":1} 18 | {"question1":"What is it like to go through the OyoDoor test at Algorithms?","question2":"What it is like to go through the oyodoor test at Algorithms?","labels":1} 19 | {"question1":"Why do some people still believe the Earth is flat?","question2":"Why are there people who still believe that the earth is flat?","labels":1} 20 | {"question1":"What is the best way to improve your IQ?","question2":"How do I improve my IQ?","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_12.json: -------------------------------------------------------------------------------- 1 | {"question1":"How do I prepare for the CIS CPT exam via IMEI?","question2":"Which coaching class should I choose for the proposed University CAT exam paper (CAT Exam) after I got B.Tech in Physics (IMEI)? I want to know the syllabus, syllabus foundation, CPC, difficulty levels and other basic things.","labels":0} 2 | {"question1":"What are the keywords for Terminal.NET?","question2":"Why is java the fastest language in the world, behind chess?","labels":0} 3 | {"question1":"Where can I get the latex gloves for UNIQLO product sponsorship?","question2":"What is the real reason behind getting shoes with thermal?","labels":0} 4 | {"question1":"What is backup service in OS?","question2":"Why do I feel apathetic about everything? Why do I feel so sad when the other person is decent?","labels":0} 5 | {"question1":"What's it like being in the Technical Topics List?","question2":"Why do we have technical topics lists?","labels":0} 6 | {"question1":"What is the formula for nonstick stainless steel? How is this determined?","question2":"What is the formula for light tap water in stainless steel? 
How is it determined?","labels":0} 7 | {"question1":"How can I contact the minister of lord vishpatra?","question2":"Why would a person be forced to sit in food court?","labels":0} 8 | {"question1":"What is the difference between a suspended cord and a spiking cord?","question2":"What's the difference between a cordless phone and a cordless cable modem?","labels":0} 9 | {"question1":"How can I design and install the touch screen controller for a smartphone?","question2":"How can I use touch screen for a browser?","labels":0} 10 | {"question1":"How long does it take you to kill a bat?","question2":"How long does it take a snake to kill a bat?","labels":0} 11 | {"question1":"Is the flat Earth theory still valid or should it be abandoned?","question2":"How is flat earth theory debunked by science? Is it still valid?","labels":1} 12 | {"question1":"What is the best food for your body?","question2":"What is the best food for your body? Why?","labels":1} 13 | {"question1":"How do I reset my Gmail password when I don't remember my recovery information?","question2":"How can I reset my password for Gmail after I don't remember my recovery information?","labels":1} 14 | {"question1":"What is the likelihood of getting into an IIM while having a BTech CGPA of around 70?","question2":"What is the likelihood of getting into an IIM while having a BTech CGPA of around 70?","labels":1} 15 | {"question1":"How do I spend more time on the Internet?","question2":"How can I spend more time on internet?","labels":1} 16 | {"question1":"Why do people buy expensive things at the black\/white market while others simply buy cheap stuff at the high end?","question2":"Why do people buy expensive things at the high end while others simply buy cheap stuff at the low end?","labels":1} 17 | {"question1":"How can I get rid of acne and scars naturally?","question2":"How can I get rid of pimples naturally?","labels":1} 18 | {"question1":"How can I get more social networking 
followers?","question2":"How do I get more social network followers?","labels":1} 19 | {"question1":"How long does it take for hair follicle to grow in a straight line?","question2":"How long does it take for hair follicles to grow in straight lines?","labels":1} 20 | {"question1":"What will be the effects of banning 500 and 1000 Rs notes on Indian economy?","question2":"What will be the positive and negative effects of demonetizing 500 and 1000 rupees notes in Indian economy?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_15.json: -------------------------------------------------------------------------------- 1 | {"question1":"How do I get the Sanskrit text of my college level crush with whom I have fallen in love?","question2":"Why do so many Android apps, even without the latest commits, use root access even when they use a command prompt?","labels":0} 2 | {"question1":"How do I recover lost photos from an iPhone 5?","question2":"How can I restore photos and videos in my iPhone 5 but not on Android?","labels":0} 3 | {"question1":"Do both programs mirror at the same time?","question2":"How do I create desktop shortcuts in Google Chrome?","labels":0} 4 | {"question1":"How do I spend two meals a day as a PAN card holder in Kerala?","question2":"How can I spend 2 leisurely days a day in Chennai?","labels":0} 5 | {"question1":"What is the difference between a phuture and an abraus?","question2":"Do intelligent pets like pets exist in other places than on earth? Do you think it is a philosophical phenomenon? 
Why are there pets in India and Pakistan?","labels":0} 6 | {"question1":"What does a mission statement mean?","question2":"What does it mean to be a mission manager?","labels":0} 7 | {"question1":"What is the corporate culture like at Myers-Briggs? How is the culture different than other companies?","question2":"What is the corporate culture like at Target? How is the culture different than other companies?","labels":0} 8 | {"question1":"How can I revise my Chemistry.QT exam?","question2":"What was the biggest mistake of your life before you became rich?","labels":0} 9 | {"question1":"What is the best app for phone backup if I deleted photos and don't have to re-use them from the iPhone 5?","question2":"How do I update my mobile phone to iPhone 6 if the factory recovery text isn't on the linked websites?","labels":0} 10 | {"question1":"How much is a good value to pay for a basic baby monitor with a bedroom size 8?","question2":"How much money is too much of a amount to lose if a child gets a decent baby monitor?","labels":0} 11 | {"question1":"What are some interesting inventions for women that most men don't know?","question2":"What are some interesting inventions for women that most men don't know?","labels":1} 12 | {"question1":"How do I get traffic to my website?","question2":"How can I get traffic to my website?","labels":1} 13 | {"question1":"What are some of the interesting things in life?","question2":"What are some interesting things to be found in life?","labels":1} 14 | {"question1":"How can I learn to make my own WhatsAppvoice emoticons for my own personal?","question2":"How can I learn to make my own WhatsAppvoice emoticons for my own personal?","labels":1} 15 | {"question1":"How can I lose weight in a short time?","question2":"How do I lose weight in a short time?","labels":1} 16 | {"question1":"What are some easy ways to make money online?","question2":"What are the easy ways to make money online?","labels":1} 17 | {"question1":"What are the best 
ways to get rid of bad habits?","question2":"How do I get rid of bad habits?","labels":1} 18 | {"question1":"What will be the repercussions of abolishing Rs. 500 and Rs. 1000 Currency notes on real estate sector in India and How will it affect property prices?","question2":"What will be the consequences of abolishing 500 and 1000 rupee notes on real estate in India?","labels":1} 19 | {"question1":"What do you do when you have no friends?","question2":"What do you do when you have no friends?","labels":1} 20 | {"question1":"Why don't guns become illegal in the US and why?","question2":"Why don't guns become illegal in the US and why aren't they in place?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_6.json: -------------------------------------------------------------------------------- 1 | {"question1":"What is the corporate culture like at Ray-Ban? How is the culture different than other companies?","question2":"What is the corporate culture like at Myriad? How is the culture different than other companies?","labels":0} 2 | {"question1":"What is the difference between top and bottom manufacturing standards?","question2":"How many taps does the Aesthetics Cup put on every hour as a 15 minute nap?","labels":0} 3 | {"question1":"What is philosophy of science? 
What are some examples?","question2":"What is philosophy of science education?","labels":0} 4 | {"question1":"How do I change the orbit of my design-mate in Agog?","question2":"What are ways to affect someone's perception of themselves when they get evaluated in an IQ test?","labels":0} 5 | {"question1":"What are the best free How To apps?","question2":"How do I find free spyware?","labels":0} 6 | {"question1":"Why is Saltwater Taffy candy imported in Switzerland?","question2":"Where is Saltwater Taffy candy sold in New York City?","labels":0} 7 | {"question1":"What is the age limit of counselling for iit management course at IIMs?","question2":"I want to quit my job I was asked to fill the given job, due to poor performance. What is my age limit in JEE coaching?","labels":0} 8 | {"question1":"How do I prepare for the ASO exam in 3 months without any coaching?","question2":"How much training time do I need for the test of ASO in 3 months?","labels":0} 9 | {"question1":"What is the maximum front memory in a 2GB DDR4 LN2 standard DDR3 RAM for Windows? 
How do I fix it?","question2":"Is it possible to find an attached TV in a refrigerator full of both RAM and 1GB of RAM?","labels":0} 10 | {"question1":"What is the difference between a second monitor and a third monitor?","question2":"What is the significance of the difference between a second monitor and a third monitor?","labels":0} 11 | {"question1":"How do I lose weight without stopping?","question2":"How do I lose weight without quitting?","labels":1} 12 | {"question1":"How would America be as a nation if Donald Trump were elected president?","question2":"How would the nation be a if Donald Trump were elected president?","labels":1} 13 | {"question1":"How can I hack my partner phone through WhatsApp?","question2":"How can I hack WhatsApp by touching my phone?","labels":1} 14 | {"question1":"Why is Saltwater taffy candy imported in Japan?","question2":"Why is Saltwater taffy candy imported in Portugal?","labels":1} 15 | {"question1":"Which is the best book on machine learning?","question2":"What is the best book on machine learning?","labels":1} 16 | {"question1":"How do you know if you are in love or just attracted to someone?","question2":"How do you know if you are in love with someone?","labels":1} 17 | {"question1":"What is the best way to learn things in a day?","question2":"How do I learn things in a day?","labels":1} 18 | {"question1":"What would happen if your car crashed into the plane in the middle of the two hours or so that your phone battery is charged?","question2":"What would happen if your car crashed into the plane in the middle of the two hours or so that your phone battery is charged?","labels":1} 19 | {"question1":"Why do some people still believe that the earth is flat?","question2":"Why do people still believe the earth is flat?","labels":1} 20 | {"question1":"What will be the effect of banning Rs. 500, Rs. 
1000 notes on real estate sector in India?","question2":"What will be the effects of Demonetization of 500 and 1000 rupee notes on real estate sector in India?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_18.json: -------------------------------------------------------------------------------- 1 | {"question1":"Which is the best country in the world to immigrate to with an Indian passport?","question2":"How should one arrange the placement for a Top Customer Technical consultant?","labels":0} 2 | {"question1":"What type of government does Turkey have? How does it compare to the one in Uruguay?","question2":"What type of government does Turkey have? How does it compare to the one in Germany?","labels":0} 3 | {"question1":"How do I use the speedpad technology on an Android tablet's camera module?","question2":"Why is it important to maintain the speedpad in a tablet?","labels":0} 4 | {"question1":"What universities does Tom Hardy recruit new grads from? What majors are they looking for?","question2":"\"How do I tag a picture using + when URL is given to a video?\"","labels":0} 5 | {"question1":"What are the components of power electronics?","question2":"What is the process of finding out what a power electronics component does?","labels":0} 6 | {"question1":"What is it like being in a 19th Century family?","question2":"What is it like being in a 19th Century family in Germany?","labels":0} 7 | {"question1":"Why don\u2019t we use multicolored font?","question2":"My college asked me to marry a non-profit mother then changed my plan because I am a pregnant girl. 
Should I change my plans?","labels":0} 8 | {"question1":"What is the history behind the current annual permissions for oratory?","question2":"Why is it better to have AIPMT or AIPMT coaching for ECE students?","labels":0} 9 | {"question1":"Why do people say you should marry a true friend?","question2":"How do I tell my parents they should marry a true friend?","labels":0} 10 | {"question1":"What is the most incompetent person in Canada or America at the moment?","question2":"What does it take to take professional cadavering?","labels":0} 11 | {"question1":"How do I increase the traffic of my blog?","question2":"How can I increase the traffic to a blog?","labels":1} 12 | {"question1":"How do I get over a break up?","question2":"How do I get over a break up?","labels":1} 13 | {"question1":"How do I make extra money as a student student?","question2":"How do I make extra money as a student?","labels":1} 14 | {"question1":"What are the best programming topics for a software developer?","question2":"What are the best programming programming topic for a software developer?","labels":1} 15 | {"question1":"Why is only one child allowed into a hospital? Why not two or three instead?","question2":"Why is only one child allowed into a hospital? 
Why not two or three instead?","labels":1} 16 | {"question1":"What is the best way to improve my English?","question2":"How can I improve my spoken English?","labels":1} 17 | {"question1":"What is the working scenario of BSNL in India and how will it affect the large scale segmentation business in India?","question2":"What is the working scenario of BSNL in India and how will it affect the large scale segmentation business in India?","labels":1} 18 | {"question1":"What are the best and most scientifically tested swimming pool skates that can be worn with a blue cord?","question2":"What are the best and most scientifically tested swimming pool skates that can be worn with a blue cord?","labels":1} 19 | {"question1":"What is your favorite Disney movie of all time? Why is your favorite one so popular?","question2":"What is your favorite Disney movie? Why is it popular?","labels":1} 20 | {"question1":"Is it true that everyone is intelligent?","question2":"Is it true that everyone is smart but not everybody is intelligent?","labels":1} 21 | -------------------------------------------------------------------------------- /DiLM-synthetic-data/qqp/dilm.dc/dpc_10.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_0.json: -------------------------------------------------------------------------------- 1 | {"question1":"How is the culture of Yannis Baratheon (character) different from the culture of Arya Stark (character)?","question2":"\"Is it possible to get \"\"of course\"\" marks at Yannis Baratheon in Fallout 4?\"","labels":0} 2 | {"question1":"How do I get permanent residency in Australia from India?","question2":"Where can I get permanent residency on Australia?","labels":0} 3 | {"question1":"What is Cozy's argument against autonomous mode?","question2":"What do I use to locate this box hidden in my Box 
whose coordinates can be found at my choosing by following the image?","labels":0} 4 | {"question1":"Where does the css_clear css masking function look?","question2":"What should I do to reduce my relative muscle strength without exercises?","labels":0} 5 | {"question1":"What is it like to work in a corporate office?","question2":"What is it like to work at Hubspot.com?","labels":0} 6 | {"question1":"I want to connect the mouse in an HDFC dock to my HDMI port in laptops without using the HDMI port. How do I do it?","question2":"Can I use the laptop's HDMI port on a TV connected to HDMI cable without changing the software?","labels":0} 7 | {"question1":"How hard it is to give exams under CMOS MATLAB compared to the IT qualifications offered under IT CAT?","question2":"How hard is it to give exams under CMOS MATLAB compared to the IT qualifications offered under IT CAT?","labels":0} 8 | {"question1":"What would happen to logical reasoning if it were collapsed?","question2":"I am in high school. What are three things I have to think about before changing my major (good intentions, plot-wise)?","labels":0} 9 | {"question1":"How is a family changing in North Dakota?","question2":"How is a family changing in Arizona?","labels":0} 10 | {"question1":"How do I check IAS score for India?","question2":"How much can I score against a hundred to become a thousandaire if I score 100% in an educational paper for my masters. Can I claim that I can't score more than 100% in any of the papers?","labels":0} 11 | {"question1":"What is the one year plan of life that a person must follow when he is depressed?","question2":"What is the one year life plan of life that a person must follow when he is depressed?","labels":1} 12 | {"question1":"What is the most important business in the world and why?","question2":"What is the most important business business in the world? 
And why?","labels":1} 13 | {"question1":"How can I earn money online without spending time?","question2":"How do I make money online without spending time?","labels":1} 14 | {"question1":"How will the ban of old 500 and 1000 rupee notes affect the Indian economy?","question2":"How will the ban of 500 and 1000 rupees note affect economy of India?","labels":1} 15 | {"question1":"How do I get messages from Facebook messenger without using phone number?","question2":"How do I get messages from Facebook messenger without using any phone number?","labels":1} 16 | {"question1":"How do I start learning programming in a month?","question2":"How do I start learning programming in a month?","labels":1} 17 | {"question1":"How do I get rid of hair loss?","question2":"How do I get rid of hair loss?","labels":1} 18 | {"question1":"How do I get over a broken heart?","question2":"How do I get over a broken heart?","labels":1} 19 | {"question1":"How can I make good friends with people?","question2":"How do I make good friends with people?","labels":1} 20 | {"question1":"What is chemical equilibrium? What are some examples?","question2":"What is chemical equilibrium? 
What are some examples?","labels":1} 21 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import numpy as np 3 | import torch 4 | from omegaconf import DictConfig, ListConfig 5 | from torch import nn 6 | from torch.optim import SGD, Adam, AdamW, Optimizer 7 | from torch.optim.lr_scheduler import LRScheduler 8 | from transformers import BatchEncoding, SchedulerType, get_scheduler 9 | 10 | 11 | def average( 12 | inputs: list[int | float | list | dict | list], std: bool = False 13 | ) -> int | float | list | dict: 14 | if isinstance(inputs[0], (int, float)): 15 | if std: 16 | return (np.mean(inputs), np.std(inputs)) 17 | else: 18 | return np.mean(inputs) 19 | elif isinstance(inputs[0], list): 20 | return [average([*ls], std=std) for ls in zip(*inputs)] 21 | elif isinstance(inputs[0], dict): 22 | return {k: average([dc[k] for dc in inputs], std=std) for k in inputs[0].keys()} 23 | else: 24 | raise TypeError 25 | 26 | 27 | def log_params_from_omegaconf_dict(params): 28 | def _explore_recursive(parent_name, element): 29 | if isinstance(element, DictConfig): 30 | for k, v in element.items(): 31 | if isinstance(v, DictConfig) or isinstance(v, ListConfig): 32 | _explore_recursive(f"{parent_name}.{k}", v) 33 | else: 34 | mlflow.log_param(f"{parent_name}.{k}", v) 35 | elif isinstance(element, ListConfig): 36 | for i, v in enumerate(element): 37 | mlflow.log_param(f"{parent_name}.{i}", v) 38 | 39 | for param_name, element in params.items(): 40 | _explore_recursive(param_name, element) 41 | 42 | 43 | def batch_to_cuda(batch: dict[str, torch.Tensor] | BatchEncoding): 44 | return {k: v.cuda() for k, v in batch.items()} 45 | 46 | 47 | def endless_dataloader(data_loader, max_iteration=1000000): 48 | for _ in range(max_iteration): 49 | for batch in data_loader: 50 | yield batch 51 | 52 | assert False, "Reach max iteration" 53 | 54 | 
def configure_optimizer(
    model: nn.Module,
    lr: float,
    optimizer_type: str,
    scheduler_type: str | SchedulerType,
    weight_decay: float,
    warmup_ratio: float,
    num_train_steps: int,
) -> tuple[Optimizer, LRScheduler]:
    """Build the optimizer and learning-rate scheduler for ``model``.

    Args:
        model: Module whose parameters are optimized.
        lr: Learning rate handed to the optimizer.
        optimizer_type: One of ``"sgd"``, ``"momentum"`` (SGD with momentum
            0.9), ``"adam"`` or ``"adamw"``.
        scheduler_type: Scheduler name forwarded to ``get_scheduler``.
        weight_decay: Weight decay for the decayed parameter group. Only used
            when ``optimizer_type`` is ``"adamw"``.
        warmup_ratio: Fraction of ``num_train_steps`` passed on as warm-up.
        num_train_steps: Total number of training steps.

    Returns:
        The ``(optimizer, scheduler)`` pair.

    Raises:
        ValueError: If ``optimizer_type`` is not a supported name.
    """
    optimizer_class = {"sgd": SGD, "momentum": SGD, "adam": Adam, "adamw": AdamW}
    # Explicit raise instead of ``assert`` so validation survives ``python -O``.
    if optimizer_type not in optimizer_class:
        raise ValueError(
            f"Unknown optimizer_type {optimizer_type!r}; "
            f"expected one of {sorted(optimizer_class)}"
        )

    if optimizer_type == "adamw":
        # Parameters whose name contains one of these markers get no decay.
        no_decay = ("bias", "LayerNorm.weight")
        decay_params, no_decay_params = [], []
        # Single pass over the parameters, split into the two groups.
        for name, param in model.named_parameters():
            if any(marker in name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        grouped_params = [
            {"params": decay_params, "weight_decay": weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
    else:
        grouped_params = model.parameters()

    optimizer_kwargs = {"lr": lr}
    if optimizer_type == "momentum":
        optimizer_kwargs["momentum"] = 0.9
    optimizer = optimizer_class[optimizer_type](grouped_params, **optimizer_kwargs)

    scheduler = get_scheduler(
        name=scheduler_type,
        optimizer=optimizer,
        # NOTE(review): this product is a float whenever warmup_ratio is one;
        # left unrounded on purpose so the resulting schedule is unchanged.
        num_warmup_steps=num_train_steps * warmup_ratio,
        num_training_steps=num_train_steps,
    )
    return optimizer, scheduler
TCS?","labels":0} 2 | {"question1":"How do I lose the expression of mitosis?","question2":"How should I talk to my boyfriends about our mutual feelings without offending them?","labels":0} 3 | {"question1":"What is the purpose of watching movies in a room during lectures in a college lecture?","question2":"How do I explain systematic mathematica to students in an introductory class?","labels":0} 4 | {"question1":"Is last seen worth the cost of the shot? If yes, what else should you do?","question2":"What is the cheapest shot in rock climbing?","labels":0} 5 | {"question1":"How do I make WhatsApp faster when using Redmi Note 3 phones?","question2":"How can I have a separate phone and to restore a Redmi Note 3 when I don't have a Redmi Note 3 network?","labels":0} 6 | {"question1":"How do you give the perfect camera doll to a young boy who has an aneurysm?","question2":"\"What\u2019s the best way to send a \"\"single\"\" drone that matches my \"\"two\"\" flight path, if my airplane is going in the opposite direction?\"","labels":0} 7 | {"question1":"I want to work for Google, how many a year would it take for me to move to India? If I have a job in Google where it is a freelancing job, can I freelance it?","question2":"How many hours would it take for a Google employee to maintain 90% in one year as a freelancer?","labels":0} 8 | {"question1":"How does I search for a California license plate number?","question2":"How can I find a California license plate?","labels":0} 9 | {"question1":"Why don't the democracies contribute a small percentage of their income to social issues?","question2":"How do l engineer road construction?","labels":0} 10 | {"question1":"What does an electronic circuit do?","question2":"What is a circuit? 
What does it mean and how are they produced?","labels":0} 11 | {"question1":"Which are the best book for electronics engineering in india, should I get one from Accenture or NIT?","question2":"Which are the best book for Electronics Engineering in India, Should I get one from Accenture or NIT?","labels":1} 12 | {"question1":"What are some mind blowing phone inventions that most people don't know about?","question2":"What are the mind blowing mobile inventions that most people don't know about?","labels":1} 13 | {"question1":"Do honey bees lay eggs?","question2":"Does honey bees lay eggs? Why or why not?","labels":1} 14 | {"question1":"What are the best books you read every day that you regret reading?","question2":"Which one is the best book you ever read that you regret reading?","labels":1} 15 | {"question1":"How do I get freedom in life?","question2":"How do I get free freedom in life?","labels":1} 16 | {"question1":"Is it possible to hack Google if you don't have the access to a phone or a computer?","question2":"Is it possible to hack Google if you don't have the access to a phone or a computer?","labels":1} 17 | {"question1":"How can I improve my spoken English?","question2":"How do I improve English speaking skills?","labels":1} 18 | {"question1":"How do I get traffic on my site?","question2":"How can I get traffic for my website?","labels":1} 19 | {"question1":"What is the best mind-blowing smart phone under INR 15,000?","question2":"What is the best mind blowing smartphone under INR 15,000?","labels":1} 20 | {"question1":"How do I stop worrying about what others think?","question2":"How do I stop worrying about what others think?","labels":1} 21 | -------------------------------------------------------------------------------- 
/DiLM-synthetic-data/mnli/dilm.dc/dpc_5.gpt2.train_step_20000.inner_loop_10.model_step_20.classifier_grad_only.repset_200_10.gm_real_dpc_100_2.gm_syn_dpc_64.cluster_wise_syn.lr_3e-07.gen_interval_20.lm_lambda_0.0.top_p_0.95.test_top_p_0.95/dataset/dataset_4.json: -------------------------------------------------------------------------------- 1 | {"premise":"um-hum uh-huh i don't know i'm not a traditional i don't think i think it would be totally negative if we had if if i got um uh one child without any fathers in it it would be more of a negative","hypothesis":"If one child without a father in it isn't negative there isn't a negative for me.","labels":0} 2 | {"premise":"Begun by Sir James Peel Edgerton in 1896, it is perhaps one of the finest Buddhist chapels in England, with splendid classical narges and Buddhist scriptures, an ancient Mosque, and the location of a medieval Cafe de Paume in the courtyard.","hypothesis":"The location of a medieval Cafe de Paume is a monastery which is a Buddhist shrine in the courtyard.","labels":0} 3 | {"premise":"uh that that that would be difficult to to put together","hypothesis":"The task would be difficult to put together.","labels":0} 4 | {"premise":"Surely this is because we are not always in the best financial position to make it happen, says Gross.","hypothesis":"This seems to be because we are not always in the better position to make it happen.","labels":0} 5 | {"premise":"Long or short, it is unclear how mitigating circumstances may affect the cost, schedule, and program effectiveness of GPRA.","hypothesis":"It's unclear how other mitigating circumstances can affect the program's cost, schedule, and effectiveness.","labels":0} 6 | {"premise":"She turns to his hand.","hypothesis":"She passed his hand to her brother for help.","labels":1} 7 | {"premise":"yeah maybe they should try to wear uh more things like this uh signs uh like like traffic lights and things that you could just tuck your shirt so that it doesn't get 
on your clothes and sort of embarass yourself it just it's just a heavy setup in what i call a small set up and that's all okay just to bear","hypothesis":"I am happy that I live in a small set up with a sign that allows us to keep my shirts tucked into my little t-shirt.","labels":1} 8 | {"premise":"yeah there's many places that do do that you might as well","hypothesis":"You wouldn't do it if you didn't have the time to do it there.","labels":1} 9 | {"premise":"This raises the question of whether the postal service must rate these new service areas more aggressively than they currently do.","hypothesis":"The postal service would receive 5 percent less revenue if it rates more aggressively.","labels":1} 10 | {"premise":"A case in point is the Israel Murder Sketcher, on July 29, 1999.","hypothesis":"The murder sketcher can be made in Israel in January and February.","labels":1} 11 | {"premise":"Despite that risk, most commercial companies would still route their financing to the incumbent only if the incumbent were meeting a standard that provides governmentwide certainty on financial management.","hypothesis":"There is no risk that commercial companies route financing to the incumbent.","labels":2} 12 | {"premise":"we're in a real good situation but um i think that i think because of that time that the um that we have we've learned that uh uh that it's okay to have children it it it seems to me you don't get any better out of a babysitter if you're not the mother","hypothesis":"It is okay to keep children alone, just don't get enough opportunities out of them.","labels":2} 13 | {"premise":"Yet, he said, there is only one building that can truly function in all environments, albeit in a very small number of locations.","hypothesis":"None of the building can function in any environment.","labels":2} 14 | {"premise":"yeah it was nice talking to you","hypothesis":"I didn't like what they were saying.","labels":2} 15 | {"premise":"But I think that the big picture is 
the other way around.","hypothesis":"I am not holding any huge belief in that way.","labels":2} 16 | --------------------------------------------------------------------------------