├── smaberta ├── __init__.py └── smaberta.py ├── requirements.txt ├── examples ├── execute_finetuning.sh ├── test_finetuning.py ├── Tutorial.ipynb └── lm_finetuning.py ├── Dockerfile ├── LICENSE ├── setup.py ├── README.md ├── .gitignore └── data ├── tutorial_test.csv └── tutorial_train.csv /smaberta/__init__.py: -------------------------------------------------------------------------------- 1 | from smaberta.smaberta import TransformerModel 2 | 3 | 4 | __version__ = '0.0.1' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==2.6.0 2 | simpletransformers==0.22.1 3 | pandas 4 | tensorboardX 5 | torch 6 | torchvision 7 | tqdm -------------------------------------------------------------------------------- /examples/execute_finetuning.sh: -------------------------------------------------------------------------------- 1 | export TRAIN_FILE=./data/lm_train 2 | export TEST_FILE=./data/lm_test 3 | 4 | python3 lm_finetuning.py \ 5 | --output_dir=output \ 6 | --model_type=roberta-base \ 7 | --model_name_or_path=roberta-base \ 8 | --line_by_line \ 9 | --do_train \ 10 | --train_data_file=$TRAIN_FILE \ 11 | --do_eval \ 12 | --mlm \ 13 | --eval_data_file=$TEST_FILE 14 | 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.7-slim-stretch 3 | 4 | # Set the working directory to /dockerTutorial 5 | WORKDIR /dockerTutorial 6 | 7 | # Copy the current directory contents into the container at /dockerTutorial 8 | COPY . 
/dockerTutorial 9 | 10 | #Install any needed packages specified in requirements.txt 11 | RUN pip3 install -r requirements.txt 12 | 13 | # Define environment variable 14 | ENV COUNT 0 15 | 16 | # Run test.py when the container launches 17 | CMD ["python", "train_and_classify_clinton_tweets.py"] 18 | -------------------------------------------------------------------------------- /examples/test_finetuning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | import torch 5 | import pickle 6 | random.seed(1) 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | torch.cuda.manual_seed(1) 10 | 11 | from smaberta import TransformerModel 12 | 13 | model = TransformerModel('roberta', 'roberta-base', finetune=True, args={"num_train_epochs":1, 'fp16':False, "output_dir":"test-finetune", "reprocess_input":True}) 14 | 15 | #model.lm_evaluate('./data/lm_eval') 16 | print("------------------------------------------------") 17 | 18 | model.finetune("./data/lm_train", "./data/lm_eval") 19 | 20 | print("------------------------------------------------") 21 | model.lm_evaluate('./data/lm_eval') 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Zhanna Terechshenko and Vishakh Padmakumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 
in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup 4 | 5 | if sys.version_info[0] != 3: 6 | raise RuntimeError('Unsupported python version "{0}"'.format( 7 | sys.version_info[0])) 8 | 9 | def _get_file_content(file_name): 10 | with open(file_name, 'r') as file_handler: 11 | return str(file_handler.read()) 12 | def get_long_description(): 13 | return _get_file_content('README.md') 14 | 15 | #on_rtd = os.environ.get('READTHEDOCS') == 'True' 16 | 17 | #if not on_rtd: 18 | # INSTALL_REQUIRES = [ 19 | # 'pandas', 20 | # 'requests', 21 | # ] 22 | #else: 23 | # INSTALL_REQUIRES = [ 24 | # 'requests', 25 | # ] 26 | 27 | INSTALL_REQUIRES = [ 28 | 'transformers==2.6.0', 29 | 'simpletransformers==0.22.1', 30 | 'pandas', 31 | 'torch', 32 | 'torchvision', 33 | 'tensorboardX', 34 | 'tqdm' 35 | ] 36 | 37 | setup( 38 | name="smaberta", 39 | version='0.0.2', 40 | author="Vishakh Padmakumar, Zhanna Terechshenko", 41 | description="a wrapper for the huggingface transformer libraries", 42 | long_description=get_long_description(), 43 | long_description_content_type="text/markdown", 44 | keywords='nlp transformers classification text-classification fine-tuning', 45 | url="https://github.com/SMAPPNYU/SMaBERTa.git", 46 | packages=['smaberta'], 47 | 
py_modules=['smaberta'], 48 | license="MIT", 49 | classifiers=( 50 | "Programming Language :: Python :: 3", 51 | "License :: OSI Approved :: MIT License", 52 | "Operating System :: OS Independent", 53 | ), 54 | install_requires=INSTALL_REQUIRES 55 | ) 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SMaBERTa 2 | 3 | PyPI version 4 | DOI 5 | 6 | This repository contains the code for SMaBERTa, a wrapper for the huggingface transformer libraries. 7 | It was developed by Zhanna Terechshenko and Vishakh Padmakumar through research at the Center for 8 | Social Media and Politics at NYU. 9 | 10 | ## Setup 11 | 12 | To install using pip, run 13 | ``` 14 | pip install smaberta 15 | ``` 16 | 17 | To install from the source, first download the repository by running 18 | 19 | ``` 20 | git clone https://github.com/SMAPPNYU/SMaBERTa.git 21 | ``` 22 | 23 | Then, install the dependencies for this repo and setup by running 24 | ``` 25 | cd SMaBERTa 26 | pip install -r requirements.txt 27 | python setup.py install 28 | ``` 29 | 30 | ## Using the package 31 | 32 | Basic use: 33 | 34 | ``` 35 | from smaberta import TransformerModel 36 | 37 | epochs = 3 38 | lr = 4e-6 39 | 40 | training_sample = ['Today is a great day', 'Today is a terrible day'] 41 | training_labels = [1, 0] 42 | 43 | model = TransformerModel('roberta', 'roberta-base', num_labels=25, reprocess_input_data=True, 44 | num_train_epochs=epochs, learning_rate=lr, output_dir='./saved_model/', 45 | overwrite_output_dir=True, fp16=False) 46 | 47 | model.train(training_sample, training_labels) 48 | 49 | ``` 50 | 51 | For further details, see `Tutorial.ipynb` in the [examples](https://github.com/SMAPPNYU/SMaBERTa/tree/master/examples) directory. 
52 | 53 | # Acknowledgements 54 | 55 | Code for this project was adapted from version 0.6 of https://github.com/ThilinaRajapakse/simpletransformers 56 | 57 | Vishakh Padmakumar and Zhanna Terechshenko contributed to the software writing, implementation, and testing. 58 | 59 | Megan Brown contributed to documentation and publication. 60 | 61 | If you use this software in your research please cite it as: 62 | 63 | ``` 64 | @misc{padmakumar_terechshenko, 65 | author = {Vishakh Padmakumar and Zhanna Terechshenko}, 66 | title = {SMAPPNYU/SMaBERTa}, 67 | month = dec, 68 | year = 2020, 69 | doi = {10.5281/zenodo.5090728}, 70 | url = {https://doi.org/10.5281/zenodo.5090728} 71 | } 72 | ``` 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # package specific 2 | .ipynb_checkpoints/* 3 | runs/* 4 | data/* 5 | outputs/* 6 | __pycache__/* 7 | cache_dir/* 8 | *.pkl 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | -------------------------------------------------------------------------------- /examples/Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "import random\n", 15 | "import torch\n", 16 | "import pickle\n", 17 | "random.seed(1)\n", 18 | "np.random.seed(1)\n", 19 | "torch.manual_seed(1)\n", 20 | "torch.cuda.manual_seed(1)\n", 21 | "\n", 22 | "import sys\n", 23 | "sys.path.append('../smaberta')\n", 24 | "from smaberta import TransformerModel" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Loading Data\n", 32 | "\n", 33 | "Load train data stored in CSV format using Pandas. Pretty much any format is acceptable, just some form of text and accompanying labels. Modify according to your task. For the purpose of this tutorial, we are using a sample from New York Times Front Page Dataset (Boydstun, 2014)." 
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "train_df = pd.read_csv(\"../data/tutorial_train.csv\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Loading test data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "test_df = pd.read_csv(\"../data/tutorial_test.csv\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Just to get an idea of what this dataset looks like" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Paired data consisting of freeform text accompanied by their supervised labels towards the particular task. Here the text is headlines of news stories and the label categorizes them into the subjects. We have a total of 25 possible labels here, each represented by a separate number." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "print(len(train_df.label.values))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
textlabel
0AIDS in prison, treatment costs overwhelm pris...12
1olympics security19
2police brutality12
3Iranian nuclear program; deal with European Un...16
4terror alert raised16
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " text label\n", 146 | "0 AIDS in prison, treatment costs overwhelm pris... 12\n", 147 | "1 olympics security 19\n", 148 | "2 police brutality 12\n", 149 | "3 Iranian nuclear program; deal with European Un... 16\n", 150 | "4 terror alert raised 16" 151 | ] 152 | }, 153 | "execution_count": 4, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "train_df.head()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 5, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "['AIDS in prison, treatment costs overwhelm prison budgets', 'olympics security', 'police brutality', 'Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.', 'terror alert raised', 'Job report shows unexpected vigor for US economy', \"Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu\", 'Senators debate Iraq War policy', 'Myrtle Beach', 'china visit'] [12, 19, 12, 16, 16, 5, 19, 16, 14, 19]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(train_df.text[:10].tolist(), train_df.label[:10].tolist())" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### Learning Parameters\n", 184 | "These are training arguments that you would use to train the classifier. For the purposes of the tutorial we set some sample values. 
Presumably in a different case you would perform a grid search or random search CV" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Learning Rate 0.001\n", 197 | "Train Epochs 2\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "lr = 1e-3\n", 203 | "epochs = 2\n", 204 | "print(\"Learning Rate \", lr)\n", 205 | "print(\"Train Epochs \", epochs)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### Initialise model\n", 213 | "1. The first argument indicates which architecture to use, here Roberta (alternatives - Bert, XLNet... as provided by Huggingface). It is also used to select the right tokenizer and classification head \n", 214 | "2. The second argument provides the initialisation point as provided by Huggingface [here](https://huggingface.co/transformers/pretrained_models.html). Examples - roberta-base, roberta-large, gpt2-large...\n", 215 | "3. The tokenizer accepts the freeform text input and transforms it into a sequence of tokens suitable for input to the transformer. The transformer architecture processes these before passing them on to the classifier head which transforms this representation into the label space. \n", 216 | "4. Number of labels is specified below to initialise the classification head appropriately. As per the classification task you would change this.\n", 217 | "5. You can see the training args set above were used in the model initialisation below. \n", 218 | "6. Pass in training arguments as initialised, especially note the output directory where the model is to be saved and also training logs will be output. The overwrite output directory parameter is a safeguard in case you're rerunning the experiment.
Similarly if you're rerunning the same experiment with different parameters, you might not want to reprocess the input every time - the first time it's done, it is cached so you might be able to just reuse the same. fp16 refers to floating point precision, which you set according to the GPUs available to you; it shouldn't affect the classification result, just the performance." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "model = TransformerModel('roberta', 'roberta-base', num_labels=25, reprocess_input_data=True, num_train_epochs=epochs, learning_rate=lr, \n", 228 | " output_dir='./saved_model/', overwrite_output_dir=True, fp16=False)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### Run training" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 8, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Starting Epoch: 0\n", 248 | "Starting Epoch: 1\n", 249 | "Training of roberta model complete. Saved to ./saved_model/.\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "model.train(train_df['text'], train_df['label'])" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "To see more in depth logs, set flag show_running_loss=True on the function call of train_model" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "### Inference from model\n", 269 | "\n", 270 | "At training time the model is saved to the output directory that was passed in at initialization. We can either continue retaining the same model object, or load from the directory it was previously saved at. In this example we show the loading to illustrate how you would do the same.
This is helpful when you want to train and save a classifier and use the same sporadically. For example in an online setting where you have some labelled training data you would train and save a model, and then load and use it to classify tweets as your collection pipeline progresses." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 9, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "model = TransformerModel('roberta', 'roberta-base', num_labels=25, location=\"./saved_model/\")" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "### Evaluate on test set\n", 287 | "\n", 288 | "At inference time we have access to the model outputs which we can use to make predictions as shown below. Similarly you could perform any empirical analysis on the output before/after saving the same. Typically you would save the results for replication purposes. You can use the model outputs as you would on a normal PyTorch model; here we just show label predictions and accuracy. In this tutorial we only used a fraction of the available data, hence why the actual accuracy is not great. For the full results of the experiments we conducted, check out our paper.
289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 10, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "{'mcc': 0.0}\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "result, model_outputs, wrong_predictions = model.evaluate(test_df['text'], test_df['label'])\n", 306 | "preds = np.argmax(model_outputs, axis = 1)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 12, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "(998, 998)" 318 | ] 319 | }, 320 | "execution_count": 12, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "len(test_df), len(preds)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 14, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "Accuracy: 0.23947895791583165\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "correct = 0\n", 344 | "labels = test_df['label'].tolist()\n", 345 | "for i in range(len(labels)):\n", 346 | " if preds[i] == labels[i]:\n", 347 | " correct+=1\n", 348 | "\n", 349 | "accuracy = correct/len(labels)\n", 350 | "print(\"Accuracy: \", accuracy)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 15, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "pickle.dump(model_outputs, open(\"../model_outputs.pkl\", \"wb\"))" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Run inference \n", 367 | "\n", 368 | "This is the use case when you only have a new set of documents and no labels. For example if we just want to make predictions on a set of new text documents without loading a pandas dataframe, i.e. if you just have a list of texts, it can be predicted as shown below.
Note that here you have the predictions and model outputs." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 17, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "texts = test_df['text'].tolist()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 18, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "preds, model_outputs = model.predict(texts)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 19, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Accuracy: 0.23947895791583165\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "correct = 0\n", 404 | "for i in range(len(labels)):\n", 405 | " if preds[i] == labels[i]:\n", 406 | " correct+=1\n", 407 | "\n", 408 | "accuracy = correct/len(labels)\n", 409 | "print(\"Accuracy: \", accuracy)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "### References" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "Boydstun, Amber E. (2014). New York Times Front Page Dataset. www.comparativeagendas.net. 
Accessed April 26, 2019.\n", 424 | "\n", 425 | "\n", 426 | "\n" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [] 435 | } 436 | ], 437 | "metadata": { 438 | "kernelspec": { 439 | "display_name": "Python 3", 440 | "language": "python", 441 | "name": "python3" 442 | }, 443 | "language_info": { 444 | "codemirror_mode": { 445 | "name": "ipython", 446 | "version": 3 447 | }, 448 | "file_extension": ".py", 449 | "mimetype": "text/x-python", 450 | "name": "python", 451 | "nbconvert_exporter": "python", 452 | "pygments_lexer": "ipython3", 453 | "version": "3.6.9" 454 | } 455 | }, 456 | "nbformat": 4, 457 | "nbformat_minor": 4 458 | } 459 | -------------------------------------------------------------------------------- /examples/lm_finetuning.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 
20 | """ 21 | 22 | 23 | import argparse 24 | import glob 25 | import logging 26 | import os 27 | import pickle 28 | import random 29 | import re 30 | import shutil 31 | from typing import Dict, List, Tuple 32 | 33 | import numpy as np 34 | import torch 35 | from torch.nn.utils.rnn import pad_sequence 36 | from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler 37 | from torch.utils.data.distributed import DistributedSampler 38 | from tqdm import tqdm, trange 39 | 40 | from transformers import ( 41 | MODEL_WITH_LM_HEAD_MAPPING, 42 | WEIGHTS_NAME, 43 | AdamW, 44 | AutoConfig, 45 | AutoModelWithLMHead, 46 | AutoTokenizer, 47 | PreTrainedModel, 48 | PreTrainedTokenizer, 49 | get_linear_schedule_with_warmup, 50 | ) 51 | 52 | 53 | try: 54 | from torch.utils.tensorboard import SummaryWriter 55 | except ImportError: 56 | from tensorboardX import SummaryWriter 57 | 58 | 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 63 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 64 | 65 | 66 | class TextDataset(Dataset): 67 | def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512): 68 | assert os.path.isfile(file_path) 69 | 70 | block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence) 71 | 72 | directory, filename = os.path.split(file_path) 73 | cached_features_file = os.path.join( 74 | directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename 75 | ) 76 | 77 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 78 | logger.info("Loading features from cached file %s", cached_features_file) 79 | with open(cached_features_file, "rb") as handle: 80 | self.examples = pickle.load(handle) 81 | else: 82 | logger.info("Creating features from dataset file at %s", directory) 83 | 84 | self.examples = [] 85 | with open(file_path, encoding="utf-8") as f: 86 | text = f.read() 87 | 88 
| tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) 89 | 90 | for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size 91 | self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])) 92 | # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) 93 | # If your dataset is small, first you should loook for a bigger one :-) and second you 94 | # can change this behavior by adding (model specific) padding. 95 | 96 | logger.info("Saving features into cached file %s", cached_features_file) 97 | with open(cached_features_file, "wb") as handle: 98 | pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) 99 | 100 | def __len__(self): 101 | return len(self.examples) 102 | 103 | def __getitem__(self, item): 104 | return torch.tensor(self.examples[item], dtype=torch.long) 105 | 106 | 107 | class LineByLineTextDataset(Dataset): 108 | def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512): 109 | assert os.path.isfile(file_path) 110 | # Here, we do not cache the features, operating under the assumption 111 | # that we will soon use fast multithreaded tokenizers from the 112 | # `tokenizers` repo everywhere =) 113 | logger.info("Creating features from dataset file at %s", file_path) 114 | 115 | with open(file_path, encoding="utf-8") as f: 116 | lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] 117 | 118 | self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"] 119 | 120 | def __len__(self): 121 | return len(self.examples) 122 | 123 | def __getitem__(self, i): 124 | return torch.tensor(self.examples[i], dtype=torch.long) 125 | 126 | 127 | def load_and_cache_examples(args, tokenizer, evaluate=False): 128 | file_path = args.eval_data_file if evaluate else args.train_data_file 129 | if 
args.line_by_line: 130 | return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size) 131 | else: 132 | return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size) 133 | 134 | 135 | def set_seed(args): 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | torch.manual_seed(args.seed) 139 | if args.n_gpu > 0: 140 | torch.cuda.manual_seed_all(args.seed) 141 | 142 | 143 | def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]: 144 | ordering_and_checkpoint_path = [] 145 | 146 | glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))) 147 | 148 | for path in glob_checkpoints: 149 | if use_mtime: 150 | ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) 151 | else: 152 | regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path) 153 | if regex_match and regex_match.groups(): 154 | ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) 155 | 156 | checkpoints_sorted = sorted(ordering_and_checkpoint_path) 157 | checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] 158 | return checkpoints_sorted 159 | 160 | 161 | def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None: 162 | if not args.save_total_limit: 163 | return 164 | if args.save_total_limit <= 0: 165 | return 166 | 167 | # Check if we should delete older checkpoint(s) 168 | checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime) 169 | if len(checkpoints_sorted) <= args.save_total_limit: 170 | return 171 | 172 | number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit) 173 | checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] 174 | for checkpoint in checkpoints_to_be_deleted: 175 | logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) 176 | 
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build masked-LM inputs and labels: 80% MASK, 10% random token, 10% left unchanged.

    ``inputs`` is modified in place and also returned. ``labels`` is a copy of the
    original ids with every position that was not selected set to -100.

    Raises:
        ValueError: if the tokenizer has no mask token.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    shape = labels.shape

    # Per-position selection probability: args.mlm_probability everywhere,
    # zeroed on special tokens and (when the tokenizer has one) on padding.
    select_prob = torch.full(shape, args.mlm_probability)
    special_rows = []
    for row in labels.tolist():
        special_rows.append(tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True))
    select_prob.masked_fill_(torch.tensor(special_rows, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        select_prob.masked_fill_(labels.eq(tokenizer.pad_token_id), value=0.0)

    selected = torch.bernoulli(select_prob).bool()
    labels[~selected] = -100  # loss is only computed on selected positions

    # 80% of the selected positions become the mask token.
    to_mask = torch.bernoulli(torch.full(shape, 0.8)).bool() & selected
    inputs[to_mask] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remaining selected positions (10% overall) get a random token id.
    to_randomize = torch.bernoulli(torch.full(shape, 0.5)).bool() & selected & ~to_mask
    random_ids = torch.randint(len(tokenizer), shape, dtype=torch.long)
    inputs[to_randomize] = random_ids[to_randomize]

    # Whatever is left of the selection keeps its original token.
    return inputs, labels
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """Run the language-model training loop.

    Builds the dataloader, AdamW optimizer and linear warmup/decay schedule,
    optionally restores optimizer/scheduler state and the step counter from a
    checkpoint directory named ``...-<step>``, then trains with gradient
    accumulation, periodic TensorBoard logging and periodic checkpointing.

    Args:
        args: run configuration; ``train_batch_size`` (and ``num_train_epochs``
            when ``max_steps > 0``) are written back onto it.
        train_dataset: dataset whose items are 1-D tensors of token ids.
        model: model to train; wrapped for multi-GPU / distributed runs.
        tokenizer: tokenizer used for padding, masking and checkpoint saving.

    Returns:
        ``(global_step, average_loss)`` where the average is the accumulated
        loss divided by ``global_step`` (by 1 when no optimizer step ran).

    Raises:
        ImportError: if ``args.fp16`` is set and apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad to the longest example in the batch, with the pad id when there is one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    # Optimizer updates per pass over the data. This is 0 when the loader yields
    # fewer batches than gradient_accumulation_steps, so it must never be used
    # as a divisor without the max(1, ...) guard below.
    updates_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
    safe_updates_per_epoch = max(1, updates_per_epoch)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // safe_updates_per_epoch + 1
    else:
        t_total = updates_per_epoch * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    batches_to_skip = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // safe_updates_per_epoch
            steps_trained_in_current_epoch = global_step % safe_updates_per_epoch
            # The counter above is in optimizer updates, but the loop below skips
            # one *batch* at a time: each update consumed
            # gradient_accumulation_steps batches.
            batches_to_skip = steps_trained_in_current_epoch * args.gradient_accumulation_steps

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<int>": not a checkpoint directory.
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained batches if resuming training
            if batches_to_skip > 0:
                batches_to_skip -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            # Stop as soon as max_steps updates are done. The schedule was built
            # for exactly max_steps, so one more update would only run at lr 0.
            if args.max_steps > 0 and global_step >= args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step >= args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # global_step stays 0 when no optimizer update happened; avoid dividing by it.
    return global_step, tr_loss / max(global_step, 1)
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    """Compute perplexity on the evaluation file and write it to ``eval_results.txt``.

    The mean of the per-batch losses is exponentiated to obtain the perplexity.
    When ``args.mlm`` is set the batches are masked with ``mask_tokens`` first,
    which samples masked positions randomly.

    Args:
        args: run configuration; ``eval_batch_size`` is written back onto it.
        model: model to evaluate; wrapped in ``DataParallel`` when
            ``args.n_gpu > 1`` and it is not already wrapped.
        tokenizer: tokenizer used to build, pad and mask the dataset.
        prefix: sub-directory of ``args.output_dir`` for the results file.

    Returns:
        ``{"perplexity": <0-dim tensor>}``.

    Raises:
        ValueError: if the evaluation dataloader yields no batch.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Pad to the longest example in the batch, with the pad id when there is one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate; train() may hand over a model that is already wrapped
    # (evaluate_during_training), and wrapping it twice must be avoided.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Previously surfaced as an unexplained ZeroDivisionError.
        raise ValueError("Evaluation dataset produced no batches; cannot compute perplexity.")

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    # The results file lives under <output_dir>/<prefix>; make sure that exists.
    results_dir = os.path.join(eval_output_dir, prefix)
    os.makedirs(results_dir, exist_ok=True)
    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def main():
    """Command-line entry point for language-model fine-tuning and evaluation.

    Parses the arguments, validates flag combinations, sets up the device,
    logging and seed, loads config/tokenizer/model, optionally trains and saves
    to ``--output_dir``, and optionally evaluates one or all checkpoints.

    Returns:
        Dict mapping ``"<metric>_<global_step>"`` to the value returned by
        ``evaluate`` (empty when ``--do_eval`` is not given or on non-main ranks).

    Raises:
        ValueError: on an invalid flag combination, a missing checkpoint for
            ``--should_continue``, a non-empty output directory without
            ``--overwrite_output_dir``, or when neither a config nor a
            tokenizer source is given.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    parser.add_argument(
        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
    )

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    # Masked-LM architectures have no causal LM head, so --mlm is mandatory for them.
    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    # --should_continue replaces model_name_or_path with the last checkpoint
    # returned by _sorted_checkpoints for --output_dir.
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    # Refuse to train into a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging: INFO on the main process, WARN on the other ranks
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name"
        )

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    # A non-positive block size means "use the tokenizer maximum"; otherwise clamp to it.
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    if args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation (main process only)
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory under output_dir that contains a weights file.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only used to disambiguate keys when several checkpoints are evaluated.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
class LineByLineTextDataset(Dataset):
    """
    Dataset format for finetuning. Every non-blank line of the file is encoded
    as one fine-tuning example.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as source:
            candidate_lines = source.read().splitlines()

        def has_content(text):
            # Reject the empty string and whitespace-only lines.
            return len(text) > 0 and not text.isspace()

        sentences = list(filter(has_content, candidate_lines))
        batch = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=block_size)
        self.examples = batch["input_ids"]

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.examples)

    def __getitem__(self, i):
        """Token ids of sentence ``i`` as a 1-D ``torch.long`` tensor."""
        ids = self.examples[i]
        return torch.tensor(ids, dtype=torch.long)
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args, mlm_probability=0.15) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling.

    Positions are selected with probability ``mlm_probability`` (never on
    special or padding tokens). Of the selected positions 80% become the mask
    token, 10% a random token id and 10% stay unchanged. ``inputs`` is edited
    in place and returned together with ``labels``, which holds -100 at every
    position that was not selected. ``args`` is not read by this function.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    dims = labels.shape

    # Selection probabilities, with special tokens and padding excluded.
    probabilities = torch.full(dims, mlm_probability)
    special_flags = torch.tensor(
        [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True) for ids in labels.tolist()],
        dtype=torch.bool,
    )
    probabilities.masked_fill_(special_flags, value=0.0)
    if tokenizer._pad_token is not None:
        is_padding = labels.eq(tokenizer.pad_token_id)
        probabilities.masked_fill_(is_padding, value=0.0)

    chosen = torch.bernoulli(probabilities).bool()
    labels[~chosen] = -100  # We only compute loss on masked tokens

    # 80% of the chosen positions: replace with the mask token.
    mask_token_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    becomes_mask = torch.bernoulli(torch.full(dims, 0.8)).bool() & chosen
    inputs[becomes_mask] = mask_token_id

    # 10% of the chosen positions: replace with a random token id.
    becomes_random = torch.bernoulli(torch.full(dims, 0.5)).bool() & chosen & ~becomes_mask
    sampled_ids = torch.randint(len(tokenizer), dims, dtype=torch.long)
    inputs[becomes_random] = sampled_ids[becomes_random]

    # The remaining 10% keep their original token.
    return inputs, labels
109 | finetune: Set to true or false based on if you want to initialise the model for fine tuning or classification 110 | num_labels (optional): The number of labels or classes in the dataset. 111 | location: To load a saved model from a particular location on your computer and use that as the base as opposed to the standard release from HuggingFace 112 | use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. 113 | **kwargs (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. 114 | """ 115 | 116 | MODEL_CLASSES = { 117 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 118 | 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), 119 | 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 120 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 121 | 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), 122 | } 123 | 124 | config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] 125 | if location=="": 126 | self.tokenizer = tokenizer_class.from_pretrained(model_name) 127 | if finetune: 128 | self.model=AutoModelWithLMHead.from_pretrained(model_name) 129 | else: 130 | self.model = model_class.from_pretrained(model_name, num_labels=num_labels) 131 | else: 132 | self.tokenizer = tokenizer_class.from_pretrained(location) 133 | if finetune: 134 | self.model=AutoModelWithLMHead.from_pretrained(location) 135 | else: 136 | self.model = model_class.from_pretrained(location, num_labels=num_labels) 137 | 138 | if use_cuda: 139 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 140 | else: 141 | self.device = "cpu" 142 | 143 | self.results = {} 144 | 145 | self.args = { 146 | 'output_dir': 'outputs/', 147 | 'cache_dir': 'cache_dir', 148 | 'fp16': True, 149 | 
'fp16_opt_level': 'O1', 150 | 'max_seq_length': 128, 151 | 'train_batch_size': 25, 152 | 'finetune_batch_size': 4, 153 | 'gradient_accumulation_steps': 1, 154 | 'eval_batch_size': 50, 155 | 'finetune_eval_batch_size': 4, 156 | 'num_train_epochs': 1, 157 | 'num_finetune_epochs': 1, 158 | 'weight_decay': 0, 159 | 'learning_rate': 4e-5, 160 | 'finetune_learning_rate': 5e-5, 161 | 'adam_epsilon': 1e-8, 162 | 'warmup_ratio': 0.06, 163 | 'warmup_steps': 0, 164 | 'max_grad_norm': 1.0, 165 | 'mlm': True, 166 | 'logging_steps': 50, 167 | 'finetune_logging_steps': 100, 168 | 'save_steps': 2000, 169 | 'finetune_save_steps': 500, 170 | 'overwrite_output_dir': False, 171 | 'reprocess_input_data': False, 172 | 'process_count': cpu_count() - 2 if cpu_count() > 2 else 1, 173 | 'device': self.device, 174 | 'model_name_or_path': False, 175 | } 176 | 177 | 178 | self.args.update(kwargs) 179 | 180 | if use_cuda: 181 | self.args['n_gpu'] : torch.cuda.device_count() 182 | 183 | self.args['model_name'] = model_name 184 | self.args['model_type'] = model_type 185 | 186 | def train(self, training_samples, training_labels, output_dir=None, show_running_loss=False, **kwargs): 187 | """ 188 | Trains the model using 'train_df' 189 | Args: 190 | training_samples: Iterable list or pandas series of text samples for training 191 | training_labels: Iterable list of the output labels corresponding to the text samples in `training_samples` 192 | output_dir: The directory where model files will be saved. If not given, self.args['output_dir'] will be used. 193 | show_running_loss (optional): Set to False to prevent running loss from being printed to console. Defaults to True. 194 | **kwargs (optional): Optional changes to the args dict of the model. Any changes made will persist for the model. 
def evaluate(self, testing_samples, testing_labels, output_dir=None, verbose=False, **kwargs):
    """
    Evaluates the model on the given samples. Saves results to output_dir.
    Args:
        testing_samples: an iterable list of texts for testing
        testing_labels: the labels corresponding to the testing samples
        output_dir: The directory where model files will be saved. If not given, self.args['output_dir'] will be used.
        verbose: If verbose, results will be printed to the console on completion of evaluation.
        **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use). E.g. f1=sklearn.metrics.f1_score.
                    A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions.
    Returns:
        result: Dictionary containing evaluation results. (Matthews correlation coefficient, tp, tn, fp, fn)
        model_outputs: List of model outputs for each evaluated sample
        wrong_preds: List of InputExample objects corresponding to each incorrect prediction by the model
    """

    if not output_dir:
        output_dir = self.args['output_dir']

    self.model.to(self.device)

    # Metrics must be forwarded as keywords: passed positionally, the dict
    # lands in _evaluate's `prefix` parameter and the metrics are never run.
    result, model_outputs, wrong_preds = self._evaluate(
        testing_samples, testing_labels, output_dir, **kwargs
    )
    self.results.update(result)

    if verbose:
        print(self.results)

    return result, model_outputs, wrong_preds
269 | """ 270 | self.args.update(kwargs) 271 | 272 | tokenizer = self.tokenizer 273 | device = self.device 274 | model = self.model 275 | args = self.args 276 | eval_output_dir = output_dir 277 | 278 | if not isinstance(testing_samples, list): 279 | try: 280 | testing_samples = list(testing_samples) 281 | except: 282 | raise Exception('Testing samples must be iterable') 283 | 284 | if not isinstance(testing_labels, list): 285 | try: 286 | testing_labels = list(testing_labels) 287 | except: 288 | raise Exception('Testing labels must be iterable') 289 | 290 | results = {} 291 | 292 | eval_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(testing_samples, testing_labels))] 293 | eval_dataset = self._load_and_cache_examples(eval_examples, evaluate=True) 294 | if not os.path.exists(eval_output_dir): 295 | os.makedirs(eval_output_dir) 296 | 297 | eval_sampler = SequentialSampler(eval_dataset) 298 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size']) 299 | 300 | eval_loss = 0.0 301 | nb_eval_steps = 0 302 | preds = None 303 | out_label_ids = None 304 | #for batch in tqdm(eval_dataloader): 305 | for batch in eval_dataloader: 306 | model.eval() 307 | batch = tuple(t.to(device) for t in batch) 308 | 309 | with torch.no_grad(): 310 | inputs = {'input_ids': batch[0], 311 | 'attention_mask': batch[1], 312 | 'labels': batch[3]} 313 | if self.args['model_type'] != 'distilbert': 314 | inputs['token_type_ids'] = batch[2] if self.args['model_type'] in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 315 | outputs = model(**inputs) 316 | tmp_eval_loss, logits = outputs[:2] 317 | 318 | eval_loss += tmp_eval_loss.mean().item() 319 | nb_eval_steps += 1 320 | if preds is None: 321 | preds = logits.detach().cpu().numpy() 322 | out_label_ids = inputs['labels'].detach().cpu().numpy() 323 | else: 324 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 325 | 
out_label_ids = np.append( 326 | out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 327 | 328 | eval_loss = eval_loss / nb_eval_steps 329 | model_outputs = preds 330 | preds = np.argmax(preds, axis=1) 331 | result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs) 332 | results.update(result) 333 | 334 | output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") 335 | with open(output_eval_file, "w") as writer: 336 | for key in sorted(result.keys()): 337 | writer.write("%s = %s\n" % (key, str(result[key]))) 338 | 339 | return results, model_outputs, wrong 340 | 341 | 342 | def _load_and_cache_examples(self, examples, evaluate=False, no_cache=False): 343 | """ 344 | Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures. 345 | Utility function for train() and eval() methods. Not intended to be used directly. 346 | """ 347 | 348 | process_count = self.args['process_count'] 349 | 350 | tokenizer = self.tokenizer 351 | output_mode = 'classification' 352 | args=self.args 353 | 354 | if not os.path.isdir(self.args['cache_dir']): 355 | os.mkdir(self.args['cache_dir']) 356 | 357 | mode = 'dev' if evaluate else 'train' 358 | cached_features_file = os.path.join(args['cache_dir'], f"cached_{mode}_{args['model_type']}_{args['max_seq_length']}_binary") 359 | 360 | if os.path.exists(cached_features_file) and not args['reprocess_input_data'] and not no_cache: 361 | features = torch.load(cached_features_file) 362 | 363 | else: 364 | features = convert_examples_to_features(examples, args['max_seq_length'], tokenizer, output_mode, 365 | # xlnet has a cls token at the end 366 | cls_token_at_end=bool(args['model_type'] in ['xlnet']), 367 | cls_token=tokenizer.cls_token, 368 | cls_token_segment_id=2 if self.args['model_type'] in ['xlnet'] else 0, 369 | sep_token=tokenizer.sep_token, 370 | # roberta uses an extra separator b/w pairs of sentences, cf. 
def _train(self, train_dataset, output_dir, show_running_loss=True):
    """
    Trains the model on train_dataset.
    Utility function to be used by the train() method. Not intended to be used directly.
    Args:
        train_dataset: Dataset whose batches are (input_ids, attention_mask, segment_ids, label_ids).
        output_dir: Directory under which `checkpoint-<step>` sub-directories are written.
        show_running_loss: Print the loss of every batch to the console when True.
    Returns:
        global_step: Number of optimizer steps performed.
        Average training loss per optimizer step (0.0 if no step was performed).
    """
    device = self.device
    model = self.model
    args = self.args

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    # Biases and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    # An explicit warmup_steps wins; otherwise derive it from warmup_ratio.
    if args['warmup_steps'] == 0:
        args['warmup_steps'] = math.ceil(t_total * args['warmup_ratio'])

    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)

    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    tb_writer = SummaryWriter()
    try:
        for epoch in range(int(args['num_train_epochs'])):
            print("Starting Epoch: ", epoch)
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                if args['model_type'] != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args['model_type'] in ['bert', 'xlnet'] else None
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]
                if show_running_loss:
                    print("\rRunning loss: %f" % loss.item(), end='')

                if args['gradient_accumulation_steps'] > 1:
                    loss = loss / args['gradient_accumulation_steps']

                if args['fp16']:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

                tr_loss += loss.item()
                if (step + 1) % args['gradient_accumulation_steps'] == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                        tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args['logging_steps'], global_step)
                        logging_loss = tr_loss

                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                        # Use a separate name: rebinding `output_dir` here would nest
                        # every later checkpoint inside the previous one.
                        checkpoint_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
                        os.makedirs(checkpoint_dir, exist_ok=True)
                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(checkpoint_dir)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tb_writer.close()

    # No optimizer step happens for an empty dataset or when there are fewer
    # batches than gradient_accumulation_steps; avoid dividing by zero.
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss
def finetune(self, train_file_path, eval_file_path):
    """
    Fine tune the probability distribution of the language model on your own text
    Args:
        train_file_path: File containing samples of your text in consecutive lines. No labels necessary
        eval_file_path: File containing samples of your text in consecutive lines, used as the validation set to perform a sanity check on fine tuning
    Returns:
        global_step: Number of training steps
        Average loss per step (0.0 if no step was performed)
    Also saves checkpoints (model, tokenizer, args, optimizer and scheduler state)
    under `<output_dir>/checkpoint-<step>` every `finetune_save_steps` steps and
    after the final step.
    """
    model = self.model
    tokenizer = self.tokenizer
    args = self.args
    print(args)
    train_dataset = LineByLineTextDataset(tokenizer, file_path=train_file_path)

    def collate(examples: List[torch.Tensor]):
        # Pad every batch to its longest sequence, using the pad token when one exists.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args["finetune_batch_size"], collate_fn=collate
    )

    num_epochs = int(args["num_finetune_epochs"])
    # One optimizer step per batch, for every epoch (no gradient accumulation here).
    t_total = len(train_dataloader) * num_epochs

    # Resize and move the model before the optimizer / amp are created so that
    # they are built against the final parameters on the final device.
    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))
    model.to(self.device)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args["finetune_learning_rate"], eps=args["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    resume_path = args["model_name_or_path"]
    if (
        resume_path
        and os.path.isfile(os.path.join(resume_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(resume_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(resume_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(resume_path, "scheduler.pt")))

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # `args` is a dict, so the option must be read by key, not by attribute.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", num_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args["finetune_batch_size"])
    logger.info("  Gradient Accumulation steps = %d", 1)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if resume_path and os.path.exists(resume_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = resume_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader))
            steps_trained_in_current_epoch = global_step % (len(train_dataloader))

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    def save_checkpoint(step):
        # Persist everything needed to resume: weights, tokenizer, args, optimizer, scheduler.
        checkpoint_dir = os.path.join(args["output_dir"], "{}-{}".format("checkpoint", step))
        os.makedirs(checkpoint_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        torch.save(args, os.path.join(checkpoint_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", checkpoint_dir)
        torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", checkpoint_dir)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    tb_writer = SummaryWriter()
    try:
        train_iterator = trange(epochs_trained, num_epochs, desc="Epoch", disable=False)
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs, labels = mask_tokens(batch, tokenizer, args) if args["mlm"] else (batch, batch)
                inputs = inputs.to(args["device"])
                labels = labels.to(args["device"])
                # lm_evaluate() below switches the model to eval mode, so
                # training mode is re-enabled on every step.
                model.train()
                outputs = model(inputs, masked_lm_labels=labels) if args["mlm"] else model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                # Multi-GPU training is not enabled; with DataParallel the loss
                # would need loss.mean() here.

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()

                if args["fp16"]:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if global_step % args["finetune_logging_steps"] == 0:
                    # Log metrics
                    results = self.lm_evaluate(eval_file_path)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["finetune_logging_steps"], global_step)
                    logging_loss = tr_loss

                # Save periodically and after the very last optimizer step, so
                # the final weights are always on disk.
                if global_step % args["finetune_save_steps"] == 0 or global_step == t_total:
                    save_checkpoint(global_step)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tb_writer.close()

    # Nothing to average when the training file yielded no batches.
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss
language model for perplexity on the set provided 770 | Args: 771 | eval_file_path: Location of file containing sample text for validation 772 | prefix: Prefix for saving results of evaluation 773 | TODO: Better saving/removing this argument 774 | Returns: 775 | Final evaluation perplexity score 776 | Saves results of evaluation to output_dir 777 | """ 778 | model = self.model 779 | tokenizer = self.tokenizer 780 | args = self.args 781 | #print(args) 782 | #print("Starting evaluation") 783 | # Loop to handle MNLI double evaluation (matched, mis-matched) 784 | eval_output_dir = args["output_dir"] 785 | 786 | eval_dataset = LineByLineTextDataset(tokenizer, file_path=eval_file_path) 787 | 788 | #if args.local_rank in [-1, 0]: 789 | os.makedirs(eval_output_dir, exist_ok=True) 790 | 791 | def collate(examples: List[torch.Tensor]): 792 | if tokenizer._pad_token is None: 793 | return pad_sequence(examples, batch_first=True) 794 | return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) 795 | 796 | eval_sampler = SequentialSampler(eval_dataset) 797 | eval_dataloader = DataLoader( 798 | eval_dataset, sampler=eval_sampler, batch_size=args["finetune_eval_batch_size"], collate_fn=collate 799 | ) 800 | 801 | # multi-gpu evaluate 802 | #if args.n_gpu > 1: 803 | # model = torch.nn.DataParallel(model) 804 | 805 | # Eval! 
806 | logger.info("***** Running evaluation {} *****".format(prefix)) 807 | logger.info(" Num examples = %d", len(eval_dataset)) 808 | logger.info(" Batch size = %d", args["finetune_eval_batch_size"]) 809 | eval_loss = 0.0 810 | nb_eval_steps = 0 811 | model.to(self.device) 812 | model.eval() 813 | 814 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 815 | inputs, labels = mask_tokens(batch, tokenizer, args) if self.args["mlm"] else (batch, batch) 816 | inputs = inputs.to(args["device"]) 817 | labels = labels.to(args["device"]) 818 | 819 | with torch.no_grad(): 820 | outputs = model(inputs, masked_lm_labels=labels) if self.args["mlm"] else model(inputs, labels=labels) 821 | lm_loss = outputs[0] 822 | eval_loss += lm_loss.mean().item() 823 | nb_eval_steps += 1 824 | 825 | eval_loss = eval_loss / nb_eval_steps 826 | perplexity = torch.exp(torch.tensor(eval_loss)) 827 | 828 | result = {"perplexity": perplexity} 829 | print("Evaluation perplexity: ", result) 830 | output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 831 | with open(output_eval_file, "w") as writer: 832 | logger.info("***** Eval results {} *****".format(prefix)) 833 | for key in sorted(result.keys()): 834 | logger.info(" %s = %s", key, str(result[key])) 835 | writer.write("%s = %s\n" % (key, str(result[key]))) 836 | 837 | return result 838 | 839 | -------------------------------------------------------------------------------- /data/tutorial_test.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "AIDS in prison, treatment costs overwhelm prison budgets",12 3 | olympics security,19 4 | police brutality,12 5 | Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.,16 6 | terror alert raised,16 7 | Job report shows unexpected vigor for US economy,5 8 | Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu,19 9 | Senators debate Iraq War policy,16 10 | Myrtle 
Beach,14 11 | china visit,19 12 | elections in Rwanda,19 13 | Sudan tires of war between Arabs and Christians,19 14 | Enron scandal,12 15 | primaries - McCain,20 16 | US to sign a treaty banning land mines,16 17 | ross perot to run for president,20 18 | European law prohibits American-style buying and selling of personal data,19 19 | clinton's list of donors,20 20 | Guantanamo Bay opposition,2 21 | US demands Iraq disarm in a meeting with allies,16 22 | conservatives attack job training bill,5 23 | Underground rumors and popularity of supposed obesity miracle drug; doctors' call for caution.,3 24 | old style farm in Illinois,4 25 | Questions about whether NYC mayor Bloomberg's plan to impose strict requirements for passing the third grade would not be counter-productive.,6 26 | nyc schools chancellor ousted,6 27 | Fed Reserve Chairman and former treasury sec against dropping tax to stimulate economy,1 28 | supreme court ruling on redistricting,20 29 | Governor of California to sign welfare law to move aid recipients into jobs,13 30 | AT&T may split up,15 31 | immgration crackdown at walmart,9 32 | Urban development programs to replace high-rise public housing,14 33 | jungle trees hides ancient archetectual buildings,19 34 | Difficulties faced by prospective parents whose unborn child is diagnosed with serious conditions.,12 35 | NY school cirriculum,6 36 | witness against Al-Qaeda,19 37 | evidence from 1918 flu found in genetics,3 38 | ground zero reconstuction,21 39 | Growing field of geriatric care managers,3 40 | israel may release arafat,19 41 | billboards and smoking,3 42 | push for a nationwide product liability bill,15 43 | White House selection for new Treasury Secretary,20 44 | Internal pressure for reforms in China; use of centenary of Deng Xiaoping for their promotion.,19 45 | Afghan girls and education,19 46 | Growth in the number of uninsured in US,3 47 | Israel captures hebron,19 48 | plan to revamp Lower Manhatten transit network,10 49 | AIDS 
medication; AZT,3 50 | supreme court ruling on vouchers for private schools,6 51 | Canadian sues US for detaining and beating him for 10 months,2 52 | New spy gear used in Iraq,16 53 | US-French split over iraq,16 54 | get out the vote campaigns,20 55 | Two Congressmen and their recovery from alcoholism,3 56 | retail boom,1 57 | medicare drug plan backed by aarp,3 58 | Blue collar workers whose jobs have gone overseas find themselves abandoned by labor unions,5 59 | State Department annual report finds an increase in terrorist incidents.,16 60 | Japan struggles to adopt Western-style capitalism,19 61 | US and russia sign a pact on nuclear arms cuts,16 62 | New Orleans reconstruction,15 63 | legal battle begins over expansion of government's powers in fighting terrorism,19 64 | independent council investigating the clintons,20 65 | Thanksgiving in immigrant households.,9 66 | genetic engineering regulations,4 67 | E-bay auctions,15 68 | Yeltsin nominates Primakov as Prime Minister,19 69 | ken starr testimony,20 70 | Fire disrupts Brooklyn subway lines,10 71 | war in Iraq:US abuse of prioners in Afghanistan,19 72 | Clinton impeachment trial,20 73 | Afghanistan reconstruction; mine- and explosives-clearing in preparation for airport re-opening.,16 74 | bush legislative plan,20 75 | Britain's royal navy actively recruits homosexuals,19 76 | "Series on life of people with A.L.S., or Lou Gherig's disease.",3 77 | poll on americans' opinions on microsoft,15 78 | "Saudi leaders voice public support for US, but are sensitive to launching military operations",16 79 | Man on death row appeals for clemency,12 80 | NY senate debate,20 81 | more men are experiencing sexual harrassment from other males,2 82 | Chinese Sneak Into Taiwan in Hopes of Prosperity,19 83 | anniversary of waco fire,16 84 | "Clinton Retirement, Calls on Republicans",20 85 | "Damage done to Columbia University's Research, Result of Power Outage in New York City",8 86 | the number of dead and missing 
firefighters,16 87 | Israel studying formerly unthinkable proposals,19 88 | Profile of current population of drug felons in NYC prisons.,12 89 | National Governors' Association decide not to try to change new Federal welfare law; pressure from Congressional Republicans; creating a compromise over benefits for legal immigrants,13 90 | Elian Gonzalez,9 91 | stroke therapy,3 92 | Congressman sentenced in bribery case,20 93 | GOP primary,20 94 | rapid expansion of technology requires area codes in phone calls,17 95 | "US warns China to abstain from military action against Taiwan, Urge for peaceful resolve",19 96 | Kerry fires campaign chief,20 97 | cancer drugs in mice,3 98 | Saudi Arabia faces end of oil boom and beginning of recession,19 99 | sniper attacks,12 100 | Germany accuses U.S. of illegally kidnapping innocent terror suspect; CIA,16 101 | Home prices rise leading to bidding wars,14 102 | Terror's influence on the art world,19 103 | countries cracking down on cartels,18 104 | New discoveries about the nature of brain injury,3 105 | "despite depictions from credit card companies, most Americans bamkrupt due to hard luck",15 106 | US college students under quarantine from SARS,3 107 | murder charge,12 108 | Kofi Annan visits Iraq,19 109 | Tenet healthcare settles fraud accusations,3 110 | Coretta Scott King's funeral service,2 111 | Congressional Republicans and White House officials near budget agreement,1 112 | "Hurricane Katrina, New Orleans police force falling apart",12 113 | car bomb in pakistan outside American Consulate,16 114 | President Bush wants to send more troops to Iraq,16 115 | Democrat Senators-to-Governors; Corzine announcement of run for post of NJ Governor.,20 116 | death penalty debate,12 117 | clinton orders forces to the persian gulf,16 118 | NY times neediest cases fund,17 119 | Narrow approval of Central American free-trade deal.,18 120 | legacy of the los angeles riots,2 121 | USS cole returns to duty,16 122 | Chinese Government 
Publicizes Falun Gong Self-Immolation,19 123 | Ariel Sharon to found a new party,19 124 | U.S. troops capture high number of prisoners in Iraq not protected under Geneva convention,16 125 | water management in the west,21 126 | anti-american sentiment in Falluja,16 127 | black businessmen,15 128 | New York Stock Exchange C.E.O. accused of stealing money,15 129 | freed Hamas prisoners,19 130 | AOL accidentally releases the identity of one of its users to public,2 131 | Former President Bush promotes business interests of equity firm,18 132 | killings in kosovo,16 133 | Women giving birth in Africa,19 134 | Overview of the 1998 campaigns,20 135 | Dedication of WWII Memorial in Washington D.C..,21 136 | NASA experiment fails,17 137 | Federal Panel overrules science fraud charges against Thereza Imanishi- Kari,3 138 | mutual funds trying to assure their investors,12 139 | bickering over Iraq in European countries,16 140 | Shooting of Amadou Diallo results in citizens not trusting police,12 141 | Refusal of Sonia Ghandi to serve as prime minister.,19 142 | Giuliani upset by lesser charges in criminal death of a police officer,12 143 | election problems in duval county florida,20 144 | couriers serve immigrants,9 145 | China has Trouble Becoming Producer of Crocodile Goods,19 146 | Netanyahu becomes first Prime Minister to cross into Gaza to meet with Palestinians,19 147 | "Clinton and Dole fundraise in same areas, benefit from added attention",20 148 | "Presidential election; rise to prominence of Barak Obama, culminating in his selection as keynote speaker at the Democratic convention.",20 149 | NYC councilman killed at city hall,12 150 | Lieberman's presidential candidacy,20 151 | Child welfare in New Jersey,12 152 | Clark enters democratic race for president,20 153 | Liberians ask for US help,16 154 | US generals meet in a palace to discuss rebuilding,16 155 | father who killed at kids hockey game sentenced,12 156 | police killing youth in venezuela,19 157 | Bush 
warns Hussein to allow UN inspectors to search for wmds,16 158 | Catholic priest abuse scandal,12 159 | Scandal involving Tom DeLay's family being paid by his campaign committees,20 160 | death of 2 afghan inmates,19 161 | Bush calls for allies to help with transferring Iraq sovereignty,16 162 | Legal status of terrorist suspects; federal judge's halting of military trial of Osama Bin Laden's driver.,2 163 | soaring birthrates in NYC,3 164 | Berlin: Where it is today compared with the past,19 165 | Orchard owners negotiating with home developers rather than collecting apples,4 166 | Advice to investors and reflection on financial lessons,15 167 | corporation and the community its in,15 168 | Pataki calls for financing to promote environment,7 169 | "Zaire: looking into the new era, economics",19 170 | Philip Morris trying to win favor from New York Legislature,3 171 | people travelling for July 4 holiday,20 172 | New welfare programs allow welfare recipients to keep more benefits when they get jobs,13 173 | guerrillas in the phillipenes,19 174 | hiring of Foreign Service officers,19 175 | Growth in demand for services of individual taking down public pay-phone locations and numbers.,17 176 | Clinton impeachment trial,20 177 | states changing stringent anticrime measures,12 178 | Court awards billions of dollars to New York City schools,6 179 | Chinese trade bill,18 180 | Cleveland: No Cell Phones While Driving,10 181 | NYC Mayor Bloomberg announces large budget proposal and estimates there will be a large surplus.,24 182 | Bin Laden linked to kenya embassy bombing,16 183 | new accounting standards for local and state governments,24 184 | immigration laws,9 185 | information on bin laden whereabouts found,19 186 | U.S. presence in Iraq; U.S. 
propaganda efforts,16 187 | postwar chaos in Mosul leads to anger at US,16 188 | mediator fails to find settlement between California and power suppliers,8 189 | Defense Dept considers changes to military tribunals,2 190 | new building to be built in times square,14 191 | Federal Reserve unexpectedly cuts interest rates,1 192 | group of 14 senators averts showdown over judges,20 193 | democratic convention,20 194 | teenager burns a boat belonging to bush,12 195 | Giuliani defends police officers in confrontation with marchers at Million Youth march,12 196 | strikes in Yugoslavia,19 197 | Discovery of ancient ruins in eastern Utah.,17 198 | Bush administration expanding NSA and bypassing Congress,20 199 | nytimes neediest cases fund ad,17 200 | bosnian election,19 201 | louisiana caucuses,20 202 | congress and white house agree on budget,1 203 | economy trouble,1 204 | new elections in Peru,19 205 | US visa policy called unfair and arbitrary,9 206 | prosecutors want a genetic test of lewinsky's dress,20 207 | Federal appeals rules to continue with recall as scheduled,24 208 | congress to act on firestone tire problems,15 209 | rape of a jogger in central park,12 210 | war in chechnya,19 211 | "Carnival docks cruise ships in Brooklyn, revitalizing the port",14 212 | Kosovo: attack by Serbs ends cease-fire,19 213 | Hyde accedes to Democratic demands on some inquiry issues,20 214 | efforts to move people from welfare to work,13 215 | Iraq reconstruction; major creditors of Iraq agreement for large debt write-off.,16 216 | Gov. 
Pataki negotiates agreement with Consolidated Edison favoring businesses,8 217 | Supreme Court declines to hear Terri Schiavo case,2 218 | Cambodia: leader resists punishing Khmer Rouge,19 219 | US role in Afghan rebuilding,16 220 | Bush's campaign strategy,20 221 | bush proposes medicare overhaul,3 222 | clinton talks about sending peacekeepers in kosovo,16 223 | Negative impact of global economic crisis on Russia,19 224 | legal reform in Morocco,19 225 | 9/11 memorial service,21 226 | white house inquiry-hilary clinton,20 227 | Changing role of juries in the legal system,12 228 | Difficulties in procuring non-oil sources of energy.,8 229 | Senator blocks promotions to get planes for Idaho national guard,16 230 | UN treaty bans nuclear testing,16 231 | terrorism,19 232 | schools looking for principals,6 233 | investigation into anthrax mailings,16 234 | soldiers in Iraq and Afghanistan to have extended tours of duty,16 235 | Saddam Hussein captivity and trial; Iraqi officials expectation that U.S. 
will soon transfer him to their custody.,16 236 | Boeing Company Stock Plummets,15 237 | Stock Market Bubble burst for technology stocks,1 238 | new york legislature,24 239 | campaign finance reform,20 240 | NY times neediest cases fund,13 241 | Clinton impeachment trial,20 242 | vietnamese immigration to the US,9 243 | serbs attacks kosovo rebels,19 244 | louima case,12 245 | Bush lowers expectations of a quick war,16 246 | Afghan tribes come before government,19 247 | Clinton's state of the union,20 248 | Russian bank scandal,12 249 | Underground gambling games in New York City,15 250 | clinton puts sanctions on iran and libya,18 251 | reorganization of board of education HQ,6 252 | investigation into catholic preist abuse scandal made public,12 253 | Vermont sport of hunting fish with firearms.,12 254 | abortion pill,2 255 | bill bradley needs some primary victories,20 256 | investigation of hillary clinton,20 257 | Growth of federally financed tutoring industry as a result of No Child Left Behind,6 258 | Cyclospora outbreak in raspberries,4 259 | Israel-Palestine; death of children in the territories; conflicting accounts.,19 260 | Working for the Clintons; Margaret A. 
Williams,20 261 | Inquiry into problems with U.N.'s Iraq oil-for-food program,19 262 | Presidential elections; Primat der Innenpolitik in both candidates' campaigns.,20 263 | cuban exiles dying off,19 264 | Study of nuns helps with understanding Alzheimer's disease,3 265 | 11 EU countries prepare to introduce the Euro,19 266 | surge in donations to WTC victims leads to items sitting unused,16 267 | NJ senate race,20 268 | Bush looks elsewhere for oil and gas resources after facing opposition in Artic,8 269 | debt in the apple industry after marketing new Red Delicious,4 270 | White House and Rupublican negotiators make push toward agreement on budget and tax cuts,1 271 | intelligence projections,16 272 | Medicaid system not adequately serving poor in the Bronx,3 273 | Federal government producing news clips for positive public relations,20 274 | Elections and violence directed toward election workers in Iraq,19 275 | catholic abuse scandal,12 276 | NJ senate race,20 277 | "Partial, de-facto transfer of powers to Iraqi authorities prior to date of formal devolution of sovereignity.",16 278 | nyc subway station renovations,10 279 | Two Dominicans extradited to the U.S. 
to face drug and murder charges,12 280 | supreme court reviews violence against women law,2 281 | Iraqis angry over Turkish role in reconstruction,19 282 | accidental chinese embassy bombing,16 283 | censorship of school drama material that religious community members call immoral,2 284 | airlines to inspect Boeing fuel pumps,10 285 | Hurricane Katrina victims in new unfamiliar areas,14 286 | cuban embargo,19 287 | Oneida Indian lawsuit against New York for unlawfully acquiring land,21 288 | Supreme court rules on death row prisoners' challenge rights,12 289 | Catholic preist abuse scandal,12 290 | Pakistani foreign bank corrpution,19 291 | Book details Secretary of State Colin Powell's warnings to Bush regarding the invasion of Iraq.,16 292 | 9/11 inquiry,16 293 | Increasing mortality of elderly WWII veterans.,16 294 | Presidential election; voters that remain uncertain after final debate.,20 295 | election campaigns,20 296 | Maine's prescription drug plan approved by Supreme Court,3 297 | Terror Suspect freed of U.S. and allowed to go to Jordan,19 298 | GOP praises clinton trip to china,19 299 | healthcare,3 300 | senate kills nuclear test ban treaty,16 301 | Senator votes to keep bill that allows for overhaling of national political campaign financing,20 302 | Bush plan to change Medicare,3 303 | witness describes Bin Laden plotting against US,16 304 | Paula Jones and her accusations of sexual harassment against Clinton,20 305 | Pentagon trying to attack supreme leader of Taliban,16 306 | Tawana Brawley Trial; raped by white men; Rev. 
Al Sharpton stands as witness,2 307 | Vision,20 308 | Scientist suspected as China spy when hired fired for security breaches,16 309 | animal rights and chinese food,7 310 | Vodafone's AT&T Wireless bid.,15 311 | South Korea on terrorism alert; 2 men attempt to assasinate North Korean defector; Increase in Cold War Tension,19 312 | "Cruise Line, Sexual Assaults Disclosed",12 313 | pictures of combat in Afghanistan,16 314 | US suing intel for antitrust,15 315 | Iraqis change their names from Saddam to avoid being killed by Shiite militia,19 316 | italian election,19 317 | right to protest in hong kong will be curtailed,19 318 | Presidential election; Republican convention; further articles.,20 319 | Israel buries those killed in bus rampage,19 320 | new wtc tower,21 321 | SEC looking for new auidting head,15 322 | The U.S. brings Palestinian and Israeli leaders together for Mideast negotiations,19 323 | Role of lobbyists in Madison Square Garden stadium deal in New York,14 324 | wounded US army soldiers,16 325 | NYC plan to export trash will take more time and money than projected,7 326 | Militas battle for southeastern Iraqi city,19 327 | NATO presses Serbs to release 16 Bosnian citizens,16 328 | 2000 campaign; gun control,12 329 | House to review nation's intelligence agencies,16 330 | crackdown on abusive nursing homes,3 331 | candidates touting their experience,20 332 | Continuing corruption scandal at former Connecticut Governor's office.,24 333 | Chinese obtained U.S. 
technology and arms secrets,16 334 | WHO decides to kill the last smallpox viruses,3 335 | Three-day Jewish centennial conference in Switzerland confronts Holocaust,19 336 | American car makers plan for larger models,10 337 | China acting on fuel economy standards,19 338 | migrant smuggling route through Africa,19 339 | "downturn in telecommunications cutting jobs, affecting region",1 340 | sharon continues israeli assault in west bank,19 341 | story of two suicide bombers,19 342 | Clinton talks to TV meteorologists about global warming,7 343 | republicans urge a curb on gun sales,12 344 | last ditch efforts to collect soft money donations,20 345 | memorial day,20 346 | Bush trying to smooth relations with McCain,20 347 | New York's immigrations courts hurt by increased burden,9 348 | Assassination of Russian-backed Chechen president.,19 349 | Virginity Testing in Africa,19 350 | Traffic in body parts discovered at UCLA.,3 351 | Presidents of top universities to step down,6 352 | Improvement among America's elementary school students,6 353 | U.S. 
intelligence officials had received warning of coming attack on American Embassy,16 354 | terror insurance in Iraq,19 355 | Vietnam remembers Vietnam War,19 356 | Democratic primary; significance of Southern states.,20 357 | congressional resolution on Iraq,16 358 | Campaign has raised money for Clinton legal defense,20 359 | President Clinton; sex case,20 360 | Pentagon succeeds in shooting down an IBM with an interceptor,16 361 | investigation into bombing of UN building in Iraq,16 362 | new york vacation spots and the clintons,20 363 | welfare caseworkers,13 364 | Iraq insurgency; assault by Marines on city held by Shiite milita.,16 365 | Clinton Impeachment Trial,20 366 | gun control debate and suburban districts,12 367 | new products with the Windows operating system.,15 368 | plans for WTC site,14 369 | Crew withdraws threat to resign after tensions with Giuliani over voucher issue eases,6 370 | AT&T to cut long-distance rates,17 371 | China's Need for Metal Keeps U.S. Scrap Dealers Scrounging,18 372 | gay men in military,2 373 | presidential election results,20 374 | "Iran drops Rushdie death threat, and Britain restores full diplomatic relation with Tehran",19 375 | Gore's debate style,20 376 | expressing doubt about document.,20 377 | Higher rate of AIDS incidence in African Americans; role of prison-time in spreading AIDS in black neighborhoods.,3 378 | chief executive of Ford Motor Company to resign,15 379 | stock market falls,1 380 | clinton to require welfare recipients to work,13 381 | suburbs struggling to keep elderly population,14 382 | speech and language gene found,3 383 | bad doctors,3 384 | Clinton Impeachment Trial,20 385 | NY senator will not run again,20 386 | Clinton Urges House to Settle Its Differences Over Gingrich,20 387 | Lott apologizes for Thurmond comments but won't resign,20 388 | Comcast's Disney bid; probable effect on the media industry.,15 389 | Nuclear proliferation; Pakistan gov. 
admits some of its citizens may have sold data on nuclear weapons.,16 390 | Two men revive inquiry on Waco,16 391 | cooling of real estate merkets,14 392 | Clinton administration cover-up scandal,20 393 | Comission absolves Prime Minister Netanyahu of attack on Hamas official,19 394 | Presidential election; determination of African-Americans to avoid 2000 experience and make their ballots count.,20 395 | Clinton Impeachment Trial,20 396 | US starts direct combat with ground troops in Afghanistan,16 397 | Party change in Minnesota legislature mirrors what will happen to the nation's capitol in a few weeks,20 398 | Protest over killing of Amadou Diallo at memorial service,12 399 | Growing Iraqi Army; capture of men responsible for shooting down helicopter,16 400 | G 7 summit meeting,19 401 | pension changes,5 402 | UN weapons inspectors visit Iran,16 403 | Iraq insurgency; Falluja assault; re-capture of one-third of the city.,16 404 | Iraqi self-rule delayed,16 405 | Ford Explorer problems,15 406 | "Hurricane Katrina aftermath, Bush visits the area",15 407 | Man on a box in Abu Ghraib,19 408 | Order by Iraqi government for arrest of Ahmad Chalabi on charges of counterfeiting.,19 409 | No Child Left Behind Act requires annual testing in math and reading,6 410 | Clinton's defenders attack the credibility of Paula Jones,20 411 | Iraq Crisis: Plans for strike; diplomacy,16 412 | "Democrats gain power in Senate, but presents task of leading effectively",20 413 | Japan tells U.S. 
that their banking system is acutely short of capital,19 414 | Bush's appointment of the Deputy Director of National Intelligence,20 415 | arafat and the mideast violence,19 416 | MCI offers local residents telephone service,17 417 | clinton scandal,20 418 | Israel's ground war raises potential for casualties,19 419 | counterterrorism in Belgium,19 420 | Terri Schiavo case emboldens religious right,2 421 | Saddam Hussein trial,16 422 | Serbs continue to displace ethnic Albanians and force them to flee,19 423 | rape of a jogger in central park,12 424 | Growth of the richest class in the U.S.,1 425 | Petty Officers admits Japanese trawler was on radar before accident,16 426 | "Insurgency in Afghanistan, Navy Seal rescued",16 427 | Israeli Prime Minister Netanyahu begins aggressive campaign at home after returning from the U.S.,19 428 | Modernization in China threatens traditional tribes,19 429 | NY construction wall fell,10 430 | Iran President delivers inaugural address,19 431 | Peru election,19 432 | supreme court nominee John Roberts,20 433 | new york times wins pulitzers for 9/11 coverage,17 434 | Internet users loosing initial draw to eclectic possibilities,17 435 | clinton scandal,20 436 | teenage brothers admit to killing their father,12 437 | lebanon and israel fighting,19 438 | Iraqi premier moves to establish regional talks,19 439 | Bush administration and problems with Middle East foreign policy,19 440 | Republicans split after impeachment issue,20 441 | gay and lesbian pride parade,2 442 | NYC budget woes; cuts in police force,12 443 | peru hostage crisis,19 444 | MTA to use video surveillance in subways,10 445 | heirs of song suing inspiration for use of song title,15 446 | a body of an everest climber found,12 447 | Republicans not getting any help from economy in reelection bids,20 448 | Schundler's campaign for NJ governor lagging behind opponent,24 449 | lobbyists and medicare,20 450 | class action sexual harassment lawsuit at smith barney,15 451 | 
How Clinton will be judged in Lewinsky scandal,20 452 | Japan-China ecomonic ties,19 453 | Escape of a U.S. hostage; death of soldiers in attacks.,16 454 | meetings about iraq policy,16 455 | priest abuse scandal,12 456 | Commerce Department's regulation of satellites may harm American satellite makers in foreign markets,18 457 | Afghan government moving forward with efforts to get Osama Bin Laden to leave,16 458 | abortion vote,2 459 | Nationalist veterans demand more retirement benefits fromTaiwan's Government,19 460 | NYC Mayor Michael Bloomberg's property-tax rebate proposal,24 461 | Congressional elections,20 462 | Migrants in Mexico endure poor conditions,19 463 | Prudential Insurance Company; Arthur Ryan asks NJ comissioner to investigate after customer complaints,15 464 | high school suffers with loss of loved ones in terrorist attacks,16 465 | End of assault weapons ban had little effect despite predictions,12 466 | Iraqis killed in suicide bomb attack,19 467 | IMF bailout of south korea,19 468 | massacre of Kosovo men by Serbs,19 469 | supreme court upholds new campaign finance law,20 470 | "Chileans hail death of Augosto Pinochet, but violence mars celebration",19 471 | Encephalitis virus in New York is much more serious,3 472 | justice investigation involves Clinton; campaign financing,20 473 | transit strike in NYC,10 474 | Giuliani criticized on city charter plan,24 475 | return of exiles to Iraq,16 476 | hawaiian estate controversy,24 477 | special military tribunals to try foreigners charged with terrorism,19 478 | news anchor injured in Iraq,16 479 | more job losses,1 480 | pentagon keeping helicopters away from NATO forces,16 481 | Gephardt's views on gays,2 482 | George Allen falters in U.S. 
Senate race in Virginia,20 483 | terrorism arrest,16 484 | areas offering sanctuary.,16 485 | Iraq blames US for market explosion,16 486 | losses from terrorist attacks could force major airline carriers to bankruptcy,10 487 | election reform bill passed,20 488 | Economic policy leaders from the U.S. and Japan fail to agree on global economic cure,19 489 | US wants defections from iraq,16 490 | Serbian war criminal,16 491 | American spacecraft lands on Mars,17 492 | textbook explaining how-to terrorist activities introduced in embassy bombing trials,16 493 | Presidential election/Democratic primary; further articles.,20 494 | Executive branch failures in preventing 9/11; NSC advisor Rice given memo about Al Qaeda,16 495 | peace in chechnya,19 496 | McCain's wife's new role,20 497 | Saddam Hussein was a regional terror for 30 years,16 498 | Senate races,20 499 | arson in northern ireland,19 500 | Death of Yassir Arafat,19 501 | Scientists using genes to enhance breeding of crops and livestock,4 502 | plea tossed in Iraqi abuse case,19 503 | Iraq reconstruction; large-scale billing fraud by American security company.,16 504 | China tightens rein on freedom of speech and press,19 505 | Iraq insurgency; assault on Falluja; Marines' experience.,16 506 | Board to determine which airlines to aid,10 507 | Clinton's Brother-in-law paid for lobbying pardons,20 508 | Britain: drug testing laws,19 509 | Madeleine Albright brings fighting Kosovo together for peace talks,16 510 | Britain: ruling to allow gay soldiers,19 511 | U.S. is debating talks with Iran over nukes,16 512 | U.S. 
may try new approach with North Korea,19 513 | New drug to prevent heart failure,3 514 | princess diana divorce,19 515 | Debate on what should be built on the site of the WTC.,14 516 | Quantum experts win Nobel Prizes,17 517 | pinochet can be extradited to stand trial,19 518 | 9/11 aftermath; behavior of office workers near Ground Zero.,16 519 | Mideast violence,19 520 | Bush administration's decision to oppose lawsuits of drug and medical device manufacturers for faulty products.,3 521 | Iraqi and American forces kill insurgents,16 522 | tobacco company damages,3 523 | Presidential election; Bush's National Guard service; investigation into Bush's activities during the period.,20 524 | Video of Jose Padilla reveals life of terror suspect,16 525 | Merger between Kmart and Sears.,15 526 | senate committee divided on clinton's air force secretary nomination,20 527 | Yemen and its foreign policy,19 528 | China: Taklimakan Desert to be cultivated,19 529 | Clinton promises veto in Republican tax cut,20 530 | editor admits crack expose was flawed,17 531 | Superintendant of New York City school district get success in tough area,6 532 | market rally,1 533 | General says US is still at war in Iraq,16 534 | Shortage of nurses in African countries due to large-scale emigration to developed world.,19 535 | "Summer School wrongly ordered for 8,600 students",6 536 | Syrian president buried,19 537 | Other countries question fairness of international aid policies after Asian tsunami,19 538 | ny welfare reform,13 539 | Judge refuses postponement of McVeigh execution,12 540 | louima case,12 541 | child porn case,12 542 | livery cabs a growing problem in NYC,14 543 | Newark in worse shape now than in 1967,14 544 | Donor to Democratic Party accused of receiving foreigners' cash,20 545 | NYSE may move to new jersey,15 546 | Iraq elections; Sunni Arabs' statements that their followers could boycott the election.,19 547 | Congress tries to limit Drug Cartel Money Launderers from sending 
money to Columbia,12 548 | EPA to clean up homes poisoned by 9/11 dust and ash,7 549 | Bush vows to aid countries in war on terror,16 550 | ross perot barred from debates,20 551 | NJ senate race,20 552 | drunk driver sentence,12 553 | Question whether Viagra will improve the sex life of women,3 554 | Saddam Hussein war crimes trial,16 555 | Exxon and Mobil oil merger,15 556 | Federal court upholds law giving notice of sex offenders,12 557 | juror's education,12 558 | Nuclear proliferation; role of network organized by Pakistani; further revelations thereon.,16 559 | south africa gun law debate,19 560 | China Transitions Leadership Peacefully; Hu Jintao,19 561 | "Death toll rises in Lebanon, Lebanese Prime Minister calls for international involvement",19 562 | Eliot Spitzer and New York gubernatorial race,24 563 | "Roberts Confirmation, relationship between the courts and congress",20 564 | fish going extinct in the hudson,4 565 | Aftermath of meeting between President Clinton and nation's most powerful bankers,15 566 | 2000 campaign for vice president,20 567 | Efforts by Shiite leaders to persuade Moqtada al-Sadr to withdraw militia units and permit the deployment of Iraqi government forces.,16 568 | Trent Lott tries to fix consumer price index,1 569 | Security against a potential New Year's terrorist attack.,16 570 | bush demands israeli withdrawal,19 571 | piracy in mexico,15 572 | new land conservation effort,7 573 | Hezbollah and Israel both choose violence to resolve recent conflict,19 574 | UN allows Iraq to export oil to help civilian population,19 575 | Poor conditions in Russia impair the ability of figure skaters to practice,19 576 | War on terror; U.S. government claims that Osama Bin Laden is personally preparing an attack on U.S. 
soil.,16 577 | Dentists notice a rise in Meth use,3 578 | F.B.I investigates Democratic campaign money,20 579 | Haitian crisis; seizure of second-largest city by rebels.,19 580 | Google buys out YouTube,15 581 | some refusing to pay taxes,1 582 | air traffic controllers,10 583 | california governor and abortion,2 584 | Democrats hopeful of success in 2006 elections,20 585 | Photo: victims of 9/11 honored,16 586 | Female Condom: Important Weapon against AIDS,3 587 | white house says prewar Iraq intelligence was flawed,16 588 | Photo-Hilary Clinton visits Harrient Tubman Learing Center in Harlem,20 589 | Improving political style of NYC Mayor Michael Bloomberg.,24 590 | Turkey planning to occupy iraq in the event of war to prevent refugee entrance,19 591 | "Corruption, spying, and leaks in Silicon Valley",15 592 | Elizabeth Dole; Red Cross; Presidential Campiagn,20 593 | NJ sprawl,14 594 | funeral for a sniper victim,12 595 | arab leaders to meet,19 596 | Enron scandal; plight of workers rendered unemployed by corporation's collapse.,15 597 | fringe parties in NY politics,24 598 | speculation that russian president is sick,19 599 | Tobacco industry to gain from settlement,3 600 | antiterror in europe,19 601 | Proposed Freedom center at ground zero,21 602 | Bush administration scaling back oil drilling in Gulf of Mexico,8 603 | South Korea wants longer range missiles,19 604 | recall probable in CA,24 605 | "Oklahoma City Bombing: trial, friend sticks to story in cross-examination",16 606 | Israel's Barak decides to quit politics,19 607 | indian politician,19 608 | plane crash,10 609 | Gore attacks Bush tax cut plan,1 610 | chinese trade bill,18 611 | astronomy satellite,17 612 | chaos in Liberia,19 613 | UN troops leaving haiti,19 614 | Indian economy,19 615 | Federal budget,1 616 | concern over doctor/investor relationships,3 617 | Suspects in Madrid attacks blow themselves up after being surrounded by police.,19 618 | hand recounts in florida can continue,20 619 | 
Milosevic trial will test international law,19 620 | Health expenditures in the United States as a proportion of GDP.,3 621 | Tobacco companies selling cigarettes to traders to funnel them into black markets,12 622 | Tough sentence for former WorldCom chairman,12 623 | gore campaign,20 624 | Free Trade Zone of the Americas given the go-ahead,18 625 | Former Private Secretary reveals information about deals made between narcotics traffickers and political leaders,12 626 | Al Qaeda defector used by prosecution in terrorism cases.,16 627 | 2000 campaign-cheney chosen,20 628 | Clinton Impeachment Trial,20 629 | religious practices vs. health concerns; New York City politics,2 630 | American spy plane lands in China after crashing with Chinese fighter jet,16 631 | Auto industry; reduce SUV emissions,7 632 | Senate and White House promoting measures that increase use of ethanol,8 633 | Promotion of John Edwards as running-mate of Kerry.,20 634 | Europeans debate US plan for UN involvement in Iraq,19 635 | E.P.A:air quality standards,7 636 | Abu Ghraib scandal; Bush apology combined with continued support for Rumsfeld.,19 637 | immigrants in suburbia,9 638 | inquiry into fraud by MCI,12 639 | Domestic surveillance,2 640 | chinese dissident sent to the US,19 641 | Bush's pick for secretary of defense,20 642 | Pataki barely breaking a sweat in race for second term as Governor,24 643 | New Jersey jail raid,12 644 | Kennedy relative sentenced in a murder trial,12 645 | criminal inquiry leads to raid in Marine unit,16 646 | takeover battle for sprint,15 647 | German leader warns about iraq war,16 648 | auto industry mileage plan,10 649 | Death of Arafat; analysis.,19 650 | State of the stock markets; analysis of signs of recovery.,1 651 | Standoff at Falluja; discussion in American command whether U.S. 
should pull out of the city.,16 652 | refugees in Kosovo need food,19 653 | CIA and FBI agree to truce,16 654 | panel says US should require insurance to pay for vaccines,3 655 | Martha Stewart trial; dismissal of most serious charge.,12 656 | cocaine fight in columbia,12 657 | Abu Ghraib scandal; order by U.S. commander in Iraq to halt use of all coercive interrogation techniques.,19 658 | jack kemp,20 659 | hummers,10 660 | Stalemated election,20 661 | Deaf Mexican immigrants held captive in North Carolina,9 662 | US near a trade deal with china,18 663 | Enron scandal,15 664 | UN resolution on Iraq's future,16 665 | Real estate broker completes largest transaction in U.S. history and buys property along East River,15 666 | U.S. has not been tracking weapons intended for Iraqi security forces,16 667 | clinton in bosnia,19 668 | reactions to the start of the Iraq war in America,16 669 | Bush and Cheney reaching out to democrats,20 670 | Hezbollah works to rebuild Lebanon to win popular support,19 671 | "peer-to-peer services being used for pornography, not just music",17 672 | in NYC.,21 673 | undecided washington sentate race,20 674 | France reveals evidence against Nazi war criminal to stand trial,19 675 | mentally ill health care,3 676 | Special report on use of wireless technology in America.,17 677 | 2000 campaign; bush after college,20 678 | first soldier killed in Afghanistan buried,16 679 | Washington State voters face affirmative action measure,2 680 | egyptians joining the palestinian cause,19 681 | IRA to help disarm ulster fighters,19 682 | Hussein rallies his troops,19 683 | bob dole challenges clinton's ethics,20 684 | Remembering Dr. 
Martin Luther King Jr,2 685 | Increasing nuclear proliferation despite diplomatic agreements; damage wrought by export of Pakistani nuclear expertise.,16 686 | Republicans questioning of Bush's ban on stem-cell research.,3 687 | new subpoenas over campaign finance violations,20 688 | Chinese trade bill,18 689 | Veteran health care,16 690 | women in India changing their roles,19 691 | new communications law,17 692 | poll on opinions of new yorkers,24 693 | Federal investigators link deaths to same suspect,12 694 | shootout in the bronx,12 695 | technological breakthrough in computing,17 696 | Revelation of prescient pre-war report about danger of post-Saddam Iraqi civil war; administration attempts to minimize its significance.,16 697 | Iraq's oil industry,19 698 | Britain: Prime Minister Campaigns,19 699 | Iraqi war casualties are up sharply,16 700 | US donations to jewish settlers,19 701 | "Bush announces government will take stronger role in airline security, will station troops",10 702 | hospital worker died of anthrax inhalation,16 703 | Shooting at the Empire State Building; the gunman,12 704 | limits to Putin's power,19 705 | graves uncovered in Sri Lanka have not lead to charges,19 706 | "Indian computer security is bad, files at risk",21 707 | fossils of second largest dinosaur found in Egypt,19 708 | naturalized citizens in NY,9 709 | NATO: Russia agrees to alliance expansion,16 710 | Virginia offer of scholarships to black students denied access to high school during segregation wars.,2 711 | greenspan says the economy is good,1 712 | FBI agent charged with spying for russia,16 713 | California plan for large-scale cutbacks in greenhouse-gas emissions.,7 714 | Microsoft antitrust case,15 715 | House approved bill to turn airport security to government,10 716 | irish peace referendum,19 717 | Trinity College and the revival of Hartford,6 718 | Clinton heart surgery.,20 719 | online prescription drug sales,3 720 | Sotheby's chairman convicted of 
price-fixing,12 721 | Key role of 9/11 widows in formation and activities of the 9/11 Comission.,16 722 | Rice reviews progress in Iraq; rejects exit strategy,16 723 | Al Qaeda in Karachi,16 724 | China to Protect Private Property Rights; Boon to Entreprenuerial Class,19 725 | welfare reform in Italy,19 726 | More on the Clinton inauguration,20 727 | South Carolina campaign,20 728 | Employment trends in the U.S.,1 729 | "stock slump hurts 401Ks, makes many rethink retirement plans",15 730 | European opinion,19 731 | Clinton denies Paula Jones' accusations,20 732 | exit polling from the presidential election,20 733 | U.S. shift to support cease fire in Lebanon started frantic round of negotiations in U.N.,19 734 | senate approves online contracts,15 735 | tax plans in campaign,1 736 | Bush calls for end to loans to buy stock,15 737 | Panama takes control of the Panama canal,19 738 | halliburton overcharging for fuel,16 739 | Foul Air and Water Part of Cost of Boom in China's Exports,19 740 | housing conditions for the poor,14 741 | drug review process,3 742 | Gates to create foundation to bring internet into public libraries,6 743 | Cubans still struggling to make ends meet,19 744 | Supreme Court nomination; partisan dispute,20 745 | Bali bombing,19 746 | Doctors' pay regains ground despite the effects of HMOs,3 747 | suicide scandal in Germany,19 748 | Cruise lines pay little income tax because of loophole in tax law,10 749 | More Palestinian and Israeli struggle,19 750 | Japanese elections,19 751 | burning of chemical weapons,7 752 | "New York law holding car owners liable for car accidents, whatever the driver, limiting car-leasing in the State.",10 753 | "weather data predicts years of frequent, stronger hurricanes",17 754 | tobacco settlement money held up by new york politics,3 755 | FTC ruling on doctors to let them band together,3 756 | israel and lebanon relations,19 757 | NYC Chancellor cuts school budget by cutting program spending,6 758 | I.B.M. 
guilty of selling advanced computers to Russian nuclear weapons laboratory,12 759 | Hugo Chavez opposition,19 760 | heightened terror alert,16 761 | Meningitis Epidemic in West Africa,19 762 | Iraqi constitution to be voted on,19 763 | uprising in Ivory Coast,19 764 | Research showing that aspirin use can help prevent breast cancer.,3 765 | more questions arising over Clinton pardons,20 766 | Prosecutors stop Haitian murder suspect's efforts to leave country,12 767 | fraud claim in Iraqi election,19 768 | Businesses use people's YouTube and MySpace videos as free advertising,17 769 | Cleanliness takes a back seat to financial survival in airline industry,10 770 | scores of top students rise while those of average students decline on reading tests,6 771 | mideast violence,19 772 | applying for college,6 773 | Israel requests shipment of U.S. artillery rockets,16 774 | "Bush threatens to veto Senate's patients' bill of rights, Democrats say he'll have to accept it",3 775 | Difficulties for NYC mayor Michael Bloomberg in playing host to the Republican convention.,20 776 | states propose reducing medicaid,3 777 | unauthorized wiretaps,2 778 | house races,20 779 | debate on war strategy disappears after advance on Baghdad,16 780 | British bracing for Bush protesters,19 781 | Bush rejects a quick pullout from Iraq,16 782 | interrogation of terror suspects,16 783 | NYC to pay overtime to police and fire chiefs despite concerns,12 784 | "France: parliament elections, little mention of economic plan",19 785 | campigner for US senate avoids SEC investigation,20 786 | Declining energy prices has caused the U.S. to give up all gains made in conserving energy,8 787 | first Latino becomes Los Angeles mayor in more than a century,24 788 | "New York City Budget, tax cuts",24 789 | Young blacks link tobacco use to marijuana,3 790 | U.S. 
to restore relations with Libya,19 791 | Bolivian Leader in Exile After Efforts to Eradicate Coca,19 792 | Chinese backlash against closing of news journal; censorship,19 793 | use of carbon fuel declining,7 794 | Sky scrapers advertising value,15 795 | Europeans begin to fear growing Muslim minorities in their countries,19 796 | bombing in jerusalem,19 797 | Clinton urges bipartisanship to get budget negotiations moving again,1 798 | Bush prepares for war,16 799 | Sunni militia force Shiite bakeries in Baghdad to close,19 800 | us peace efforts in the middle east,19 801 | Pressure grows on GOP House leadership over Foley scandal,20 802 | new organ transplant strategies,3 803 | Bankers Trust Company admits to diverting money to enhance financial performance,12 804 | Lebanon's ex-premier killed in car bomb attack; Syrian influence in Lebanon,19 805 | Roberts Confirmation hearings,20 806 | 3 rich kids moved around after mother kills their father,12 807 | copyright law changes,15 808 | Bush mideast speech,19 809 | auto industry agrees to design changes to improve safety,10 810 | soldier leaving for iraq,16 811 | "witness accuses former boss for price-fixing between Sotheby's, Christy's",12 812 | Law Enforcement fears that domestic terrorist attacks are linked by white supremacists,12 813 | Controversy over acquisition of U.S. 
ports by state-owned Middle-Eastern countries.,21 814 | White house dealings with enron,20 815 | wealthy taking year off before starting college,6 816 | Debate over who will pay for repairs of beach erosion in New Jersey,7 817 | 1996 Election; public opinion polls,20 818 | Israeli commandos raid Hezbollah stronghold despite truce,19 819 | Stricter elementary school standards,6 820 | Questionable legal practices in Checnya highlighted in torture of woman accused of adultery,19 821 | Saddam Hussein; Life in Iraq,19 822 | re-designed station wagons enter the auto market,10 823 | US tells citizens in India to leave,19 824 | "Internet message boards allows company employees to vent, sometimes ugly, conversations",5 825 | europe bans british beef,19 826 | priest abuse scandal,12 827 | terror suspect,16 828 | Iran blamed for the killing of Iranian dissidents in Germany,19 829 | Supreme court rules on recruiting at universities,16 830 | India Genral Election; Congress Party Losing Power,19 831 | Generic AIDS Drug makers want to sell in South Africa,19 832 | Taiwan election,19 833 | Juveniles punished for killing 5 people in school shooting,6 834 | calling up reserves in the iraq war,16 835 | Murders of women along the Mexican border,19 836 | the political future of Indonesia,19 837 | Clintom impeachment trial,20 838 | Fish market in NYC closing,15 839 | elderly people in philadelphia,3 840 | Bush won't continue plan to rid of weapons with plutonium,16 841 | Al Sharpton's bid for the Presidency,20 842 | kennedy and castro,19 843 | Markets Surge after Investor concers about inflation eased,1 844 | NJ doctors protest high insurance costs,3 845 | Bush seeks to expand NAFTA throughout Central and South America,18 846 | Bombings in London,19 847 | communications equipment maker announces lay-offs,5 848 | The difficuluties of scheduling a war; Olympics; Islamic holidays,16 849 | New York traffic court,12 850 | "Russian President fired Prime Minister and appointed a former KGB 
officer, Vladmir Putin",19 851 | conjoined twins separated,3 852 | Livestock testing at state fairs,4 853 | NJ budget problems,24 854 | NJ troopers using hotel staffs to stop drug smugglers,12 855 | Iraqi Prime Minister denounces Israeli attacks on Lebanon,19 856 | medicare-prescription drug benefits,3 857 | north korea has access to plutonium,16 858 | Comcast bid for Disney; probable strategy of Disney leader to counter it.,15 859 | surrogate mothers have babies for gay couples,2 860 | Civilian death toll in Iraq reaches new high,16 861 | culture war in Israel,19 862 | US plans for a palestinian state,19 863 | Guilty verdict on prominent investment banker.,12 864 | mourning a school shooting in scotland,19 865 | US attacks iraq,16 866 | Tactics of American unit against Shiite militia of Moqtada al-Sadr.,16 867 | Red Cross criticizes Guantanamo Bay,19 868 | Israeli Prime Minister; Isreali cabinet,19 869 | virus on a cruise ship,3 870 | Russian president's plan to tighten executive control over the legislative branch and local governents; stated rationale in terrorist threat.,19 871 | Chief executives see 22 percent raise in salary in last decade,5 872 | political terrorists kill Cambodians at democratic rally,19 873 | John Kerry and the Cambodian Swift Boat incident,20 874 | "Health care costs, Medicare",3 875 | recount analysis,20 876 | poll find NYC split over mayoral candidates,24 877 | Democratic primary; further articles.,20 878 | Human embryo cloning in South Korea; significance.,3 879 | NYC school board dispute,6 880 | American offensive against Shiite militia.,16 881 | supreme court ends a ban on ads for casino gambling,15 882 | Abortion doctor eulogized as killer is sought,2 883 | US reliance on Saudi oil,8 884 | School uniforms in Public School 7 in New York,6 885 | bob dole candidacy,20 886 | Bush talks about AIDS in South Africa,19 887 | Army veteran accuses her top-ranked Army boss with sexual assault,16 888 | nytimes neediest cases charity,9 889 | Abu 
Ghraib prisoner abuse scandal; interrogation unit alleges having supplied early reports to superior officers.,19 890 | big landlord,14 891 | Mad Cow disease in the US,4 892 | Reminder that standard time has resumed,15 893 | Iranian politics,19 894 | Afghani president's description of private armies as the principal threat to his country.,19 895 | mergers in europe,19 896 | Isreali curfew in nablus,19 897 | Immigration debate in Congress,9 898 | German election results,19 899 | man had hand cut off by Taliban on charge of theft,19 900 | israel and lebanon occupation,19 901 | Merging of American Companies,15 902 | Profile of American contractor beheaded in Iraq.,16 903 | Theory of gene flaw proposed to explain evolution of human beings.,17 904 | Life of John Roberts,20 905 | Inefficient evidence to look into Interior Secretary in his role in denial of Indian casino application,21 906 | peace in Angola,19 907 | bad week for Italian PM,19 908 | Senate rejects rival proposal to campaign finance law,20 909 | radio communication in NYC,12 910 | Re-development of WTO site.,21 911 | patents on drugs end,3 912 | nyc labor disputes,5 913 | change in laws causes deportations,9 914 | Chinese Poor Struggle with HIV/AIDS,19 915 | smallpox vaccine,16 916 | High cost of potentially effective anti-cancer drug,3 917 | airport security,10 918 | Police protest over pay,12 919 | Internet message boards facilitate dialogue on race,17 920 | negotiations on interim Afghan government stall,16 921 | captured spy chief returns to Peru,19 922 | Sheriffs department in columbine under scrutiny,6 923 | Supreme Court divided on interpretation of Clean Water Act,7 924 | Senate confirms Ashcroft,20 925 | campaign finance limits,20 926 | Chicago orders big retail stores to raise minimum wage,5 927 | Army recruiting helped by bad economy,16 928 | Federal Reserve considers cutting interest rates,1 929 | Conservative Christian politician hurt by ties to Jack Abramoff lobbying scandal,20 930 | Bush's 
Social Security reform plan,13 931 | osteoporosis gene found,3 932 | defect in heart devices,3 933 | US dispatches agents to Germany to uncover terrorist network,16 934 | bacteria in chickens,4 935 | people who live in homes where BTK murdered,12 936 | affirmative action poll,2 937 | suicide attack in Israel,19 938 | Space exploration; infrequent success therein and high costs thereof.,17 939 | politicians on parade,20 940 | whitewater scandal trial,20 941 | AT&T's withdrawal from the residential-phone business.,17 942 | ground zero cleanup nears the end,16 943 | large purchase of woods and wetlands for public use and preservation,7 944 | Danger to VA mental health services from large estimated numbers of future patients among veterans of Iraq war.,16 945 | Dow Chemical Company knowingly deceived women on breast implants,3 946 | Clinton scanal fallout,20 947 | lobbyists sway legislators on Chinese trade policy,18 948 | Standard time resumes,15 949 | second day of blackout continues as company makes accusations,8 950 | bob dole attacks clinton in the presidential debate,20 951 | PHOTO: Senator John Glenn at Brooks Air Force Base,20 952 | New York's hospitals receive Federal aid package,3 953 | East Germany's economic revival,19 954 | shady democratic party fund raising,20 955 | Bush aides defending budget in Congress,1 956 | New York State Legislature approves early retirement for most experienced teachers,6 957 | "fireworks explosions in Lima, Peru kill hundreds",19 958 | House Democrats choose Steny Hoyer as House majority leader over Nancy Pelosi's choice,20 959 | McCain endorses Bush,20 960 | fighting in the mideast,19 961 | reform party presidential candidate,20 962 | Iraq; Sadam Hussein; split in Arab nations,19 963 | inquiry into columbia breakup,17 964 | barak agrees to halt settlements in the west bank,19 965 | Comcast's Disney bid; rejection by Disney.,15 966 | welfare and the states,13 967 | NY mayor race,24 968 | Bush challenges Mideast to try 
democracy,16 969 | Swiss failed to payback Nazi payments after WWII,19 970 | North Korea fires ballistic missile over Japan,19 971 | US soldiers killed in Iraq,16 972 | Medpartners/Mullikin to Buy Caremark International; Creation of A Large Physician Management Company,15 973 | guilty verdict in a wendy's murder case,12 974 | Bosnian forces pull back troops and weapons from front line,19 975 | Iowa house race,20 976 | terrorism,19 977 | chief fundraiser for Senator may have received illegal campaign contributions,20 978 | Iraqis want more power to control themselves,16 979 | Prosecution of NYC school superintendent charged with $1m embezzlementt; difficulties in detecting embezzlement in districts with high per-student spending.,12 980 | "FDA proposes ban on importing blood from Europe, upsetting European suppliers",3 981 | indonesian cleric falls ill,19 982 | "Japan's Economy in Debt, despite Japanese ability to save",19 983 | GAO files suit over cheney energy meetings,20 984 | "Oxycontin sales grew, but at a cost",3 985 | costs on loans,15 986 | Bristol Myers Squibb to Yield Patent Rights Over Aids Drugs in Africa,19 987 | china trade deal,18 988 | "Former United States Housing Secretary pleads guilty, lied to FBI",20 989 | Middle East politics; Ariel Sharon,19 990 | Governor-elect Eliot Spitzer likely to ask for ouster of State Comptroller Alan Hevesi,24 991 | domestic violence courts in NY,12 992 | China Bans Text Messaging/E-mail in Protests Against Japan,19 993 | Lacking relief and repair effort in Tsunami ravaged Indonesia,19 994 | Bush declared winner by Florida,20 995 | Palestinians and Israels hold off peace talks despite Madeleine Albright's visit,19 996 | stock market,1 997 | china sending missiles to iran,16 998 | Bomb attacks in Iraq; U.S. 
soldiers killed.,16 999 | Tora Bora offers many hiding places for Al Qaeda fighters,16 1000 | -------------------------------------------------------------------------------- /data/tutorial_train.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "AIDS in prison, treatment costs overwhelm prison budgets",12 3 | olympics security,19 4 | police brutality,12 5 | Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.,16 6 | terror alert raised,16 7 | Job report shows unexpected vigor for US economy,5 8 | Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu,19 9 | Senators debate Iraq War policy,16 10 | Myrtle Beach,14 11 | china visit,19 12 | elections in Rwanda,19 13 | Sudan tires of war between Arabs and Christians,19 14 | Enron scandal,12 15 | primaries - McCain,20 16 | US to sign a treaty banning land mines,16 17 | ross perot to run for president,20 18 | European law prohibits American-style buying and selling of personal data,19 19 | clinton's list of donors,20 20 | Guantanamo Bay opposition,2 21 | US demands Iraq disarm in a meeting with allies,16 22 | conservatives attack job training bill,5 23 | Underground rumors and popularity of supposed obesity miracle drug; doctors' call for caution.,3 24 | old style farm in Illinois,4 25 | Questions about whether NYC mayor Bloomberg's plan to impose strict requirements for passing the third grade would not be counter-productive.,6 26 | nyc schools chancellor ousted,6 27 | Fed Reserve Chairman and former treasury sec against dropping tax to stimulate economy,1 28 | supreme court ruling on redistricting,20 29 | Governor of California to sign welfare law to move aid recipients into jobs,13 30 | AT&T may split up,15 31 | immgration crackdown at walmart,9 32 | Urban development programs to replace high-rise public housing,14 33 | jungle trees hides ancient archetectual buildings,19 34 | Difficulties faced by 
prospective parents whose unborn child is diagnosed with serious conditions.,12 35 | NY school cirriculum,6 36 | witness against Al-Qaeda,19 37 | evidence from 1918 flu found in genetics,3 38 | ground zero reconstuction,21 39 | Growing field of geriatric care managers,3 40 | israel may release arafat,19 41 | billboards and smoking,3 42 | push for a nationwide product liability bill,15 43 | White House selection for new Treasury Secretary,20 44 | Internal pressure for reforms in China; use of centenary of Deng Xiaoping for their promotion.,19 45 | Afghan girls and education,19 46 | Growth in the number of uninsured in US,3 47 | Israel captures hebron,19 48 | plan to revamp Lower Manhatten transit network,10 49 | AIDS medication; AZT,3 50 | supreme court ruling on vouchers for private schools,6 51 | Canadian sues US for detaining and beating him for 10 months,2 52 | New spy gear used in Iraq,16 53 | US-French split over iraq,16 54 | get out the vote campaigns,20 55 | Two Congressmen and their recovery from alcoholism,3 56 | retail boom,1 57 | medicare drug plan backed by aarp,3 58 | Blue collar workers whose jobs have gone overseas find themselves abandoned by labor unions,5 59 | State Department annual report finds an increase in terrorist incidents.,16 60 | Japan struggles to adopt Western-style capitalism,19 61 | US and russia sign a pact on nuclear arms cuts,16 62 | New Orleans reconstruction,15 63 | legal battle begins over expansion of government's powers in fighting terrorism,19 64 | independent council investigating the clintons,20 65 | Thanksgiving in immigrant households.,9 66 | genetic engineering regulations,4 67 | E-bay auctions,15 68 | Yeltsin nominates Primakov as Prime Minister,19 69 | ken starr testimony,20 70 | Fire disrupts Brooklyn subway lines,10 71 | war in Iraq:US abuse of prioners in Afghanistan,19 72 | Clinton impeachment trial,20 73 | Afghanistan reconstruction; mine- and explosives-clearing in preparation for airport re-opening.,16 74 | 
bush legislative plan,20 75 | Britain's royal navy actively recruits homosexuals,19 76 | "Series on life of people with A.L.S., or Lou Gherig's disease.",3 77 | poll on americans' opinions on microsoft,15 78 | "Saudi leaders voice public support for US, but are sensitive to launching military operations",16 79 | Man on death row appeals for clemency,12 80 | NY senate debate,20 81 | more men are experiencing sexual harrassment from other males,2 82 | Chinese Sneak Into Taiwan in Hopes of Prosperity,19 83 | anniversary of waco fire,16 84 | "Clinton Retirement, Calls on Republicans",20 85 | "Damage done to Columbia University's Research, Result of Power Outage in New York City",8 86 | the number of dead and missing firefighters,16 87 | Israel studying formerly unthinkable proposals,19 88 | Profile of current population of drug felons in NYC prisons.,12 89 | National Governors' Association decide not to try to change new Federal welfare law; pressure from Congressional Republicans; creating a compromise over benefits for legal immigrants,13 90 | Elian Gonzalez,9 91 | stroke therapy,3 92 | Congressman sentenced in bribery case,20 93 | GOP primary,20 94 | rapid expansion of technology requires area codes in phone calls,17 95 | "US warns China to abstain from military action against Taiwan, Urge for peaceful resolve",19 96 | Kerry fires campaign chief,20 97 | cancer drugs in mice,3 98 | Saudi Arabia faces end of oil boom and beginning of recession,19 99 | sniper attacks,12 100 | Germany accuses U.S. 
of illegally kidnapping innocent terror suspect; CIA,16 101 | Home prices rise leading to bidding wars,14 102 | Terror's influence on the art world,19 103 | countries cracking down on cartels,18 104 | New discoveries about the nature of brain injury,3 105 | "despite depictions from credit card companies, most Americans bamkrupt due to hard luck",15 106 | US college students under quarantine from SARS,3 107 | murder charge,12 108 | Kofi Annan visits Iraq,19 109 | Tenet healthcare settles fraud accusations,3 110 | Coretta Scott King's funeral service,2 111 | Congressional Republicans and White House officials near budget agreement,1 112 | "Hurricane Katrina, New Orleans police force falling apart",12 113 | car bomb in pakistan outside American Consulate,16 114 | President Bush wants to send more troops to Iraq,16 115 | Democrat Senators-to-Governors; Corzine announcement of run for post of NJ Governor.,20 116 | death penalty debate,12 117 | clinton orders forces to the persian gulf,16 118 | NY times neediest cases fund,17 119 | Narrow approval of Central American free-trade deal.,18 120 | legacy of the los angeles riots,2 121 | USS cole returns to duty,16 122 | Chinese Government Publicizes Falun Gong Self-Immolation,19 123 | Ariel Sharon to found a new party,19 124 | U.S. troops capture high number of prisoners in Iraq not protected under Geneva convention,16 125 | water management in the west,21 126 | anti-american sentiment in Falluja,16 127 | black businessmen,15 128 | New York Stock Exchange C.E.O. 
accused of stealing money,15 129 | freed Hamas prisoners,19 130 | AOL accidentally releases the identity of one of its users to public,2 131 | Former President Bush promotes business interests of equity firm,18 132 | killings in kosovo,16 133 | Women giving birth in Africa,19 134 | Overview of the 1998 campaigns,20 135 | Dedication of WWII Memorial in Washington D.C..,21 136 | NASA experiment fails,17 137 | Federal Panel overrules science fraud charges against Thereza Imanishi- Kari,3 138 | mutual funds trying to assure their investors,12 139 | bickering over Iraq in European countries,16 140 | Shooting of Amadou Diallo results in citizens not trusting police,12 141 | Refusal of Sonia Ghandi to serve as prime minister.,19 142 | Giuliani upset by lesser charges in criminal death of a police officer,12 143 | election problems in duval county florida,20 144 | couriers serve immigrants,9 145 | China has Trouble Becoming Producer of Crocodile Goods,19 146 | Netanyahu becomes first Prime Minister to cross into Gaza to meet with Palestinians,19 147 | "Clinton and Dole fundraise in same areas, benefit from added attention",20 148 | "Presidential election; rise to prominence of Barak Obama, culminating in his selection as keynote speaker at the Democratic convention.",20 149 | NYC councilman killed at city hall,12 150 | Lieberman's presidential candidacy,20 151 | Child welfare in New Jersey,12 152 | Clark enters democratic race for president,20 153 | Liberians ask for US help,16 154 | US generals meet in a palace to discuss rebuilding,16 155 | father who killed at kids hockey game sentenced,12 156 | police killing youth in venezuela,19 157 | Bush warns Hussein to allow UN inspectors to search for wmds,16 158 | Catholic priest abuse scandal,12 159 | Scandal involving Tom DeLay's family being paid by his campaign committees,20 160 | death of 2 afghan inmates,19 161 | Bush calls for allies to help with transferring Iraq sovereignty,16 162 | Legal status of terrorist suspects; 
federal judge's halting of military trial of Osama Bin Laden's driver.,2 163 | soaring birthrates in NYC,3 164 | Berlin: Where it is today compared with the past,19 165 | Orchard owners negotiating with home developers rather than collecting apples,4 166 | Advice to investors and reflection on financial lessons,15 167 | corporation and the community its in,15 168 | Pataki calls for financing to promote environment,7 169 | "Zaire: looking into the new era, economics",19 170 | Philip Morris trying to win favor from New York Legislature,3 171 | people travelling for July 4 holiday,20 172 | New welfare programs allow welfare recipients to keep more benefits when they get jobs,13 173 | guerrillas in the phillipenes,19 174 | hiring of Foreign Service officers,19 175 | Growth in demand for services of individual taking down public pay-phone locations and numbers.,17 176 | Clinton impeachment trial,20 177 | states changing stringent anticrime measures,12 178 | Court awards billions of dollars to New York City schools,6 179 | Chinese trade bill,18 180 | Cleveland: No Cell Phones While Driving,10 181 | NYC Mayor Bloomberg announces large budget proposal and estimates there will be a large surplus.,24 182 | Bin Laden linked to kenya embassy bombing,16 183 | new accounting standards for local and state governments,24 184 | immigration laws,9 185 | information on bin laden whereabouts found,19 186 | U.S. presence in Iraq; U.S. 
propaganda efforts,16 187 | postwar chaos in Mosul leads to anger at US,16 188 | mediator fails to find settlement between California and power suppliers,8 189 | Defense Dept considers changes to military tribunals,2 190 | new building to be built in times square,14 191 | Federal Reserve unexpectedly cuts interest rates,1 192 | group of 14 senators averts showdown over judges,20 193 | democratic convention,20 194 | teenager burns a boat belonging to bush,12 195 | Giuliani defends police officers in confrontation with marchers at Million Youth march,12 196 | strikes in Yugoslavia,19 197 | Discovery of ancient ruins in eastern Utah.,17 198 | Bush administration expanding NSA and bypassing Congress,20 199 | nytimes neediest cases fund ad,17 200 | bosnian election,19 201 | louisiana caucuses,20 202 | congress and white house agree on budget,1 203 | economy trouble,1 204 | new elections in Peru,19 205 | US visa policy called unfair and arbitrary,9 206 | prosecutors want a genetic test of lewinsky's dress,20 207 | Federal appeals rules to continue with recall as scheduled,24 208 | congress to act on firestone tire problems,15 209 | rape of a jogger in central park,12 210 | war in chechnya,19 211 | "Carnival docks cruise ships in Brooklyn, revitalizing the port",14 212 | Kosovo: attack by Serbs ends cease-fire,19 213 | Hyde accedes to Democratic demands on some inquiry issues,20 214 | efforts to move people from welfare to work,13 215 | Iraq reconstruction; major creditors of Iraq agreement for large debt write-off.,16 216 | Gov. 
Pataki negotiates agreement with Consolidated Edison favoring businesses,8 217 | Supreme Court declines to hear Terri Schiavo case,2 218 | Cambodia: leader resists punishing Khmer Rouge,19 219 | US role in Afghan rebuilding,16 220 | Bush's campaign strategy,20 221 | bush proposes medicare overhaul,3 222 | clinton talks about sending peacekeepers in kosovo,16 223 | Negative impact of global economic crisis on Russia,19 224 | legal reform in Morocco,19 225 | 9/11 memorial service,21 226 | white house inquiry-hilary clinton,20 227 | Changing role of juries in the legal system,12 228 | Difficulties in procuring non-oil sources of energy.,8 229 | Senator blocks promotions to get planes for Idaho national guard,16 230 | UN treaty bans nuclear testing,16 231 | terrorism,19 232 | schools looking for principals,6 233 | investigation into anthrax mailings,16 234 | soldiers in Iraq and Afghanistan to have extended tours of duty,16 235 | Saddam Hussein captivity and trial; Iraqi officials expectation that U.S. 
will soon transfer him to their custody.,16 236 | Boeing Company Stock Plummets,15 237 | Stock Market Bubble burst for technology stocks,1 238 | new york legislature,24 239 | campaign finance reform,20 240 | NY times neediest cases fund,13 241 | Clinton impeachment trial,20 242 | vietnamese immigration to the US,9 243 | serbs attacks kosovo rebels,19 244 | louima case,12 245 | Bush lowers expectations of a quick war,16 246 | Afghan tribes come before government,19 247 | Clinton's state of the union,20 248 | Russian bank scandal,12 249 | Underground gambling games in New York City,15 250 | clinton puts sanctions on iran and libya,18 251 | reorganization of board of education HQ,6 252 | investigation into catholic preist abuse scandal made public,12 253 | Vermont sport of hunting fish with firearms.,12 254 | abortion pill,2 255 | bill bradley needs some primary victories,20 256 | investigation of hillary clinton,20 257 | Growth of federally financed tutoring industry as a result of No Child Left Behind,6 258 | Cyclospora outbreak in raspberries,4 259 | Israel-Palestine; death of children in the territories; conflicting accounts.,19 260 | Working for the Clintons; Margaret A. 
Williams,20 261 | Inquiry into problems with U.N.'s Iraq oil-for-food program,19 262 | Presidential elections; Primat der Innenpolitik in both candidates' campaigns.,20 263 | cuban exiles dying off,19 264 | Study of nuns helps with understanding Alzheimer's disease,3 265 | 11 EU countries prepare to introduce the Euro,19 266 | surge in donations to WTC victims leads to items sitting unused,16 267 | NJ senate race,20 268 | Bush looks elsewhere for oil and gas resources after facing opposition in Artic,8 269 | debt in the apple industry after marketing new Red Delicious,4 270 | White House and Rupublican negotiators make push toward agreement on budget and tax cuts,1 271 | intelligence projections,16 272 | Medicaid system not adequately serving poor in the Bronx,3 273 | Federal government producing news clips for positive public relations,20 274 | Elections and violence directed toward election workers in Iraq,19 275 | catholic abuse scandal,12 276 | NJ senate race,20 277 | "Partial, de-facto transfer of powers to Iraqi authorities prior to date of formal devolution of sovereignity.",16 278 | nyc subway station renovations,10 279 | Two Dominicans extradited to the U.S. 
to face drug and murder charges,12 280 | supreme court reviews violence against women law,2 281 | Iraqis angry over Turkish role in reconstruction,19 282 | accidental chinese embassy bombing,16 283 | censorship of school drama material that religious community members call immoral,2 284 | airlines to inspect Boeing fuel pumps,10 285 | Hurricane Katrina victims in new unfamiliar areas,14 286 | cuban embargo,19 287 | Oneida Indian lawsuit against New York for unlawfully acquiring land,21 288 | Supreme court rules on death row prisoners' challenge rights,12 289 | Catholic preist abuse scandal,12 290 | Pakistani foreign bank corrpution,19 291 | Book details Secretary of State Colin Powell's warnings to Bush regarding the invasion of Iraq.,16 292 | 9/11 inquiry,16 293 | Increasing mortality of elderly WWII veterans.,16 294 | Presidential election; voters that remain uncertain after final debate.,20 295 | election campaigns,20 296 | Maine's prescription drug plan approved by Supreme Court,3 297 | Terror Suspect freed of U.S. and allowed to go to Jordan,19 298 | GOP praises clinton trip to china,19 299 | healthcare,3 300 | senate kills nuclear test ban treaty,16 301 | Senator votes to keep bill that allows for overhaling of national political campaign financing,20 302 | Bush plan to change Medicare,3 303 | witness describes Bin Laden plotting against US,16 304 | Paula Jones and her accusations of sexual harassment against Clinton,20 305 | Pentagon trying to attack supreme leader of Taliban,16 306 | Tawana Brawley Trial; raped by white men; Rev. 
Al Sharpton stands as witness,2 307 | Vision,20 308 | Scientist suspected as China spy when hired fired for security breaches,16 309 | animal rights and chinese food,7 310 | Vodafone's AT&T Wireless bid.,15 311 | South Korea on terrorism alert; 2 men attempt to assasinate North Korean defector; Increase in Cold War Tension,19 312 | "Cruise Line, Sexual Assaults Disclosed",12 313 | pictures of combat in Afghanistan,16 314 | US suing intel for antitrust,15 315 | Iraqis change their names from Saddam to avoid being killed by Shiite militia,19 316 | italian election,19 317 | right to protest in hong kong will be curtailed,19 318 | Presidential election; Republican convention; further articles.,20 319 | Israel buries those killed in bus rampage,19 320 | new wtc tower,21 321 | SEC looking for new auidting head,15 322 | The U.S. brings Palestinian and Israeli leaders together for Mideast negotiations,19 323 | Role of lobbyists in Madison Square Garden stadium deal in New York,14 324 | wounded US army soldiers,16 325 | NYC plan to export trash will take more time and money than projected,7 326 | Militas battle for southeastern Iraqi city,19 327 | NATO presses Serbs to release 16 Bosnian citizens,16 328 | 2000 campaign; gun control,12 329 | House to review nation's intelligence agencies,16 330 | crackdown on abusive nursing homes,3 331 | candidates touting their experience,20 332 | Continuing corruption scandal at former Connecticut Governor's office.,24 333 | Chinese obtained U.S. 
technology and arms secrets,16 334 | WHO decides to kill the last smallpox viruses,3 335 | Three-day Jewish centennial conference in Switzerland confronts Holocaust,19 336 | American car makers plan for larger models,10 337 | China acting on fuel economy standards,19 338 | migrant smuggling route through Africa,19 339 | "downturn in telecommunications cutting jobs, affecting region",1 340 | sharon continues israeli assault in west bank,19 341 | story of two suicide bombers,19 342 | Clinton talks to TV meteorologists about global warming,7 343 | republicans urge a curb on gun sales,12 344 | last ditch efforts to collect soft money donations,20 345 | memorial day,20 346 | Bush trying to smooth relations with McCain,20 347 | New York's immigrations courts hurt by increased burden,9 348 | Assassination of Russian-backed Chechen president.,19 349 | Virginity Testing in Africa,19 350 | Traffic in body parts discovered at UCLA.,3 351 | Presidents of top universities to step down,6 352 | Improvement among America's elementary school students,6 353 | U.S. 
intelligence officials had received warning of coming attack on American Embassy,16 354 | terror insurance in Iraq,19 355 | Vietnam remembers Vietnam War,19 356 | Democratic primary; significance of Southern states.,20 357 | congressional resolution on Iraq,16 358 | Campaign has raised money for Clinton legal defense,20 359 | President Clinton; sex case,20 360 | Pentagon succeeds in shooting down an IBM with an interceptor,16 361 | investigation into bombing of UN building in Iraq,16 362 | new york vacation spots and the clintons,20 363 | welfare caseworkers,13 364 | Iraq insurgency; assault by Marines on city held by Shiite milita.,16 365 | Clinton Impeachment Trial,20 366 | gun control debate and suburban districts,12 367 | new products with the Windows operating system.,15 368 | plans for WTC site,14 369 | Crew withdraws threat to resign after tensions with Giuliani over voucher issue eases,6 370 | AT&T to cut long-distance rates,17 371 | China's Need for Metal Keeps U.S. Scrap Dealers Scrounging,18 372 | gay men in military,2 373 | presidential election results,20 374 | "Iran drops Rushdie death threat, and Britain restores full diplomatic relation with Tehran",19 375 | Gore's debate style,20 376 | expressing doubt about document.,20 377 | Higher rate of AIDS incidence in African Americans; role of prison-time in spreading AIDS in black neighborhoods.,3 378 | chief executive of Ford Motor Company to resign,15 379 | stock market falls,1 380 | clinton to require welfare recipients to work,13 381 | suburbs struggling to keep elderly population,14 382 | speech and language gene found,3 383 | bad doctors,3 384 | Clinton Impeachment Trial,20 385 | NY senator will not run again,20 386 | Clinton Urges House to Settle Its Differences Over Gingrich,20 387 | Lott apologizes for Thurmond comments but won't resign,20 388 | Comcast's Disney bid; probable effect on the media industry.,15 389 | Nuclear proliferation; Pakistan gov. 
admits some of its citizens may have sold data on nuclear weapons.,16 390 | Two men revive inquiry on Waco,16 391 | cooling of real estate merkets,14 392 | Clinton administration cover-up scandal,20 393 | Comission absolves Prime Minister Netanyahu of attack on Hamas official,19 394 | Presidential election; determination of African-Americans to avoid 2000 experience and make their ballots count.,20 395 | Clinton Impeachment Trial,20 396 | US starts direct combat with ground troops in Afghanistan,16 397 | Party change in Minnesota legislature mirrors what will happen to the nation's capitol in a few weeks,20 398 | Protest over killing of Amadou Diallo at memorial service,12 399 | Growing Iraqi Army; capture of men responsible for shooting down helicopter,16 400 | G 7 summit meeting,19 401 | pension changes,5 402 | UN weapons inspectors visit Iran,16 403 | Iraq insurgency; Falluja assault; re-capture of one-third of the city.,16 404 | Iraqi self-rule delayed,16 405 | Ford Explorer problems,15 406 | "Hurricane Katrina aftermath, Bush visits the area",15 407 | Man on a box in Abu Ghraib,19 408 | Order by Iraqi government for arrest of Ahmad Chalabi on charges of counterfeiting.,19 409 | No Child Left Behind Act requires annual testing in math and reading,6 410 | Clinton's defenders attack the credibility of Paula Jones,20 411 | Iraq Crisis: Plans for strike; diplomacy,16 412 | "Democrats gain power in Senate, but presents task of leading effectively",20 413 | Japan tells U.S. 
that their banking system is acutely short of capital,19 414 | Bush's appointment of the Deputy Director of National Intelligence,20 415 | arafat and the mideast violence,19 416 | MCI offers local residents telephone service,17 417 | clinton scandal,20 418 | Israel's ground war raises potential for casualties,19 419 | counterterrorism in Belgium,19 420 | Terri Schiavo case emboldens religious right,2 421 | Saddam Hussein trial,16 422 | Serbs continue to displace ethnic Albanians and force them to flee,19 423 | rape of a jogger in central park,12 424 | Growth of the richest class in the U.S.,1 425 | Petty Officers admits Japanese trawler was on radar before accident,16 426 | "Insurgency in Afghanistan, Navy Seal rescued",16 427 | Israeli Prime Minister Netanyahu begins aggressive campaign at home after returning from the U.S.,19 428 | Modernization in China threatens traditional tribes,19 429 | NY construction wall fell,10 430 | Iran President delivers inaugural address,19 431 | Peru election,19 432 | supreme court nominee John Roberts,20 433 | new york times wins pulitzers for 9/11 coverage,17 434 | Internet users loosing initial draw to eclectic possibilities,17 435 | clinton scandal,20 436 | teenage brothers admit to killing their father,12 437 | lebanon and israel fighting,19 438 | Iraqi premier moves to establish regional talks,19 439 | Bush administration and problems with Middle East foreign policy,19 440 | Republicans split after impeachment issue,20 441 | gay and lesbian pride parade,2 442 | NYC budget woes; cuts in police force,12 443 | peru hostage crisis,19 444 | MTA to use video surveillance in subways,10 445 | heirs of song suing inspiration for use of song title,15 446 | a body of an everest climber found,12 447 | Republicans not getting any help from economy in reelection bids,20 448 | Schundler's campaign for NJ governor lagging behind opponent,24 449 | lobbyists and medicare,20 450 | class action sexual harassment lawsuit at smith barney,15 451 | 
How Clinton will be judged in Lewinsky scandal,20 452 | Japan-China ecomonic ties,19 453 | Escape of a U.S. hostage; death of soldiers in attacks.,16 454 | meetings about iraq policy,16 455 | priest abuse scandal,12 456 | Commerce Department's regulation of satellites may harm American satellite makers in foreign markets,18 457 | Afghan government moving forward with efforts to get Osama Bin Laden to leave,16 458 | abortion vote,2 459 | Nationalist veterans demand more retirement benefits fromTaiwan's Government,19 460 | NYC Mayor Michael Bloomberg's property-tax rebate proposal,24 461 | Congressional elections,20 462 | Migrants in Mexico endure poor conditions,19 463 | Prudential Insurance Company; Arthur Ryan asks NJ comissioner to investigate after customer complaints,15 464 | high school suffers with loss of loved ones in terrorist attacks,16 465 | End of assault weapons ban had little effect despite predictions,12 466 | Iraqis killed in suicide bomb attack,19 467 | IMF bailout of south korea,19 468 | massacre of Kosovo men by Serbs,19 469 | supreme court upholds new campaign finance law,20 470 | "Chileans hail death of Augosto Pinochet, but violence mars celebration",19 471 | Encephalitis virus in New York is much more serious,3 472 | justice investigation involves Clinton; campaign financing,20 473 | transit strike in NYC,10 474 | Giuliani criticized on city charter plan,24 475 | return of exiles to Iraq,16 476 | hawaiian estate controversy,24 477 | special military tribunals to try foreigners charged with terrorism,19 478 | news anchor injured in Iraq,16 479 | more job losses,1 480 | pentagon keeping helicopters away from NATO forces,16 481 | Gephardt's views on gays,2 482 | George Allen falters in U.S. 
Senate race in Virginia,20 483 | terrorism arrest,16 484 | areas offering sanctuary.,16 485 | Iraq blames US for market explosion,16 486 | losses from terrorist attacks could force major airline carriers to bankruptcy,10 487 | election reform bill passed,20 488 | Economic policy leaders from the U.S. and Japan fail to agree on global economic cure,19 489 | US wants defections from iraq,16 490 | Serbian war criminal,16 491 | American spacecraft lands on Mars,17 492 | textbook explaining how-to terrorist activities introduced in embassy bombing trials,16 493 | Presidential election/Democratic primary; further articles.,20 494 | Executive branch failures in preventing 9/11; NSC advisor Rice given memo about Al Qaeda,16 495 | peace in chechnya,19 496 | McCain's wife's new role,20 497 | Saddam Hussein was a regional terror for 30 years,16 498 | Senate races,20 499 | arson in northern ireland,19 500 | Death of Yassir Arafat,19 501 | Scientists using genes to enhance breeding of crops and livestock,4 502 | plea tossed in Iraqi abuse case,19 503 | Iraq reconstruction; large-scale billing fraud by American security company.,16 504 | China tightens rein on freedom of speech and press,19 505 | Iraq insurgency; assault on Falluja; Marines' experience.,16 506 | Board to determine which airlines to aid,10 507 | Clinton's Brother-in-law paid for lobbying pardons,20 508 | Britain: drug testing laws,19 509 | Madeleine Albright brings fighting Kosovo together for peace talks,16 510 | Britain: ruling to allow gay soldiers,19 511 | U.S. is debating talks with Iran over nukes,16 512 | U.S. 
may try new approach with North Korea,19 513 | New drug to prevent heart failure,3 514 | princess diana divorce,19 515 | Debate on what should be built on the site of the WTC.,14 516 | Quantum experts win Nobel Prizes,17 517 | pinochet can be extradited to stand trial,19 518 | 9/11 aftermath; behavior of office workers near Ground Zero.,16 519 | Mideast violence,19 520 | Bush administration's decision to oppose lawsuits of drug and medical device manufacturers for faulty products.,3 521 | Iraqi and American forces kill insurgents,16 522 | tobacco company damages,3 523 | Presidential election; Bush's National Guard service; investigation into Bush's activities during the period.,20 524 | Video of Jose Padilla reveals life of terror suspect,16 525 | Merger between Kmart and Sears.,15 526 | senate committee divided on clinton's air force secretary nomination,20 527 | Yemen and its foreign policy,19 528 | China: Taklimakan Desert to be cultivated,19 529 | Clinton promises veto in Republican tax cut,20 530 | editor admits crack expose was flawed,17 531 | Superintendant of New York City school district get success in tough area,6 532 | market rally,1 533 | General says US is still at war in Iraq,16 534 | Shortage of nurses in African countries due to large-scale emigration to developed world.,19 535 | "Summer School wrongly ordered for 8,600 students",6 536 | Syrian president buried,19 537 | Other countries question fairness of international aid policies after Asian tsunami,19 538 | ny welfare reform,13 539 | Judge refuses postponement of McVeigh execution,12 540 | louima case,12 541 | child porn case,12 542 | livery cabs a growing problem in NYC,14 543 | Newark in worse shape now than in 1967,14 544 | Donor to Democratic Party accused of receiving foreigners' cash,20 545 | NYSE may move to new jersey,15 546 | Iraq elections; Sunni Arabs' statements that their followers could boycott the election.,19 547 | Congress tries to limit Drug Cartel Money Launderers from sending 
money to Columbia,12 548 | EPA to clean up homes poisoned by 9/11 dust and ash,7 549 | Bush vows to aid countries in war on terror,16 550 | ross perot barred from debates,20 551 | NJ senate race,20 552 | drunk driver sentence,12 553 | Question whether Viagra will improve the sex life of women,3 554 | Saddam Hussein war crimes trial,16 555 | Exxon and Mobil oil merger,15 556 | Federal court upholds law giving notice of sex offenders,12 557 | juror's education,12 558 | Nuclear proliferation; role of network organized by Pakistani; further revelations thereon.,16 559 | south africa gun law debate,19 560 | China Transitions Leadership Peacefully; Hu Jintao,19 561 | "Death toll rises in Lebanon, Lebanese Prime Minister calls for international involvement",19 562 | Eliot Spitzer and New York gubernatorial race,24 563 | "Roberts Confirmation, relationship between the courts and congress",20 564 | fish going extinct in the hudson,4 565 | Aftermath of meeting between President Clinton and nation's most powerful bankers,15 566 | 2000 campaign for vice president,20 567 | Efforts by Shiite leaders to persuade Moqtada al-Sadr to withdraw militia units and permit the deployment of Iraqi government forces.,16 568 | Trent Lott tries to fix consumer price index,1 569 | Security against a potential New Year's terrorist attack.,16 570 | bush demands israeli withdrawal,19 571 | piracy in mexico,15 572 | new land conservation effort,7 573 | Hezbollah and Israel both choose violence to resolve recent conflict,19 574 | UN allows Iraq to export oil to help civilian population,19 575 | Poor conditions in Russia impair the ability of figure skaters to practice,19 576 | War on terror; U.S. government claims that Osama Bin Laden is personally preparing an attack on U.S. 
soil.,16 577 | Dentists notice a rise in Meth use,3 578 | F.B.I investigates Democratic campaign money,20 579 | Haitian crisis; seizure of second-largest city by rebels.,19 580 | Google buys out YouTube,15 581 | some refusing to pay taxes,1 582 | air traffic controllers,10 583 | california governor and abortion,2 584 | Democrats hopeful of success in 2006 elections,20 585 | Photo: victims of 9/11 honored,16 586 | Female Condom: Important Weapon against AIDS,3 587 | white house says prewar Iraq intelligence was flawed,16 588 | Photo-Hilary Clinton visits Harrient Tubman Learing Center in Harlem,20 589 | Improving political style of NYC Mayor Michael Bloomberg.,24 590 | Turkey planning to occupy iraq in the event of war to prevent refugee entrance,19 591 | "Corruption, spying, and leaks in Silicon Valley",15 592 | Elizabeth Dole; Red Cross; Presidential Campiagn,20 593 | NJ sprawl,14 594 | funeral for a sniper victim,12 595 | arab leaders to meet,19 596 | Enron scandal; plight of workers rendered unemployed by corporation's collapse.,15 597 | fringe parties in NY politics,24 598 | speculation that russian president is sick,19 599 | Tobacco industry to gain from settlement,3 600 | antiterror in europe,19 601 | Proposed Freedom center at ground zero,21 602 | Bush administration scaling back oil drilling in Gulf of Mexico,8 603 | South Korea wants longer range missiles,19 604 | recall probable in CA,24 605 | "Oklahoma City Bombing: trial, friend sticks to story in cross-examination",16 606 | Israel's Barak decides to quit politics,19 607 | indian politician,19 608 | plane crash,10 609 | Gore attacks Bush tax cut plan,1 610 | chinese trade bill,18 611 | astronomy satellite,17 612 | chaos in Liberia,19 613 | UN troops leaving haiti,19 614 | Indian economy,19 615 | Federal budget,1 616 | concern over doctor/investor relationships,3 617 | Suspects in Madrid attacks blow themselves up after being surrounded by police.,19 618 | hand recounts in florida can continue,20 619 | 
Milosevic trial will test international law,19 620 | Health expenditures in the United States as a proportion of GDP.,3 621 | Tobacco companies selling cigarettes to traders to funnel them into black markets,12 622 | Tough sentence for former WorldCom chairman,12 623 | gore campaign,20 624 | Free Trade Zone of the Americas given the go-ahead,18 625 | Former Private Secretary reveals information about deals made between narcotics traffickers and political leaders,12 626 | Al Qaeda defector used by prosecution in terrorism cases.,16 627 | 2000 campaign-cheney chosen,20 628 | Clinton Impeachment Trial,20 629 | religious practices vs. health concerns; New York City politics,2 630 | American spy plane lands in China after crashing with Chinese fighter jet,16 631 | Auto industry; reduce SUV emissions,7 632 | Senate and White House promoting measures that increase use of ethanol,8 633 | Promotion of John Edwards as running-mate of Kerry.,20 634 | Europeans debate US plan for UN involvement in Iraq,19 635 | E.P.A:air quality standards,7 636 | Abu Ghraib scandal; Bush apology combined with continued support for Rumsfeld.,19 637 | immigrants in suburbia,9 638 | inquiry into fraud by MCI,12 639 | Domestic surveillance,2 640 | chinese dissident sent to the US,19 641 | Bush's pick for secretary of defense,20 642 | Pataki barely breaking a sweat in race for second term as Governor,24 643 | New Jersey jail raid,12 644 | Kennedy relative sentenced in a murder trial,12 645 | criminal inquiry leads to raid in Marine unit,16 646 | takeover battle for sprint,15 647 | German leader warns about iraq war,16 648 | auto industry mileage plan,10 649 | Death of Arafat; analysis.,19 650 | State of the stock markets; analysis of signs of recovery.,1 651 | Standoff at Falluja; discussion in American command whether U.S. 
should pull out of the city.,16 652 | refugees in Kosovo need food,19 653 | CIA and FBI agree to truce,16 654 | panel says US should require insurance to pay for vaccines,3 655 | Martha Stewart trial; dismissal of most serious charge.,12 656 | cocaine fight in columbia,12 657 | Abu Ghraib scandal; order by U.S. commander in Iraq to halt use of all coercive interrogation techniques.,19 658 | jack kemp,20 659 | hummers,10 660 | Stalemated election,20 661 | Deaf Mexican immigrants held captive in North Carolina,9 662 | US near a trade deal with china,18 663 | Enron scandal,15 664 | UN resolution on Iraq's future,16 665 | Real estate broker completes largest transaction in U.S. history and buys property along East River,15 666 | U.S. has not been tracking weapons intended for Iraqi security forces,16 667 | clinton in bosnia,19 668 | reactions to the start of the Iraq war in America,16 669 | Bush and Cheney reaching out to democrats,20 670 | Hezbollah works to rebuild Lebanon to win popular support,19 671 | "peer-to-peer services being used for pornography, not just music",17 672 | in NYC.,21 673 | undecided washington sentate race,20 674 | France reveals evidence against Nazi war criminal to stand trial,19 675 | mentally ill health care,3 676 | Special report on use of wireless technology in America.,17 677 | 2000 campaign; bush after college,20 678 | first soldier killed in Afghanistan buried,16 679 | Washington State voters face affirmative action measure,2 680 | egyptians joining the palestinian cause,19 681 | IRA to help disarm ulster fighters,19 682 | Hussein rallies his troops,19 683 | bob dole challenges clinton's ethics,20 684 | Remembering Dr. 
Martin Luther King Jr,2 685 | Increasing nuclear proliferation despite diplomatic agreements; damage wrought by export of Pakistani nuclear expertise.,16 686 | Republicans questioning of Bush's ban on stem-cell research.,3 687 | new subpoenas over campaign finance violations,20 688 | Chinese trade bill,18 689 | Veteran health care,16 690 | women in India changing their roles,19 691 | new communications law,17 692 | poll on opinions of new yorkers,24 693 | Federal investigators link deaths to same suspect,12 694 | shootout in the bronx,12 695 | technological breakthrough in computing,17 696 | Revelation of prescient pre-war report about danger of post-Saddam Iraqi civil war; administration attempts to minimize its significance.,16 697 | Iraq's oil industry,19 698 | Britain: Prime Minister Campaigns,19 699 | Iraqi war casualties are up sharply,16 700 | US donations to jewish settlers,19 701 | "Bush announces government will take stronger role in airline security, will station troops",10 702 | hospital worker died of anthrax inhalation,16 703 | Shooting at the Empire State Building; the gunman,12 704 | limits to Putin's power,19 705 | graves uncovered in Sri Lanka have not lead to charges,19 706 | "Indian computer security is bad, files at risk",21 707 | fossils of second largest dinosaur found in Egypt,19 708 | naturalized citizens in NY,9 709 | NATO: Russia agrees to alliance expansion,16 710 | Virginia offer of scholarships to black students denied access to high school during segregation wars.,2 711 | greenspan says the economy is good,1 712 | FBI agent charged with spying for russia,16 713 | California plan for large-scale cutbacks in greenhouse-gas emissions.,7 714 | Microsoft antitrust case,15 715 | House approved bill to turn airport security to government,10 716 | irish peace referendum,19 717 | Trinity College and the revival of Hartford,6 718 | Clinton heart surgery.,20 719 | online prescription drug sales,3 720 | Sotheby's chairman convicted of 
price-fixing,12 721 | Key role of 9/11 widows in formation and activities of the 9/11 Comission.,16 722 | Rice reviews progress in Iraq; rejects exit strategy,16 723 | Al Qaeda in Karachi,16 724 | China to Protect Private Property Rights; Boon to Entreprenuerial Class,19 725 | welfare reform in Italy,19 726 | More on the Clinton inauguration,20 727 | South Carolina campaign,20 728 | Employment trends in the U.S.,1 729 | "stock slump hurts 401Ks, makes many rethink retirement plans",15 730 | European opinion,19 731 | Clinton denies Paula Jones' accusations,20 732 | exit polling from the presidential election,20 733 | U.S. shift to support cease fire in Lebanon started frantic round of negotiations in U.N.,19 734 | senate approves online contracts,15 735 | tax plans in campaign,1 736 | Bush calls for end to loans to buy stock,15 737 | Panama takes control of the Panama canal,19 738 | halliburton overcharging for fuel,16 739 | Foul Air and Water Part of Cost of Boom in China's Exports,19 740 | housing conditions for the poor,14 741 | drug review process,3 742 | Gates to create foundation to bring internet into public libraries,6 743 | Cubans still struggling to make ends meet,19 744 | Supreme Court nomination; partisan dispute,20 745 | Bali bombing,19 746 | Doctors' pay regains ground despite the effects of HMOs,3 747 | suicide scandal in Germany,19 748 | Cruise lines pay little income tax because of loophole in tax law,10 749 | More Palestinian and Israeli struggle,19 750 | Japanese elections,19 751 | burning of chemical weapons,7 752 | "New York law holding car owners liable for car accidents, whatever the driver, limiting car-leasing in the State.",10 753 | "weather data predicts years of frequent, stronger hurricanes",17 754 | tobacco settlement money held up by new york politics,3 755 | FTC ruling on doctors to let them band together,3 756 | israel and lebanon relations,19 757 | NYC Chancellor cuts school budget by cutting program spending,6 758 | I.B.M. 
guilty of selling advanced computers to Russian nuclear weapons laboratory,12 759 | Hugo Chavez opposition,19 760 | heightened terror alert,16 761 | Meningitis Epidemic in West Africa,19 762 | Iraqi constitution to be voted on,19 763 | uprising in Ivory Coast,19 764 | Research showing that aspirin use can help prevent breast cancer.,3 765 | more questions arising over Clinton pardons,20 766 | Prosecutors stop Haitian murder suspect's efforts to leave country,12 767 | fraud claim in Iraqi election,19 768 | Businesses use people's YouTube and MySpace videos as free advertising,17 769 | Cleanliness takes a back seat to financial survival in airline industry,10 770 | scores of top students rise while those of average students decline on reading tests,6 771 | mideast violence,19 772 | applying for college,6 773 | Israel requests shipment of U.S. artillery rockets,16 774 | "Bush threatens to veto Senate's patients' bill of rights, Democrats say he'll have to accept it",3 775 | Difficulties for NYC mayor Michael Bloomberg in playing host to the Republican convention.,20 776 | states propose reducing medicaid,3 777 | unauthorized wiretaps,2 778 | house races,20 779 | debate on war strategy disappears after advance on Baghdad,16 780 | British bracing for Bush protesters,19 781 | Bush rejects a quick pullout from Iraq,16 782 | interrogation of terror suspects,16 783 | NYC to pay overtime to police and fire chiefs despite concerns,12 784 | "France: parliament elections, little mention of economic plan",19 785 | campigner for US senate avoids SEC investigation,20 786 | Declining energy prices has caused the U.S. to give up all gains made in conserving energy,8 787 | first Latino becomes Los Angeles mayor in more than a century,24 788 | "New York City Budget, tax cuts",24 789 | Young blacks link tobacco use to marijuana,3 790 | U.S. 
to restore relations with Libya,19 791 | Bolivian Leader in Exile After Efforts to Eradicate Coca,19 792 | Chinese backlash against closing of news journal; censorship,19 793 | use of carbon fuel declining,7 794 | Sky scrapers advertising value,15 795 | Europeans begin to fear growing Muslim minorities in their countries,19 796 | bombing in jerusalem,19 797 | Clinton urges bipartisanship to get budget negotiations moving again,1 798 | Bush prepares for war,16 799 | Sunni militia force Shiite bakeries in Baghdad to close,19 800 | us peace efforts in the middle east,19 801 | Pressure grows on GOP House leadership over Foley scandal,20 802 | new organ transplant strategies,3 803 | Bankers Trust Company admits to diverting money to enhance financial performance,12 804 | Lebanon's ex-premier killed in car bomb attack; Syrian influence in Lebanon,19 805 | Roberts Confirmation hearings,20 806 | 3 rich kids moved around after mother kills their father,12 807 | copyright law changes,15 808 | Bush mideast speech,19 809 | auto industry agrees to design changes to improve safety,10 810 | soldier leaving for iraq,16 811 | "witness accuses former boss for price-fixing between Sotheby's, Christy's",12 812 | Law Enforcement fears that domestic terrorist attacks are linked by white supremacists,12 813 | Controversy over acquisition of U.S. 
ports by state-owned Middle-Eastern countries.,21 814 | White house dealings with enron,20 815 | wealthy taking year off before starting college,6 816 | Debate over who will pay for repairs of beach erosion in New Jersey,7 817 | 1996 Election; public opinion polls,20 818 | Israeli commandos raid Hezbollah stronghold despite truce,19 819 | Stricter elementary school standards,6 820 | Questionable legal practices in Checnya highlighted in torture of woman accused of adultery,19 821 | Saddam Hussein; Life in Iraq,19 822 | re-designed station wagons enter the auto market,10 823 | US tells citizens in India to leave,19 824 | "Internet message boards allows company employees to vent, sometimes ugly, conversations",5 825 | europe bans british beef,19 826 | priest abuse scandal,12 827 | terror suspect,16 828 | Iran blamed for the killing of Iranian dissidents in Germany,19 829 | Supreme court rules on recruiting at universities,16 830 | India Genral Election; Congress Party Losing Power,19 831 | Generic AIDS Drug makers want to sell in South Africa,19 832 | Taiwan election,19 833 | Juveniles punished for killing 5 people in school shooting,6 834 | calling up reserves in the iraq war,16 835 | Murders of women along the Mexican border,19 836 | the political future of Indonesia,19 837 | Clintom impeachment trial,20 838 | Fish market in NYC closing,15 839 | elderly people in philadelphia,3 840 | Bush won't continue plan to rid of weapons with plutonium,16 841 | Al Sharpton's bid for the Presidency,20 842 | kennedy and castro,19 843 | Markets Surge after Investor concers about inflation eased,1 844 | NJ doctors protest high insurance costs,3 845 | Bush seeks to expand NAFTA throughout Central and South America,18 846 | Bombings in London,19 847 | communications equipment maker announces lay-offs,5 848 | The difficuluties of scheduling a war; Olympics; Islamic holidays,16 849 | New York traffic court,12 850 | "Russian President fired Prime Minister and appointed a former KGB 
officer, Vladmir Putin",19 851 | conjoined twins separated,3 852 | Livestock testing at state fairs,4 853 | NJ budget problems,24 854 | NJ troopers using hotel staffs to stop drug smugglers,12 855 | Iraqi Prime Minister denounces Israeli attacks on Lebanon,19 856 | medicare-prescription drug benefits,3 857 | north korea has access to plutonium,16 858 | Comcast bid for Disney; probable strategy of Disney leader to counter it.,15 859 | surrogate mothers have babies for gay couples,2 860 | Civilian death toll in Iraq reaches new high,16 861 | culture war in Israel,19 862 | US plans for a palestinian state,19 863 | Guilty verdict on prominent investment banker.,12 864 | mourning a school shooting in scotland,19 865 | US attacks iraq,16 866 | Tactics of American unit against Shiite militia of Moqtada al-Sadr.,16 867 | Red Cross criticizes Guantanamo Bay,19 868 | Israeli Prime Minister; Isreali cabinet,19 869 | virus on a cruise ship,3 870 | Russian president's plan to tighten executive control over the legislative branch and local governents; stated rationale in terrorist threat.,19 871 | Chief executives see 22 percent raise in salary in last decade,5 872 | political terrorists kill Cambodians at democratic rally,19 873 | John Kerry and the Cambodian Swift Boat incident,20 874 | "Health care costs, Medicare",3 875 | recount analysis,20 876 | poll find NYC split over mayoral candidates,24 877 | Democratic primary; further articles.,20 878 | Human embryo cloning in South Korea; significance.,3 879 | NYC school board dispute,6 880 | American offensive against Shiite militia.,16 881 | supreme court ends a ban on ads for casino gambling,15 882 | Abortion doctor eulogized as killer is sought,2 883 | US reliance on Saudi oil,8 884 | School uniforms in Public School 7 in New York,6 885 | bob dole candidacy,20 886 | Bush talks about AIDS in South Africa,19 887 | Army veteran accuses her top-ranked Army boss with sexual assault,16 888 | nytimes neediest cases charity,9 889 | Abu 
Ghraib prisoner abuse scandal; interrogation unit alleges having supplied early reports to superior officers.,19 890 | big landlord,14 891 | Mad Cow disease in the US,4 892 | Reminder that standard time has resumed,15 893 | Iranian politics,19 894 | Afghani president's description of private armies as the principal threat to his country.,19 895 | mergers in europe,19 896 | Isreali curfew in nablus,19 897 | Immigration debate in Congress,9 898 | German election results,19 899 | man had hand cut off by Taliban on charge of theft,19 900 | israel and lebanon occupation,19 901 | Merging of American Companies,15 902 | Profile of American contractor beheaded in Iraq.,16 903 | Theory of gene flaw proposed to explain evolution of human beings.,17 904 | Life of John Roberts,20 905 | Inefficient evidence to look into Interior Secretary in his role in denial of Indian casino application,21 906 | peace in Angola,19 907 | bad week for Italian PM,19 908 | Senate rejects rival proposal to campaign finance law,20 909 | radio communication in NYC,12 910 | Re-development of WTO site.,21 911 | patents on drugs end,3 912 | nyc labor disputes,5 913 | change in laws causes deportations,9 914 | Chinese Poor Struggle with HIV/AIDS,19 915 | smallpox vaccine,16 916 | High cost of potentially effective anti-cancer drug,3 917 | airport security,10 918 | Police protest over pay,12 919 | Internet message boards facilitate dialogue on race,17 920 | negotiations on interim Afghan government stall,16 921 | captured spy chief returns to Peru,19 922 | Sheriffs department in columbine under scrutiny,6 923 | Supreme Court divided on interpretation of Clean Water Act,7 924 | Senate confirms Ashcroft,20 925 | campaign finance limits,20 926 | Chicago orders big retail stores to raise minimum wage,5 927 | Army recruiting helped by bad economy,16 928 | Federal Reserve considers cutting interest rates,1 929 | Conservative Christian politician hurt by ties to Jack Abramoff lobbying scandal,20 930 | Bush's 
Social Security reform plan,13 931 | osteoporosis gene found,3 932 | defect in heart devices,3 933 | US dispatches agents to Germany to uncover terrorist network,16 934 | bacteria in chickens,4 935 | people who live in homes where BTK murdered,12 936 | affirmative action poll,2 937 | suicide attack in Israel,19 938 | Space exploration; infrequent success therein and high costs thereof.,17 939 | politicians on parade,20 940 | whitewater scandal trial,20 941 | AT&T's withdrawal from the residential-phone business.,17 942 | ground zero cleanup nears the end,16 943 | large purchase of woods and wetlands for public use and preservation,7 944 | Danger to VA mental health services from large estimated numbers of future patients among veterans of Iraq war.,16 945 | Dow Chemical Company knowingly deceived women on breast implants,3 946 | Clinton scanal fallout,20 947 | lobbyists sway legislators on Chinese trade policy,18 948 | Standard time resumes,15 949 | second day of blackout continues as company makes accusations,8 950 | bob dole attacks clinton in the presidential debate,20 951 | PHOTO: Senator John Glenn at Brooks Air Force Base,20 952 | New York's hospitals receive Federal aid package,3 953 | East Germany's economic revival,19 954 | shady democratic party fund raising,20 955 | Bush aides defending budget in Congress,1 956 | New York State Legislature approves early retirement for most experienced teachers,6 957 | "fireworks explosions in Lima, Peru kill hundreds",19 958 | House Democrats choose Steny Hoyer as House majority leader over Nancy Pelosi's choice,20 959 | McCain endorses Bush,20 960 | fighting in the mideast,19 961 | reform party presidential candidate,20 962 | Iraq; Sadam Hussein; split in Arab nations,19 963 | inquiry into columbia breakup,17 964 | barak agrees to halt settlements in the west bank,19 965 | Comcast's Disney bid; rejection by Disney.,15 966 | welfare and the states,13 967 | NY mayor race,24 968 | Bush challenges Mideast to try 
democracy,16 969 | Swiss failed to payback Nazi payments after WWII,19 970 | North Korea fires ballistic missile over Japan,19 971 | US soldiers killed in Iraq,16 972 | Medpartners/Mullikin to Buy Caremark International; Creation of A Large Physician Management Company,15 973 | guilty verdict in a wendy's murder case,12 974 | Bosnian forces pull back troops and weapons from front line,19 975 | Iowa house race,20 976 | terrorism,19 977 | chief fundraiser for Senator may have received illegal campaign contributions,20 978 | Iraqis want more power to control themselves,16 979 | Prosecution of NYC school superintendent charged with $1m embezzlementt; difficulties in detecting embezzlement in districts with high per-student spending.,12 980 | "FDA proposes ban on importing blood from Europe, upsetting European suppliers",3 981 | indonesian cleric falls ill,19 982 | "Japan's Economy in Debt, despite Japanese ability to save",19 983 | GAO files suit over cheney energy meetings,20 984 | "Oxycontin sales grew, but at a cost",3 985 | costs on loans,15 986 | Bristol Myers Squibb to Yield Patent Rights Over Aids Drugs in Africa,19 987 | china trade deal,18 988 | "Former United States Housing Secretary pleads guilty, lied to FBI",20 989 | Middle East politics; Ariel Sharon,19 990 | Governor-elect Eliot Spitzer likely to ask for ouster of State Comptroller Alan Hevesi,24 991 | domestic violence courts in NY,12 992 | China Bans Text Messaging/E-mail in Protests Against Japan,19 993 | Lacking relief and repair effort in Tsunami ravaged Indonesia,19 994 | Bush declared winner by Florida,20 995 | Palestinians and Israels hold off peace talks despite Madeleine Albright's visit,19 996 | stock market,1 997 | china sending missiles to iran,16 998 | Bomb attacks in Iraq; U.S. 
soldiers killed.,16 999 | Tora Bora offers many hiding places for Al Qaeda fighters,16 1000 | Russia and China willing to help curb Iran's nuclear ambitions,16 1001 | --------------------------------------------------------------------------------