├── smaberta
├── __init__.py
└── smaberta.py
├── requirements.txt
├── examples
├── execute_finetuning.sh
├── test_finetuning.py
├── Tutorial.ipynb
└── lm_finetuning.py
├── Dockerfile
├── LICENSE
├── setup.py
├── README.md
├── .gitignore
└── data
├── tutorial_test.csv
└── tutorial_train.csv
/smaberta/__init__.py:
--------------------------------------------------------------------------------
1 | from smaberta.smaberta import TransformerModel
2 |
3 |
4 | __version__ = '0.0.1'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==2.6.0
2 | simpletransformers==0.22.1
3 | pandas
4 | tensorboardX
5 | torch
6 | torchvision
7 | tqdm
--------------------------------------------------------------------------------
/examples/execute_finetuning.sh:
--------------------------------------------------------------------------------
# Fine-tune roberta-base with the masked-LM objective, reading one example per
# line from the train/eval text files below.
export TRAIN_FILE=./data/lm_train
export TEST_FILE=./data/lm_test

# The file variables are quoted so that paths containing spaces or glob
# characters reach the script as a single argument.
# NOTE(review): --model_type is normally the architecture family (e.g.
# "roberta"); confirm that "roberta-base" is accepted by lm_finetuning.py.
python3 lm_finetuning.py \
    --output_dir=output \
    --model_type=roberta-base \
    --model_name_or_path=roberta-base \
    --line_by_line \
    --do_train \
    --train_data_file="$TRAIN_FILE" \
    --do_eval \
    --mlm \
    --eval_data_file="$TEST_FILE"
14 |
15 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use an official Python runtime as a parent image
2 | FROM python:3.7-slim-stretch
3 |
4 | # Set the working directory to /dockerTutorial
5 | WORKDIR /dockerTutorial
6 |
7 | # Copy the current directory contents into the container at /dockerTutorial
8 | COPY . /dockerTutorial
9 |
10 | #Install any needed packages specified in requirements.txt
11 | RUN pip3 install -r requirements.txt
12 |
13 | # Define environment variable
14 | ENV COUNT 0
15 |
16 | # Run the training script when the container launches (NOTE: confirm train_and_classify_clinton_tweets.py exists in the build context)
17 | CMD ["python", "train_and_classify_clinton_tweets.py"]
18 |
--------------------------------------------------------------------------------
/examples/test_finetuning.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import random
4 | import torch
5 | import pickle
# Seed every random number generator used below with the same fixed value.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)

from smaberta import TransformerModel

# Build a roberta-base model with finetune=True; one training epoch, fp16 off,
# output written to "test-finetune".
model = TransformerModel('roberta', 'roberta-base', finetune=True, args={"num_train_epochs":1, 'fp16':False, "output_dir":"test-finetune", "reprocess_input":True})

#model.lm_evaluate('./data/lm_eval')
print("------------------------------------------------")

# NOTE(review): presumably (train file, eval file) -- confirm against
# TransformerModel.finetune.
model.finetune("./data/lm_train", "./data/lm_eval")

print("------------------------------------------------")
# Evaluate the language model on the eval file after fine-tuning.
model.lm_evaluate('./data/lm_eval')
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Zhanna Terechshenko and Vishakh Padmakumar
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from setuptools import setup
4 |
# Abort the install unless the interpreter's major version is 3.
if sys.version_info[0] != 3:
    raise RuntimeError('Unsupported python version "{0}"'.format(
        sys.version_info[0]))
8 |
9 | def _get_file_content(file_name):
10 | with open(file_name, 'r') as file_handler:
11 | return str(file_handler.read())
12 | def get_long_description():
13 | return _get_file_content('README.md')
14 |
15 | #on_rtd = os.environ.get('READTHEDOCS') == 'True'
16 |
17 | #if not on_rtd:
18 | # INSTALL_REQUIRES = [
19 | # 'pandas',
20 | # 'requests',
21 | # ]
22 | #else:
23 | # INSTALL_REQUIRES = [
24 | # 'requests',
25 | # ]
26 |
# Runtime dependencies; this list mirrors requirements.txt.
INSTALL_REQUIRES = [
    'transformers==2.6.0',
    'simpletransformers==0.22.1',
    'pandas',
    'torch',
    'torchvision',
    'tensorboardX',
    'tqdm',
]

setup(
    name="smaberta",
    # NOTE(review): smaberta/__init__.py declares __version__ = '0.0.1';
    # confirm which of the two version numbers is current.
    version='0.0.2',
    author="Vishakh Padmakumar, Zhanna Terechshenko",
    description="a wrapper for the huggingface transformer libraries",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",
    keywords='nlp transformers classification text-classification fine-tuning',
    url="https://github.com/SMAPPNYU/SMaBERTa.git",
    # Only the package is listed: there is no top-level smaberta.py module,
    # so a py_modules=['smaberta'] entry would point at a missing file.
    packages=['smaberta'],
    license="MIT",
    # A list, not a tuple: the packaging tools warn when classifiers is
    # given as any other sequence type.
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=INSTALL_REQUIRES,
)
56 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SMaBERTa
2 |
3 |
4 |
5 |
6 | This repository contains the code for SMaBERTa, a wrapper for the huggingface transformer libraries.
7 | It was developed by Zhanna Terechshenko and Vishakh Padmakumar through research at the Center for
8 | Social Media and Politics at NYU.
9 |
10 | ## Setup
11 |
12 | To install using pip, run
13 | ```
14 | pip install smaberta
15 | ```
16 |
17 | To install from the source, first download the repository by running
18 |
19 | ```
20 | git clone https://github.com/SMAPPNYU/SMaBERTa.git
21 | ```
22 |
23 | Then, install the dependencies for this repo and setup by running
24 | ```
25 | cd SMaBERTa
26 | pip install -r requirements.txt
27 | python setup.py install
28 | ```
29 |
30 | ## Using the package
31 |
32 | Basic use:
33 |
34 | ```
35 | from smaberta import TransformerModel
36 |
37 | epochs = 3
38 | lr = 4e-6
39 |
40 | training_sample = ['Today is a great day', 'Today is a terrible day']
41 | training_labels = [1, 0]
42 |
43 | model = TransformerModel('roberta', 'roberta-base', num_labels=25, reprocess_input_data=True,
44 | num_train_epochs=epochs, learning_rate=lr, output_dir='./saved_model/',
45 | overwrite_output_dir=True, fp16=False)
46 |
47 | model.train(training_sample, training_labels)
48 |
49 | ```
50 |
51 | For further details, see `Tutorial.ipynb` in the [examples](https://github.com/SMAPPNYU/SMaBERTa/tree/master/examples) directory.
52 |
53 | # Acknowledgements
54 |
55 | Code for this project was adapted from version 0.6 of https://github.com/ThilinaRajapakse/simpletransformers
56 |
57 | Vishakh Padmakumar and Zhanna Terechshenko contributed to the software writing, implementation, and testing.
58 |
59 | Megan Brown contributed to documentation and publication.
60 |
61 | If you use this software in your research please cite it as:
62 |
63 | ```
64 | @misc{padmakumar_terechshenko,
65 | author = {Vishakh Padmakumar and Zhanna Terechshenko},
66 | title = {SMAPPNYU/SMaBERTa},
67 | month = dec,
68 | year = 2020,
69 | doi = {10.5281/zenodo.5090728},
70 | url = {https://doi.org/10.5281/zenodo.5090728}
71 | }
72 | ```
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # package specific
2 | .ipynb_checkpoints/*
3 | runs/*
4 | data/*
5 | outputs/*
6 | __pycache__/*
7 | cache_dir/*
8 | *.pkl
9 |
10 | # Byte-compiled / optimized / DLL files
11 | __pycache__/
12 | *.py[cod]
13 | *$py.class
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | pip-wheel-metadata/
33 | share/python-wheels/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 | MANIFEST
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .nox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | *.py,cover
60 | .hypothesis/
61 | .pytest_cache/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | target/
85 |
86 | # Jupyter Notebook
87 | .ipynb_checkpoints
88 |
89 | # IPython
90 | profile_default/
91 | ipython_config.py
92 |
93 | # pyenv
94 | .python-version
95 |
96 | # pipenv
97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
100 | # install all needed dependencies.
101 | #Pipfile.lock
102 |
103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104 | __pypackages__/
105 |
106 | # Celery stuff
107 | celerybeat-schedule
108 | celerybeat.pid
109 |
110 | # SageMath parsed files
111 | *.sage.py
112 |
113 | # Environments
114 | .env
115 | .venv
116 | env/
117 | venv/
118 | ENV/
119 | env.bak/
120 | venv.bak/
121 |
122 | # Spyder project settings
123 | .spyderproject
124 | .spyproject
125 |
126 | # Rope project settings
127 | .ropeproject
128 |
129 | # mkdocs documentation
130 | /site
131 |
132 | # mypy
133 | .mypy_cache/
134 | .dmypy.json
135 | dmypy.json
136 |
137 | # Pyre type checker
138 | .pyre/
139 |
--------------------------------------------------------------------------------
/examples/Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import warnings\n",
10 | "warnings.filterwarnings('ignore')\n",
11 | "\n",
12 | "import pandas as pd\n",
13 | "import numpy as np\n",
14 | "import random\n",
15 | "import torch\n",
16 | "import pickle\n",
17 | "random.seed(1)\n",
18 | "np.random.seed(1)\n",
19 | "torch.manual_seed(1)\n",
20 | "torch.cuda.manual_seed(1)\n",
21 | "\n",
22 | "import sys\n",
23 | "sys.path.append('../smaberta')\n",
24 | "from smaberta import TransformerModel"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Loading Data\n",
32 | "\n",
33 | "Load train data stored in CSV format using Pandas. Pretty much any format is acceptable, just some form of text and accompanying labels. Modify according to your task. For the purpose of this tutorial, we are using a sample from New York Times Front Page Dataset (Boydstun, 2014)."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "train_df = pd.read_csv(\"../data/tutorial_train.csv\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Loading test data"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "test_df = pd.read_csv(\"../data/tutorial_test.csv\")"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "Just to get an idea of what this dataset looks like"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "Paired data consisting of freeform text accompanied by their supervised labels towards the particular task. Here the text is headlines of news stories and the label categorizes them into the subjects. We have a total of 25 possible labels here, each represented by a separate number."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "print(len(train_df.label.values))"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 4,
87 | "metadata": {},
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/html": [
92 | "
\n",
93 | "\n",
106 | "
\n",
107 | " \n",
108 | " \n",
109 | " | \n",
110 | " text | \n",
111 | " label | \n",
112 | "
\n",
113 | " \n",
114 | " \n",
115 | " \n",
116 | " | 0 | \n",
117 | " AIDS in prison, treatment costs overwhelm pris... | \n",
118 | " 12 | \n",
119 | "
\n",
120 | " \n",
121 | " | 1 | \n",
122 | " olympics security | \n",
123 | " 19 | \n",
124 | "
\n",
125 | " \n",
126 | " | 2 | \n",
127 | " police brutality | \n",
128 | " 12 | \n",
129 | "
\n",
130 | " \n",
131 | " | 3 | \n",
132 | " Iranian nuclear program; deal with European Un... | \n",
133 | " 16 | \n",
134 | "
\n",
135 | " \n",
136 | " | 4 | \n",
137 | " terror alert raised | \n",
138 | " 16 | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
"
143 | ],
144 | "text/plain": [
145 | " text label\n",
146 | "0 AIDS in prison, treatment costs overwhelm pris... 12\n",
147 | "1 olympics security 19\n",
148 | "2 police brutality 12\n",
149 | "3 Iranian nuclear program; deal with European Un... 16\n",
150 | "4 terror alert raised 16"
151 | ]
152 | },
153 | "execution_count": 4,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "train_df.head()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | "['AIDS in prison, treatment costs overwhelm prison budgets', 'olympics security', 'police brutality', 'Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.', 'terror alert raised', 'Job report shows unexpected vigor for US economy', \"Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu\", 'Senators debate Iraq War policy', 'Myrtle Beach', 'china visit'] [12, 19, 12, 16, 16, 5, 19, 16, 14, 19]\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "print(train_df.text[:10].tolist(), train_df.label[:10].tolist())"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "### Learning Parameters\n",
184 | "These are training arguments that you would use to train the classifier. For the purposes of the tutorial we set some sample values. Presumably in a different case you would perform a grid search or random search CV"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 6,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "Learning Rate 0.001\n",
197 | "Train Epochs 2\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "lr = 1e-3\n",
203 | "epochs = 2\n",
204 | "print(\"Learning Rate \", lr)\n",
205 | "print(\"Train Epochs \", epochs)"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "### Initialise model\n",
213 | "1. First argument is indicative to use the Roberta architecture (alternatives - Bert, XLNet... as provided by Huggingface). Used to specify the right tokenizer and classification head as well \n",
214 | "2. Second argument provides initialisation point as provided by Huggingface [here](https://huggingface.co/transformers/pretrained_models.html). Examples - roberta-base, roberta-large, gpt2-large...\n",
215 | "3. The tokenizer accepts the freeform text input and transforms it into a sequence of tokens suitable for input to the transformer. The transformer architecture processes these before passing it on to the classifier head which transforms this representation into the label space. \n",
216 | "4. Number of labels is specified below to initialise the classification head appropriately. As per the classification task you would change this.\n",
217 | "5. You can see the training args set above were used in the model initiation below.. \n",
218 | "6. Pass in training arguments as initialised, especially note the output directory where the model is to be saved and also training logs will be output. The overwrite output directory parameter is a safeguard in case you're rerunning the experiment. Similarly if you're rerunning the same experiment with different parameters, you might not want to reprocess the input every time - the first time it's done, it is cached so you might be able to just reuse the same. fp16 refers to floating point precision which you set according to the GPUs available to you, it shouldn't affect the classification result just the performance."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 7,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "model = TransformerModel('roberta', 'roberta-base', num_labels=25, reprocess_input_data=True, num_train_epochs=epochs, learning_rate=lr, \n",
228 | " output_dir='./saved_model/', overwrite_output_dir=True, fp16=False)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "### Run training"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 8,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "Starting Epoch: 0\n",
248 | "Starting Epoch: 1\n",
249 | "Training of roberta model complete. Saved to ./saved_model/.\n"
250 | ]
251 | }
252 | ],
253 | "source": [
254 | "model.train(train_df['text'], train_df['label'])"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "To see more in depth logs, set flag show_running_loss=True on the function call of train_model"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "### Inference from model\n",
269 | "\n",
270 | "At training time the model is saved to the output directory that was passed in at initialization. We can either continue retaining the same model object, or load from the directory it was previously saved at. In this example we show the loading to illustrate how you would do the same. This is helpful when you want to train and save a classifier and use the same sporadically. For example in an online setting where you have some labelled training data you would train and save a model, and then load and use it to classify tweets as your collection pipeline progresses."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 9,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "model = TransformerModel('roberta', 'roberta-base', num_labels=25, location=\"./saved_model/\")"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "### Evaluate on test set\n",
287 | "\n",
288 | "At inference time we have access to the model outputs which we can use to make predictions as shown below. Similarly you could perform any empirical analysis on the output before/after saving the same. Typically you would save the results for replication purposes. You can use the model outputs as you would on a normal PyTorch model, here we just show label predictions and accuracy. In this tutorial we only used a fraction of the available data, hence why the actual accuracy is not great. For full results that we conducted on the experiments, check out our paper."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 10,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "{'mcc': 0.0}\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "result, model_outputs, wrong_predictions = model.evaluate(test_df['text'], test_df['label'])\n",
306 | "preds = np.argmax(model_outputs, axis = 1)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 12,
312 | "metadata": {},
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/plain": [
317 | "(998, 998)"
318 | ]
319 | },
320 | "execution_count": 12,
321 | "metadata": {},
322 | "output_type": "execute_result"
323 | }
324 | ],
325 | "source": [
326 | "len(test_df), len(preds)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 14,
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "name": "stdout",
336 | "output_type": "stream",
337 | "text": [
338 | "Accuracy: 0.23947895791583165\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "correct = 0\n",
344 | "labels = test_df['label'].tolist()\n",
345 | "for i in range(len(labels)):\n",
346 | " if preds[i] == labels[i]:\n",
347 | " correct+=1\n",
348 | "\n",
349 | "accuracy = correct/len(labels)\n",
350 | "print(\"Accuracy: \", accuracy)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 15,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "pickle.dump(model_outputs, open(\"../model_outputs.pkl\", \"wb\"))"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "### Run inference \n",
367 | "\n",
368 | "This is the use case when you only have a new set of documents and no labels. For example if we just want to make predictions on a set of new text documents without loading a pandas dataframe, i.e. if you just have a list of texts, it can be predicted as shown below. Note that here you have the predictions and model outputs."
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 17,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "texts = test_df['text'].tolist()"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 18,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "preds, model_outputs = model.predict(texts)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 19,
392 | "metadata": {},
393 | "outputs": [
394 | {
395 | "name": "stdout",
396 | "output_type": "stream",
397 | "text": [
398 | "Accuracy: 0.23947895791583165\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "correct = 0\n",
404 | "for i in range(len(labels)):\n",
405 | " if preds[i] == labels[i]:\n",
406 | " correct+=1\n",
407 | "\n",
408 | "accuracy = correct/len(labels)\n",
409 | "print(\"Accuracy: \", accuracy)"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "### References"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "Boydstun, Amber E. (2014). New York Times Front Page Dataset. www.comparativeagendas.net. Accessed April 26, 2019.\n",
424 | "\n",
425 | "\n",
426 | "\n"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": []
435 | }
436 | ],
437 | "metadata": {
438 | "kernelspec": {
439 | "display_name": "Python 3",
440 | "language": "python",
441 | "name": "python3"
442 | },
443 | "language_info": {
444 | "codemirror_mode": {
445 | "name": "ipython",
446 | "version": 3
447 | },
448 | "file_extension": ".py",
449 | "mimetype": "text/x-python",
450 | "name": "python",
451 | "nbconvert_exporter": "python",
452 | "pygments_lexer": "ipython3",
453 | "version": "3.6.9"
454 | }
455 | },
456 | "nbformat": 4,
457 | "nbformat_minor": 4
458 | }
459 |
--------------------------------------------------------------------------------
/examples/lm_finetuning.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
19 | using a masked language modeling (MLM) loss.
20 | """
21 |
22 |
23 | import argparse
24 | import glob
25 | import logging
26 | import os
27 | import pickle
28 | import random
29 | import re
30 | import shutil
31 | from typing import Dict, List, Tuple
32 |
33 | import numpy as np
34 | import torch
35 | from torch.nn.utils.rnn import pad_sequence
36 | from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
37 | from torch.utils.data.distributed import DistributedSampler
38 | from tqdm import tqdm, trange
39 |
40 | from transformers import (
41 | MODEL_WITH_LM_HEAD_MAPPING,
42 | WEIGHTS_NAME,
43 | AdamW,
44 | AutoConfig,
45 | AutoModelWithLMHead,
46 | AutoTokenizer,
47 | PreTrainedModel,
48 | PreTrainedTokenizer,
49 | get_linear_schedule_with_warmup,
50 | )
51 |
52 |
53 | try:
54 | from torch.utils.tensorboard import SummaryWriter
55 | except ImportError:
56 | from tensorboardX import SummaryWriter
57 |
58 |
59 | logger = logging.getLogger(__name__)
60 |
61 |
62 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
63 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
64 |
65 |
class TextDataset(Dataset):
    """Dataset of fixed-length token-id blocks cut from a single text file.

    The whole file is tokenized once and split into consecutive,
    non-overlapping blocks. The resulting examples are pickled next to the
    source file and reloaded on later runs unless ``args.overwrite_cache``
    is set.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        """Load cached examples for *file_path*, or build and cache them.

        Args:
            tokenizer: Tokenizer used to turn text into ids and to add the
                special tokens around each block.
            args: Namespace providing ``model_type`` and ``overwrite_cache``.
            file_path: Path of the raw text file; must exist.
            block_size: Length of an example including special tokens.
        """
        assert os.path.isfile(file_path)

        # Leave room in each block for the special tokens added below.
        special_token_count = tokenizer.max_len - tokenizer.max_len_single_sentence
        block_size = block_size - special_token_count

        directory, filename = os.path.split(file_path)
        cache_name = args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
        cached_features_file = os.path.join(directory, cache_name)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle executes arbitrary code on load; only use cache
            # files that this script wrote itself.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            return

        logger.info("Creating features from dataset file at %s", directory)

        with open(file_path, encoding="utf-8") as source:
            raw_text = source.read()

        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))

        # Cut into full blocks only. The trailing partial block is dropped
        # on purpose (no padding), so very small files may yield no examples.
        last_start = len(token_ids) - block_size + 1
        self.examples = [
            tokenizer.build_inputs_with_special_tokens(token_ids[start : start + block_size])
            for start in range(0, last_start, block_size)
        ]

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of cached blocks."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return block *item* as a 1-D ``torch.long`` tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)
105 |
106 |
class LineByLineTextDataset(Dataset):
    """Dataset that turns every non-blank line of a text file into one example."""

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        """Read *file_path* and encode each non-blank line.

        Args:
            tokenizer: Tokenizer whose ``batch_encode_plus`` produces the ids.
            args: Accepted for signature parity with ``TextDataset``; not
                read by this class.
            file_path: Path of the raw text file; must exist.
            block_size: Passed to the tokenizer as ``max_length``.
        """
        assert os.path.isfile(file_path)
        # Features are deliberately not cached here, unlike TextDataset.
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as source:
            raw_lines = source.read().splitlines()

        # Skip empty and whitespace-only lines.
        kept_lines = []
        for text in raw_lines:
            if text and not text.isspace():
                kept_lines.append(text)

        encoded = tokenizer.batch_encode_plus(kept_lines, add_special_tokens=True, max_length=block_size)
        self.examples = encoded["input_ids"]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example *i* as a 1-D ``torch.long`` tensor."""
        return torch.tensor(self.examples[i], dtype=torch.long)
125 |
126 |
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the dataset for the train or eval file named in *args*.

    Args:
        args: Namespace providing ``train_data_file``, ``eval_data_file``,
            ``line_by_line`` and ``block_size``.
        tokenizer: Tokenizer handed to the dataset class.
        evaluate: When true, read ``args.eval_data_file`` instead of
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.
    """
    if evaluate:
        file_path = args.eval_data_file
    else:
        file_path = args.train_data_file

    dataset_cls = LineByLineTextDataset if args.line_by_line else TextDataset
    return dataset_cls(tokenizer, args, file_path=file_path, block_size=args.block_size)
133 |
134 |
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
141 |
142 |
143 | def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
144 | ordering_and_checkpoint_path = []
145 |
146 | glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
147 |
148 | for path in glob_checkpoints:
149 | if use_mtime:
150 | ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
151 | else:
152 | regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
153 | if regex_match and regex_match.groups():
154 | ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
155 |
156 | checkpoints_sorted = sorted(ordering_and_checkpoint_path)
157 | checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
158 | return checkpoints_sorted
159 |
160 |
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Does nothing when ``args.save_total_limit`` is unset, zero or negative, or
    when the number of checkpoints reported by ``_sorted_checkpoints`` does not
    exceed the limit. Otherwise the surplus checkpoints at the front of that
    ordering are removed from disk with ``shutil.rmtree``.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    ordered = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(ordered) - limit
    if surplus <= 0:
        return

    # The ordering is oldest-first, so the surplus is taken from the front.
    for stale in ordered[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale))
        shutil.rmtree(stale)
177 |
178 |
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build masked-language-modeling inputs and labels from a batch of token ids.

    Each position is selected with probability ``args.mlm_probability``;
    positions flagged by ``tokenizer.get_special_tokens_mask`` and positions
    equal to the pad token id are never selected. Of the selected positions,
    about 80% are replaced by the mask token id, about 10% by a random id drawn
    from ``range(len(tokenizer))`` and the remainder keep their original id.

    ``inputs`` is modified in place and returned. ``labels`` is a copy of the
    original ids with every non-selected position set to -100.

    Raises:
        ValueError: if the tokenizer has no mask token.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    shape = labels.shape

    # Per-position selection probability, zeroed for special and padding tokens.
    select_prob = torch.full(shape, args.mlm_probability)
    special_rows = []
    for row in labels.tolist():
        special_rows.append(tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True))
    select_prob.masked_fill_(torch.tensor(special_rows, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        select_prob.masked_fill_(labels.eq(tokenizer.pad_token_id), value=0.0)

    selected = torch.bernoulli(select_prob).bool()
    labels[~selected] = -100  # loss is only computed on the selected positions

    # 80% of the selected positions receive the mask token id.
    coin_mask = torch.bernoulli(torch.full(shape, 0.8)).bool()
    use_mask = coin_mask & selected
    inputs[use_mask] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of what is left (10% overall) receives a random token id.
    coin_random = torch.bernoulli(torch.full(shape, 0.5)).bool()
    use_random = coin_random & selected & ~use_mask
    replacement_ids = torch.randint(len(tokenizer), shape, dtype=torch.long)
    inputs[use_random] = replacement_ids[use_random]

    # The final 10% of the selected positions keep their original id.
    return inputs, labels
211 |
212 |
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """Train the model.

    Builds the dataloader, an AdamW optimizer with a linear warmup/decay
    schedule, optional apex fp16 and DataParallel / DistributedDataParallel
    wrappers, then runs the training loop with gradient accumulation, periodic
    TensorBoard logging and periodic checkpointing under ``args.output_dir``.

    If ``args.model_name_or_path`` is an existing path whose name ends in
    ``-<int>``, that integer is taken as the ``global_step`` to resume from, and
    ``optimizer.pt`` / ``scheduler.pt`` found there are loaded.

    Side effects on ``args``: sets ``train_batch_size`` and, when
    ``max_steps > 0``, overwrites ``num_train_epochs``.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the
        accumulated loss divided by ``global_step``, or ``0.0`` when no
        optimizer update was performed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad the examples of a batch to a common length; use the tokenizer's
        # pad id when it has one, otherwise pad_sequence's default value.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    # Optimizer updates per pass over the data. Clamped to 1 because the raw
    # quotient is 0 when there are fewer batches than accumulation steps, and
    # it is used as a divisor below (previously a ZeroDivisionError).
    updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # Set global_step to the global_step of the last saved checkpoint from the model path;
            # int() raises ValueError when the path does not end in "-<number>".
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // updates_per_epoch
            # NOTE(review): this counts optimizer updates, while the loop below skips that
            # many *batches*; with gradient_accumulation_steps > 1 the two differ - confirm.
            steps_trained_in_current_epoch = global_step % updates_per_epoch

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            # NOTE(review): the comparison is `>` rather than `>=`, so the loop only stops
            # once global_step has gone past max_steps - confirm this is intended.
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard the average: global_step stays 0 when no optimizer update ran
    # (e.g. an empty dataloader), which used to raise ZeroDivisionError here.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
401 |
402 |
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the evaluation dataset.

    Loads the evaluation examples with ``load_and_cache_examples``, averages
    the per-batch language-modeling loss (masked LM when ``args.mlm`` is set)
    without tracking gradients, and writes the result to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Side effect on ``args``: sets ``eval_batch_size``.

    Returns:
        A dict with the single key ``"perplexity"``, whose value is a 0-dim
        tensor equal to ``exp(mean batch loss)``.

    Raises:
        ValueError: if the evaluation dataloader yields no batches.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Pad the examples of a batch to a common length; use the tokenizer's
        # pad id when it has one, otherwise pad_sequence's default value.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate; skip the wrap when the caller (e.g. train() during
    # evaluate_during_training) already passes a DataParallel model, so the
    # model is not wrapped twice.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Without this check the division below fails with a bare ZeroDivisionError.
        raise ValueError("The evaluation dataset produced no batches; cannot compute perplexity.")

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
461 |
462 |
def main():
    """Command-line entry point for language-model fine-tuning and evaluation.

    Parses the arguments, validates flag combinations, sets up the device,
    optional distributed state and logging, and loads the config, tokenizer
    and model. With ``--do_train`` it trains and saves the model and tokenizer
    to ``--output_dir``; with ``--do_eval`` it evaluates ``--output_dir`` (or
    every checkpoint below it with ``--eval_all_checkpoints``).

    Returns:
        A dict of evaluation results whose keys are the metric names suffixed
        with ``_<global_step>`` (the suffix is empty when a single checkpoint
        is evaluated). Empty when evaluation is not run.

    Raises:
        ValueError: for unsupported flag combinations, a missing checkpoint
            with ``--should_continue``, a non-empty output directory, or when
            no config / tokenizer source is given.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    parser.add_argument(
        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
    )

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    # --should_continue resumes from checkpoints stored in output_dir, so that
    # directory is necessarily non-empty; do not demand --overwrite_output_dir
    # in that case.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name"
        )

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    if args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Suffix for the result keys: the step number parsed from the
            # checkpoint path, or empty when only one checkpoint is evaluated.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
779 |
780 |
# Run the fine-tuning / evaluation entry point only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()
783 |
--------------------------------------------------------------------------------
/smaberta/smaberta.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Full credit to simpletransformers v0.6
4 |
5 | #TODO more appropriate name since we are just wrapping around RoBERTa
6 |
7 | from __future__ import absolute_import, division, print_function
8 |
9 | import os
10 | import json
11 | import logging
12 | import math
13 | from multiprocessing import cpu_count
14 | import random
15 | from typing import Dict, List, Tuple
16 |
17 |
18 | import numpy as np
19 | from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix
20 | from scipy.stats import pearsonr
21 | from simpletransformers.classification.classification_utils import (convert_examples_to_features, InputExample)
22 | from transformers import (WEIGHTS_NAME, BertConfig,
23 | BertForSequenceClassification, BertTokenizer,
24 | RobertaConfig,
25 | RobertaForSequenceClassification,
26 | RobertaTokenizer,
27 | XLMConfig, XLMForSequenceClassification,
28 | XLMTokenizer, XLNetConfig,
29 | XLNetForSequenceClassification,
30 | XLNetTokenizer,
31 | DistilBertConfig,
32 | DistilBertForSequenceClassification,
33 | DistilBertTokenizer,
34 | PreTrainedTokenizer,
35 | PreTrainedModel,
36 | AutoModelWithLMHead)
37 | from transformers import AdamW, get_linear_schedule_with_warmup
38 | import torch
39 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
40 | TensorDataset)
41 | from torch.utils.data.distributed import DistributedSampler
42 | from torch.nn.utils.rnn import pad_sequence
43 | from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
44 | from tensorboardX import SummaryWriter
45 | from tqdm import trange, tqdm
46 |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
48 |
class LineByLineTextDataset(Dataset):
    """Fine-tuning dataset in which every non-blank line of a text file is one example.

    The file is read as UTF-8, empty and whitespace-only lines are dropped, and
    the remaining lines are encoded in one call to
    ``tokenizer.batch_encode_plus`` with special tokens added and
    ``max_length=block_size``. The resulting ``input_ids`` lists are kept in
    ``self.examples``.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as handle:
            raw_lines = handle.read().splitlines()

        # Keep only lines that carry some non-whitespace text.
        kept = []
        for text in raw_lines:
            if len(text) > 0 and not text.isspace():
                kept.append(text)

        encoded = tokenizer.batch_encode_plus(kept, add_special_tokens=True, max_length=block_size)
        self.examples = encoded["input_ids"]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a tensor of dtype ``torch.long``."""
        token_ids = self.examples[i]
        return torch.tensor(token_ids, dtype=torch.long)
67 |
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args, mlm_probability=0.15) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build masked-language-modeling inputs and labels from a batch of token ids.

    Each position is selected with probability ``mlm_probability``; positions
    flagged by ``tokenizer.get_special_tokens_mask`` and positions equal to the
    pad token id are never selected. Of the selected positions, about 80% are
    replaced by the mask token id, about 10% by a random id drawn from
    ``range(len(tokenizer))`` and the remainder keep their original id.

    ``args`` is accepted for signature compatibility but is not read here.
    ``inputs`` is modified in place and returned. ``labels`` is a copy of the
    original ids with every non-selected position set to -100.

    Raises:
        ValueError: if the tokenizer has no mask token.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    dims = labels.shape

    # Selection probability per position, zeroed for special and padding tokens.
    pick_prob = torch.full(dims, mlm_probability)
    special_flags = torch.tensor(
        [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True) for ids in labels.tolist()],
        dtype=torch.bool,
    )
    pick_prob.masked_fill_(special_flags, value=0.0)
    if tokenizer._pad_token is not None:
        is_padding = labels.eq(tokenizer.pad_token_id)
        pick_prob.masked_fill_(is_padding, value=0.0)

    picked = torch.bernoulli(pick_prob).bool()
    labels[~picked] = -100  # loss is only computed on the picked positions

    # 80% of the picked positions receive the mask token id.
    draw_mask = torch.bernoulli(torch.full(dims, 0.8)).bool()
    to_mask = draw_mask & picked
    inputs[to_mask] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of what is left (10% overall) receives a random token id.
    draw_random = torch.bernoulli(torch.full(dims, 0.5)).bool()
    to_randomize = draw_random & picked & ~to_mask
    random_ids = torch.randint(len(tokenizer), dims, dtype=torch.long)
    inputs[to_randomize] = random_ids[to_randomize]

    # The final 10% of the picked positions keep their original id.
    return inputs, labels
100 |
101 |
102 | class TransformerModel:
def __init__(self, model_type, model_name, finetune=False, num_labels=2, use_cuda=True, location="", **kwargs):
    """
    Initializes a Transformer model.

    Args:
        model_type: The type of model (bert, xlnet, xlm, roberta, distilbert).
        model_name: Default Transformer model name or path to a directory containing a Transformer model file (pytorch_model.bin).
        finetune: If True, load the model with a language-modeling head (for `finetune`); otherwise load the sequence classification model.
        num_labels (optional): The number of labels or classes in the dataset. Only used when `finetune` is False.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        location (optional): Path to a saved model. When non-empty, the tokenizer and model are loaded from here instead of from `model_name`.
        **kwargs (optional): Keyword overrides merged into the default `self.args` dict.

    Raises:
        KeyError: If `model_type` is not one of the supported model types.
    """

    MODEL_CLASSES = {
        'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
        'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
        'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
        'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    }

    _config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

    # A non-empty `location` replaces `model_name` as the source of both the
    # tokenizer and the weights.
    source = model_name if location == "" else location
    self.tokenizer = tokenizer_class.from_pretrained(source)
    if finetune:
        self.model = AutoModelWithLMHead.from_pretrained(source)
    else:
        self.model = model_class.from_pretrained(source, num_labels=num_labels)

    if use_cuda:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = "cpu"

    self.results = {}

    self.args = {
        'output_dir': 'outputs/',
        'cache_dir': 'cache_dir',
        'fp16': True,
        'fp16_opt_level': 'O1',
        'max_seq_length': 128,
        'train_batch_size': 25,
        'finetune_batch_size': 4,
        'gradient_accumulation_steps': 1,
        'eval_batch_size': 50,
        'finetune_eval_batch_size': 4,
        'num_train_epochs': 1,
        'num_finetune_epochs': 1,
        'weight_decay': 0,
        'learning_rate': 4e-5,
        'finetune_learning_rate': 5e-5,
        'adam_epsilon': 1e-8,
        'warmup_ratio': 0.06,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'mlm': True,
        'logging_steps': 50,
        'finetune_logging_steps': 100,
        'save_steps': 2000,
        'finetune_save_steps': 500,
        'overwrite_output_dir': False,
        'reprocess_input_data': False,
        'process_count': cpu_count() - 2 if cpu_count() > 2 else 1,
        'device': self.device,
        'model_name_or_path': False,
    }

    self.args.update(kwargs)

    if use_cuda:
        # Must be an assignment: the previous `self.args['n_gpu'] : ...` form was
        # a bare annotation and never stored the value.
        self.args['n_gpu'] = torch.cuda.device_count()

    self.args['model_name'] = model_name
    self.args['model_type'] = model_type
185 |
def train(self, training_samples, training_labels, output_dir=None, show_running_loss=False, **kwargs):
    """
    Trains the classification model on the given samples and saves it.

    Args:
        training_samples: Iterable list or pandas series of text samples for training
        training_labels: Iterable list of the output labels corresponding to the text samples in `training_samples`
        output_dir: The directory where model files will be saved. If not given, self.args['output_dir'] will be used.
        show_running_loss (optional): Set to True to print the running loss to the console. Defaults to False.
        **kwargs (optional): Optional changes to the args dict of the model. Any changes made will persist for the model.

    Returns:
        None

    Raises:
        ValueError: If the output directory is non-empty and 'overwrite_output_dir' is not set,
            or if the number of samples and labels differ.
        Exception: If the samples or labels cannot be converted to a list.
    """

    self.args.update(kwargs)

    if not output_dir:
        output_dir = self.args['output_dir']

    if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args['overwrite_output_dir']:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir))

    # `except Exception` (not a bare except) so Ctrl-C / SystemExit are not
    # converted into a misleading "must be iterable" error.
    if not isinstance(training_samples, list):
        try:
            training_samples = list(training_samples)
        except Exception as err:
            raise Exception('Training samples must be iterable') from err

    if not isinstance(training_labels, list):
        try:
            training_labels = list(training_labels)
        except Exception as err:
            raise Exception('Training labels must be iterable') from err

    # zip() below would silently drop the unmatched tail; fail loudly instead.
    if len(training_samples) != len(training_labels):
        raise ValueError(
            "Got {} training samples but {} training labels".format(len(training_samples), len(training_labels))
        )

    self.model.to(self.device)

    train_examples = [
        InputExample(i, text, None, label)
        for i, (text, label) in enumerate(zip(training_samples, training_labels))
    ]

    train_dataset = self._load_and_cache_examples(train_examples)
    self._train(train_dataset, output_dir, show_running_loss=show_running_loss)

    os.makedirs(output_dir, exist_ok=True)

    # Unwrap a parallel wrapper (anything exposing `.module`) before saving.
    model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
    model_to_save.save_pretrained(output_dir)
    self.tokenizer.save_pretrained(output_dir)
    torch.save(self.args, os.path.join(output_dir, 'training_args.bin'))

    print(f'Training of {self.args["model_type"]} model complete. Saved to {output_dir}.')
235 |
def evaluate(self, testing_samples, testing_labels, output_dir=None, verbose=False, **kwargs):
    """
    Evaluates the model on the given samples. Saves results to output_dir.

    Args:
        testing_samples: an iterable list of texts for testing
        testing_labels: the labels corresponding to the testing samples
        output_dir: The directory where evaluation results will be saved. If not given, self.args['output_dir'] will be used.
        verbose: If verbose, results will be printed to the console on completion of evaluation.
        **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use). E.g. f1=sklearn.metrics.f1_score.
                    A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions.

    Returns:
        result: Dictionary containing evaluation results. (Matthews correlation coefficient, tp, tn, fp, fn)
        model_outputs: List of model outputs for each evaluated sample
        wrong_preds: List of InputExample objects corresponding to each incorrect prediction by the model
    """

    if not output_dir:
        output_dir = self.args['output_dir']

    self.model.to(self.device)

    # Metrics must be forwarded as keyword arguments; passing the dict
    # positionally would bind it to `_evaluate`'s `prefix` parameter and the
    # extra metrics would never be computed.
    result, model_outputs, wrong_preds = self._evaluate(testing_samples, testing_labels, output_dir, **kwargs)
    self.results.update(result)

    if verbose:
        print(self.results)

    return result, model_outputs, wrong_preds
264 |
def _evaluate(self, testing_samples, testing_labels, output_dir, prefix="", **kwargs):
    """
    Evaluates the model on the given samples and writes the metrics to
    `<output_dir>/eval_results.txt`.
    Utility function to be used by the evaluate() method. Not intended to be used directly.

    Args:
        testing_samples: Iterable of texts to evaluate on.
        testing_labels: Iterable of labels matching `testing_samples`.
        output_dir: Directory in which `eval_results.txt` is written (created if missing).
        prefix: Unused; kept for interface compatibility.
        **kwargs: Extra metrics as name=function pairs, forwarded to compute_metrics().

    Returns:
        results: Dictionary of evaluation metrics.
        model_outputs: Array of raw model outputs (logits), one row per sample.
        wrong: List of InputExample objects the model predicted incorrectly.

    Raises:
        ValueError: If no samples are given or the sample/label counts differ.
        Exception: If the samples or labels cannot be converted to a list.
    """
    # `except Exception` (not a bare except) so Ctrl-C / SystemExit propagate.
    if not isinstance(testing_samples, list):
        try:
            testing_samples = list(testing_samples)
        except Exception as err:
            raise Exception('Testing samples must be iterable') from err

    if not isinstance(testing_labels, list):
        try:
            testing_labels = list(testing_labels)
        except Exception as err:
            raise Exception('Testing labels must be iterable') from err

    # zip() below would silently drop the unmatched tail; fail loudly instead.
    if len(testing_samples) != len(testing_labels):
        raise ValueError(
            "Got {} testing samples but {} testing labels".format(len(testing_samples), len(testing_labels))
        )
    if not testing_samples:
        raise ValueError("Cannot evaluate on an empty set of testing samples")

    # NOTE: kwargs are metric callables, so they are deliberately NOT merged into
    # self.args (which gets serialized with torch.save during training).
    device = self.device
    model = self.model
    args = self.args
    eval_output_dir = output_dir

    results = {}

    eval_examples = [
        InputExample(i, text, None, label)
        for i, (text, label) in enumerate(zip(testing_samples, testing_labels))
    ]
    eval_dataset = self._load_and_cache_examples(eval_examples, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # XLM, DistilBERT and RoBERTa don't use segment_ids
    uses_segment_ids = args['model_type'] in ['bert', 'xlnet']

    logit_batches = []
    label_batches = []
    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            # The key is left out entirely for distilbert.
            if args['model_type'] != 'distilbert':
                inputs['token_type_ids'] = batch[2] if uses_segment_ids else None
            outputs = model(**inputs)
            # With labels supplied the outputs start with (loss, logits).
            logits = outputs[1]

        logit_batches.append(logits.detach().cpu().numpy())
        label_batches.append(inputs['labels'].detach().cpu().numpy())

    # Concatenate once at the end instead of re-copying with np.append per batch.
    model_outputs = np.concatenate(logit_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    preds = np.argmax(model_outputs, axis=1)
    result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results, model_outputs, wrong
340 |
341 |
def _load_and_cache_examples(self, examples, evaluate=False, no_cache=False):
    """
    Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures.
    Utility function for train() and eval() methods. Not intended to be used directly.

    Args:
        examples: List of InputExample objects to convert.
        evaluate: Selects the 'dev' cache file instead of the 'train' one.
        no_cache: If True, neither read from nor write to the feature cache.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids).
    """
    args = self.args
    tokenizer = self.tokenizer
    process_count = args['process_count']
    model_type = args['model_type']
    output_mode = 'classification'
    is_xlnet = model_type in ['xlnet']

    # makedirs (not mkdir) so a nested cache_dir such as "cache/run1" also works.
    os.makedirs(args['cache_dir'], exist_ok=True)

    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(args['cache_dir'], f"cached_{mode}_{model_type}_{args['max_seq_length']}_binary")

    # NOTE: the cache key does not depend on `examples`; a cache written for a
    # different dataset is reused unless 'reprocess_input_data' or no_cache is set.
    if os.path.exists(cached_features_file) and not args['reprocess_input_data'] and not no_cache:
        # NOTE: torch.load unpickles the file; only point cache_dir at trusted locations.
        features = torch.load(cached_features_file)
    else:
        features = convert_examples_to_features(
            examples, args['max_seq_length'], tokenizer, output_mode,
            # xlnet has a cls token at the end
            cls_token_at_end=is_xlnet,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=model_type in ['roberta'],
            # pad on the left for xlnet
            pad_on_left=is_xlnet,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
            process_count=process_count, silent=True)

        if not no_cache:
            torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    # output_mode is always 'classification' here, so labels are integer class ids.
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
391 |
392 |
def _train(self, train_dataset, output_dir, show_running_loss=True):
    """
    Trains the model on train_dataset.
    Utility function to be used by the train() method. Not intended to be used directly.

    Args:
        train_dataset: Dataset yielding (input_ids, attention_mask, segment_ids, labels) batches.
        output_dir: Base directory under which `checkpoint-<step>` folders are written.
        show_running_loss: If True, print the loss of every batch to the console.

    Returns:
        global_step: Number of optimizer steps taken.
        Average training loss per optimizer step (0.0 if no step was taken).

    Raises:
        ImportError: If 'fp16' is enabled and apex is not installed.
    """
    device = self.device
    model = self.model
    args = self.args
    tb_writer = SummaryWriter()
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    # An explicit non-zero 'warmup_steps' wins; otherwise derive it from 'warmup_ratio'.
    if args['warmup_steps'] == 0:
        args['warmup_steps'] = math.ceil(t_total * args['warmup_ratio'])

    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)

    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    # XLM, DistilBERT and RoBERTa don't use segment_ids
    uses_segment_ids = args['model_type'] in ['bert', 'xlnet']

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    try:
        for epoch in range(int(args['num_train_epochs'])):
            print("Starting Epoch: ", epoch)
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args['model_type'] != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if uses_segment_ids else None
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]
                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end='')

                if args['gradient_accumulation_steps'] > 1:
                    loss = loss / args['gradient_accumulation_steps']

                if args['fp16']:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

                tr_loss += loss.item()
                if (step + 1) % args['gradient_accumulation_steps'] == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                        # Log metrics
                        tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args['logging_steps'], global_step)
                        logging_loss = tr_loss

                    if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                        # Save model checkpoint. A separate variable is used so that
                        # `output_dir` stays the base directory; reassigning it made
                        # every later checkpoint nest inside the previous one.
                        checkpoint_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
                        os.makedirs(checkpoint_dir, exist_ok=True)
                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, 'module') else model
                        model_to_save.save_pretrained(checkpoint_dir)
    finally:
        # Close the writer even if training is interrupted.
        tb_writer.close()

    # No optimizer step happens for an empty dataset or when there are fewer
    # batches than 'gradient_accumulation_steps'; avoid dividing by zero.
    if global_step == 0:
        return global_step, 0.0
    return global_step, tr_loss / global_step
491 |
492 |
def compute_metrics(self, preds, labels, eval_examples, **kwargs):
    """
    Computes the evaluation metrics for the model predictions.

    Args:
        preds: Model predictions
        labels: Ground truth labels
        eval_examples: List of examples on which evaluation was performed
        **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use). E.g. f1=sklearn.metrics.f1_score.
                    A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions.

    Returns:
        result: Dictionary containing evaluation results. Always contains "mcc" (Matthews correlation coefficient);
            for two-label models also "tp", "tn", "fp", "fn"; plus one entry per extra metric.
        wrong: List of InputExample objects corresponding to each incorrect prediction by the model

    Raises:
        ValueError: If `preds` and `labels` differ in length.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(preds) != len(labels):
        raise ValueError(
            "preds and labels must have the same length, got {} and {}".format(len(preds), len(labels))
        )

    mcc = matthews_corrcoef(labels, preds)

    extra_metrics = {metric: func(labels, preds) for metric, func in kwargs.items()}

    mismatched = labels != preds
    wrong = [example for example, is_wrong in zip(eval_examples, mismatched) if is_wrong]

    result = {"mcc": mcc}
    if self.model.num_labels == 2:
        # Fix the label set so the matrix is always 2x2; without it, a batch in
        # which only one class occurs yields a 1x1 matrix and the unpack fails.
        tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
        result.update({"tp": tp, "tn": tn, "fp": fp, "fn": fn})

    result.update(extra_metrics)
    return result, wrong
529 |
def predict(self, to_predict):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction.

    Returns:
        preds: Array holding the predicted class index for each text.
        model_outputs: Array of the raw model outputs (logits) for each text.

    Raises:
        ValueError: If `to_predict` is empty.
    """
    to_predict = list(to_predict)
    if not to_predict:
        raise ValueError("to_predict must contain at least one text")

    device = self.device
    model = self.model
    args = self.args

    model.to(device)

    # Labels are unknown at prediction time; 0 is a placeholder.
    eval_examples = [InputExample(i, text, None, 0) for i, text in enumerate(to_predict)]

    eval_dataset = self._load_and_cache_examples(eval_examples, evaluate=True, no_cache=True)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # XLM, DistilBERT and RoBERTa don't use segment_ids
    uses_segment_ids = args['model_type'] in ['bert', 'xlnet']

    logit_batches = []
    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            # Same handling as in training/evaluation: the key is left out
            # entirely for distilbert instead of being passed as None.
            if args['model_type'] != 'distilbert':
                inputs['token_type_ids'] = batch[2] if uses_segment_ids else None
            outputs = model(**inputs)
            # With labels supplied the outputs start with (loss, logits).
            logits = outputs[1]

        logit_batches.append(logits.detach().cpu().numpy())

    # Concatenate once at the end instead of re-copying with np.append per batch.
    model_outputs = np.concatenate(logit_batches, axis=0)
    preds = np.argmax(model_outputs, axis=1)

    return preds, model_outputs
587 |
def finetune(self, train_file_path, eval_file_path):
    """
    Fine tune the probability distribution of the language model on your own text

    Args:
        train_file_path: File containing samples of your text in consecutive lines. No labels necessary
        eval_file_path: File containing samples of your text in consecutive lines, used as the validation set to perform a sanity check on fine tuning

    Returns:
        global_step: Number of training steps
        Average loss per step (0.0 if no step was taken)
        Also saves checkpoints under the 'output_dir' configured in self.args

    Raises:
        ImportError: If 'fp16' is enabled and apex is not installed.
    """
    model = self.model
    tokenizer = self.tokenizer
    args = self.args
    print(args)
    train_dataset = LineByLineTextDataset(tokenizer, file_path=train_file_path)
    tb_writer = SummaryWriter()

    def collate(examples: List[torch.Tensor]):
        # Pad every batch to its longest sequence, using the tokenizer's pad id when it has one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args["finetune_batch_size"], collate_fn=collate
    )

    num_epochs = int(args["num_finetune_epochs"])
    # One optimizer step per batch (no gradient accumulation here), so the total
    # is batches-per-epoch TIMES epochs; dividing made the linear schedule decay
    # the learning rate to zero long before training ended.
    t_total = len(train_dataloader) * num_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args["finetune_learning_rate"], eps=args["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    resume_path = args["model_name_or_path"]
    if (
        resume_path
        and os.path.isfile(os.path.join(resume_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(resume_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(resume_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(resume_path, "scheduler.pt")))

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # `args` is a dict, so the opt level must be read by key, not attribute.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args["num_finetune_epochs"])
    logger.info(" Instantaneous batch size per GPU = %d", args["finetune_batch_size"])
    logger.info(" Gradient Accumulation steps = %d", 1)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if resume_path and os.path.exists(resume_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = resume_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader))
            steps_trained_in_current_epoch = global_step % (len(train_dataloader))

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>", so this is not a checkpoint directory.
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    logging_steps = args["finetune_logging_steps"]
    save_steps = args["finetune_save_steps"]

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))
    model.to(self.device)
    model.zero_grad()
    # Iterate over the fine-tuning epoch count (the value logged above and used
    # for t_total), not the classification 'num_train_epochs'.
    train_iterator = trange(epochs_trained, num_epochs, desc="Epoch", disable=False)
    try:
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                # NOTE(review): `args` is passed as mask_tokens' third positional
                # argument - confirm this matches its signature.
                inputs, labels = mask_tokens(batch, tokenizer, args) if args["mlm"] else (batch, batch)
                inputs = inputs.to(args["device"])
                labels = labels.to(args["device"])
                model.train()
                outputs = model(inputs, masked_lm_labels=labels) if args["mlm"] else model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()

                if args["fp16"]:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # An interval of 0 disables logging instead of dividing by zero.
                if logging_steps > 0 and global_step % logging_steps == 0:
                    # Log metrics
                    results = self.lm_evaluate(eval_file_path)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_steps, global_step)
                    logging_loss = tr_loss

                # Save periodically and after the very last step. global_step is
                # already incremented here, so the last step is t_total, not t_total - 1.
                periodic_save = save_steps > 0 and global_step % save_steps == 0
                if periodic_save or global_step == t_total:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args["output_dir"], "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
    finally:
        # Close the writer even if training is interrupted.
        tb_writer.close()

    # Nothing was trained (e.g. empty training file); avoid dividing by zero.
    if global_step == 0:
        return global_step, 0.0
    return global_step, tr_loss / global_step
766 |
def lm_evaluate(self, eval_file_path, prefix="") -> Dict:
    """
    Evaluates the language model for perplexity on the set provided

    Args:
        eval_file_path: Location of file containing sample text for validation
        prefix: Sub-directory of the output directory in which the results file is written
            TODO: Better saving/removing this argument

    Returns:
        Dictionary with a single "perplexity" entry (a 0-dim tensor).
        Saves results of evaluation to `<output_dir>/<prefix>/eval_results.txt`

    Raises:
        ValueError: If the evaluation file yields no batches.
    """
    model = self.model
    tokenizer = self.tokenizer
    args = self.args
    eval_output_dir = args["output_dir"]

    eval_dataset = LineByLineTextDataset(tokenizer, file_path=eval_file_path)

    # Create the directory the results file is actually written to; creating only
    # `eval_output_dir` made open() fail for a non-empty prefix.
    results_dir = os.path.join(eval_output_dir, prefix)
    os.makedirs(results_dir, exist_ok=True)

    def collate(examples: List[torch.Tensor]):
        # Pad every batch to its longest sequence, using the tokenizer's pad id when it has one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args["finetune_eval_batch_size"], collate_fn=collate
    )

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args["finetune_eval_batch_size"])
    eval_loss = 0.0
    nb_eval_steps = 0
    model.to(self.device)
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args["mlm"] else (batch, batch)
        inputs = inputs.to(args["device"])
        labels = labels.to(args["device"])

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args["mlm"] else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        raise ValueError("Evaluation file {} produced no batches".format(eval_file_path))

    # Perplexity is exp of the mean per-batch loss.
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}
    print("Evaluation perplexity: ", result)
    output_eval_file = os.path.join(results_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
838 |
839 |
--------------------------------------------------------------------------------
/data/tutorial_test.csv:
--------------------------------------------------------------------------------
1 | text,label
2 | "AIDS in prison, treatment costs overwhelm prison budgets",12
3 | olympics security,19
4 | police brutality,12
5 | Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.,16
6 | terror alert raised,16
7 | Job report shows unexpected vigor for US economy,5
8 | Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu,19
9 | Senators debate Iraq War policy,16
10 | Myrtle Beach,14
11 | china visit,19
12 | elections in Rwanda,19
13 | Sudan tires of war between Arabs and Christians,19
14 | Enron scandal,12
15 | primaries - McCain,20
16 | US to sign a treaty banning land mines,16
17 | ross perot to run for president,20
18 | European law prohibits American-style buying and selling of personal data,19
19 | clinton's list of donors,20
20 | Guantanamo Bay opposition,2
21 | US demands Iraq disarm in a meeting with allies,16
22 | conservatives attack job training bill,5
23 | Underground rumors and popularity of supposed obesity miracle drug; doctors' call for caution.,3
24 | old style farm in Illinois,4
25 | Questions about whether NYC mayor Bloomberg's plan to impose strict requirements for passing the third grade would not be counter-productive.,6
26 | nyc schools chancellor ousted,6
27 | Fed Reserve Chairman and former treasury sec against dropping tax to stimulate economy,1
28 | supreme court ruling on redistricting,20
29 | Governor of California to sign welfare law to move aid recipients into jobs,13
30 | AT&T may split up,15
31 | immigration crackdown at walmart,9
32 | Urban development programs to replace high-rise public housing,14
33 | jungle trees hides ancient architectural buildings,19
34 | Difficulties faced by prospective parents whose unborn child is diagnosed with serious conditions.,12
35 | NY school curriculum,6
36 | witness against Al-Qaeda,19
37 | evidence from 1918 flu found in genetics,3
38 | ground zero reconstruction,21
39 | Growing field of geriatric care managers,3
40 | israel may release arafat,19
41 | billboards and smoking,3
42 | push for a nationwide product liability bill,15
43 | White House selection for new Treasury Secretary,20
44 | Internal pressure for reforms in China; use of centenary of Deng Xiaoping for their promotion.,19
45 | Afghan girls and education,19
46 | Growth in the number of uninsured in US,3
47 | Israel captures hebron,19
48 | plan to revamp Lower Manhattan transit network,10
49 | AIDS medication; AZT,3
50 | supreme court ruling on vouchers for private schools,6
51 | Canadian sues US for detaining and beating him for 10 months,2
52 | New spy gear used in Iraq,16
53 | US-French split over iraq,16
54 | get out the vote campaigns,20
55 | Two Congressmen and their recovery from alcoholism,3
56 | retail boom,1
57 | medicare drug plan backed by aarp,3
58 | Blue collar workers whose jobs have gone overseas find themselves abandoned by labor unions,5
59 | State Department annual report finds an increase in terrorist incidents.,16
60 | Japan struggles to adopt Western-style capitalism,19
61 | US and russia sign a pact on nuclear arms cuts,16
62 | New Orleans reconstruction,15
63 | legal battle begins over expansion of government's powers in fighting terrorism,19
64 | independent council investigating the clintons,20
65 | Thanksgiving in immigrant households.,9
66 | genetic engineering regulations,4
67 | E-bay auctions,15
68 | Yeltsin nominates Primakov as Prime Minister,19
69 | ken starr testimony,20
70 | Fire disrupts Brooklyn subway lines,10
71 | war in Iraq:US abuse of prisoners in Afghanistan,19
72 | Clinton impeachment trial,20
73 | Afghanistan reconstruction; mine- and explosives-clearing in preparation for airport re-opening.,16
74 | bush legislative plan,20
75 | Britain's royal navy actively recruits homosexuals,19
76 | "Series on life of people with A.L.S., or Lou Gehrig's disease.",3
77 | poll on americans' opinions on microsoft,15
78 | "Saudi leaders voice public support for US, but are sensitive to launching military operations",16
79 | Man on death row appeals for clemency,12
80 | NY senate debate,20
81 | more men are experiencing sexual harassment from other males,2
82 | Chinese Sneak Into Taiwan in Hopes of Prosperity,19
83 | anniversary of waco fire,16
84 | "Clinton Retirement, Calls on Republicans",20
85 | "Damage done to Columbia University's Research, Result of Power Outage in New York City",8
86 | the number of dead and missing firefighters,16
87 | Israel studying formerly unthinkable proposals,19
88 | Profile of current population of drug felons in NYC prisons.,12
89 | National Governors' Association decide not to try to change new Federal welfare law; pressure from Congressional Republicans; creating a compromise over benefits for legal immigrants,13
90 | Elian Gonzalez,9
91 | stroke therapy,3
92 | Congressman sentenced in bribery case,20
93 | GOP primary,20
94 | rapid expansion of technology requires area codes in phone calls,17
95 | "US warns China to abstain from military action against Taiwan, Urge for peaceful resolve",19
96 | Kerry fires campaign chief,20
97 | cancer drugs in mice,3
98 | Saudi Arabia faces end of oil boom and beginning of recession,19
99 | sniper attacks,12
100 | Germany accuses U.S. of illegally kidnapping innocent terror suspect; CIA,16
101 | Home prices rise leading to bidding wars,14
102 | Terror's influence on the art world,19
103 | countries cracking down on cartels,18
104 | New discoveries about the nature of brain injury,3
105 | "despite depictions from credit card companies, most Americans bankrupt due to hard luck",15
106 | US college students under quarantine from SARS,3
107 | murder charge,12
108 | Kofi Annan visits Iraq,19
109 | Tenet healthcare settles fraud accusations,3
110 | Coretta Scott King's funeral service,2
111 | Congressional Republicans and White House officials near budget agreement,1
112 | "Hurricane Katrina, New Orleans police force falling apart",12
113 | car bomb in pakistan outside American Consulate,16
114 | President Bush wants to send more troops to Iraq,16
115 | Democrat Senators-to-Governors; Corzine announcement of run for post of NJ Governor.,20
116 | death penalty debate,12
117 | clinton orders forces to the persian gulf,16
118 | NY times neediest cases fund,17
119 | Narrow approval of Central American free-trade deal.,18
120 | legacy of the los angeles riots,2
121 | USS cole returns to duty,16
122 | Chinese Government Publicizes Falun Gong Self-Immolation,19
123 | Ariel Sharon to found a new party,19
124 | U.S. troops capture high number of prisoners in Iraq not protected under Geneva convention,16
125 | water management in the west,21
126 | anti-american sentiment in Falluja,16
127 | black businessmen,15
128 | New York Stock Exchange C.E.O. accused of stealing money,15
129 | freed Hamas prisoners,19
130 | AOL accidentally releases the identity of one of its users to public,2
131 | Former President Bush promotes business interests of equity firm,18
132 | killings in kosovo,16
133 | Women giving birth in Africa,19
134 | Overview of the 1998 campaigns,20
135 | Dedication of WWII Memorial in Washington D.C..,21
136 | NASA experiment fails,17
137 | Federal Panel overrules science fraud charges against Thereza Imanishi- Kari,3
138 | mutual funds trying to assure their investors,12
139 | bickering over Iraq in European countries,16
140 | Shooting of Amadou Diallo results in citizens not trusting police,12
141 | Refusal of Sonia Gandhi to serve as prime minister.,19
142 | Giuliani upset by lesser charges in criminal death of a police officer,12
143 | election problems in duval county florida,20
144 | couriers serve immigrants,9
145 | China has Trouble Becoming Producer of Crocodile Goods,19
146 | Netanyahu becomes first Prime Minister to cross into Gaza to meet with Palestinians,19
147 | "Clinton and Dole fundraise in same areas, benefit from added attention",20
148 | "Presidential election; rise to prominence of Barack Obama, culminating in his selection as keynote speaker at the Democratic convention.",20
149 | NYC councilman killed at city hall,12
150 | Lieberman's presidential candidacy,20
151 | Child welfare in New Jersey,12
152 | Clark enters democratic race for president,20
153 | Liberians ask for US help,16
154 | US generals meet in a palace to discuss rebuilding,16
155 | father who killed at kids hockey game sentenced,12
156 | police killing youth in venezuela,19
157 | Bush warns Hussein to allow UN inspectors to search for wmds,16
158 | Catholic priest abuse scandal,12
159 | Scandal involving Tom DeLay's family being paid by his campaign committees,20
160 | death of 2 afghan inmates,19
161 | Bush calls for allies to help with transferring Iraq sovereignty,16
162 | Legal status of terrorist suspects; federal judge's halting of military trial of Osama Bin Laden's driver.,2
163 | soaring birthrates in NYC,3
164 | Berlin: Where it is today compared with the past,19
165 | Orchard owners negotiating with home developers rather than collecting apples,4
166 | Advice to investors and reflection on financial lessons,15
167 | corporation and the community its in,15
168 | Pataki calls for financing to promote environment,7
169 | "Zaire: looking into the new era, economics",19
170 | Philip Morris trying to win favor from New York Legislature,3
171 | people travelling for July 4 holiday,20
172 | New welfare programs allow welfare recipients to keep more benefits when they get jobs,13
173 | guerrillas in the Philippines,19
174 | hiring of Foreign Service officers,19
175 | Growth in demand for services of individual taking down public pay-phone locations and numbers.,17
176 | Clinton impeachment trial,20
177 | states changing stringent anticrime measures,12
178 | Court awards billions of dollars to New York City schools,6
179 | Chinese trade bill,18
180 | Cleveland: No Cell Phones While Driving,10
181 | NYC Mayor Bloomberg announces large budget proposal and estimates there will be a large surplus.,24
182 | Bin Laden linked to kenya embassy bombing,16
183 | new accounting standards for local and state governments,24
184 | immigration laws,9
185 | information on bin laden whereabouts found,19
186 | U.S. presence in Iraq; U.S. propaganda efforts,16
187 | postwar chaos in Mosul leads to anger at US,16
188 | mediator fails to find settlement between California and power suppliers,8
189 | Defense Dept considers changes to military tribunals,2
190 | new building to be built in times square,14
191 | Federal Reserve unexpectedly cuts interest rates,1
192 | group of 14 senators averts showdown over judges,20
193 | democratic convention,20
194 | teenager burns a boat belonging to bush,12
195 | Giuliani defends police officers in confrontation with marchers at Million Youth march,12
196 | strikes in Yugoslavia,19
197 | Discovery of ancient ruins in eastern Utah.,17
198 | Bush administration expanding NSA and bypassing Congress,20
199 | nytimes neediest cases fund ad,17
200 | bosnian election,19
201 | louisiana caucuses,20
202 | congress and white house agree on budget,1
203 | economy trouble,1
204 | new elections in Peru,19
205 | US visa policy called unfair and arbitrary,9
206 | prosecutors want a genetic test of lewinsky's dress,20
207 | Federal appeals rules to continue with recall as scheduled,24
208 | congress to act on firestone tire problems,15
209 | rape of a jogger in central park,12
210 | war in chechnya,19
211 | "Carnival docks cruise ships in Brooklyn, revitalizing the port",14
212 | Kosovo: attack by Serbs ends cease-fire,19
213 | Hyde accedes to Democratic demands on some inquiry issues,20
214 | efforts to move people from welfare to work,13
215 | Iraq reconstruction; major creditors of Iraq agreement for large debt write-off.,16
216 | Gov. Pataki negotiates agreement with Consolidated Edison favoring businesses,8
217 | Supreme Court declines to hear Terri Schiavo case,2
218 | Cambodia: leader resists punishing Khmer Rouge,19
219 | US role in Afghan rebuilding,16
220 | Bush's campaign strategy,20
221 | bush proposes medicare overhaul,3
222 | clinton talks about sending peacekeepers in kosovo,16
223 | Negative impact of global economic crisis on Russia,19
224 | legal reform in Morocco,19
225 | 9/11 memorial service,21
226 | white house inquiry-hilary clinton,20
227 | Changing role of juries in the legal system,12
228 | Difficulties in procuring non-oil sources of energy.,8
229 | Senator blocks promotions to get planes for Idaho national guard,16
230 | UN treaty bans nuclear testing,16
231 | terrorism,19
232 | schools looking for principals,6
233 | investigation into anthrax mailings,16
234 | soldiers in Iraq and Afghanistan to have extended tours of duty,16
235 | Saddam Hussein captivity and trial; Iraqi officials expectation that U.S. will soon transfer him to their custody.,16
236 | Boeing Company Stock Plummets,15
237 | Stock Market Bubble burst for technology stocks,1
238 | new york legislature,24
239 | campaign finance reform,20
240 | NY times neediest cases fund,13
241 | Clinton impeachment trial,20
242 | vietnamese immigration to the US,9
243 | serbs attacks kosovo rebels,19
244 | louima case,12
245 | Bush lowers expectations of a quick war,16
246 | Afghan tribes come before government,19
247 | Clinton's state of the union,20
248 | Russian bank scandal,12
249 | Underground gambling games in New York City,15
250 | clinton puts sanctions on iran and libya,18
251 | reorganization of board of education HQ,6
252 | investigation into catholic priest abuse scandal made public,12
253 | Vermont sport of hunting fish with firearms.,12
254 | abortion pill,2
255 | bill bradley needs some primary victories,20
256 | investigation of hillary clinton,20
257 | Growth of federally financed tutoring industry as a result of No Child Left Behind,6
258 | Cyclospora outbreak in raspberries,4
259 | Israel-Palestine; death of children in the territories; conflicting accounts.,19
260 | Working for the Clintons; Margaret A. Williams,20
261 | Inquiry into problems with U.N.'s Iraq oil-for-food program,19
262 | Presidential elections; Primat der Innenpolitik in both candidates' campaigns.,20
263 | cuban exiles dying off,19
264 | Study of nuns helps with understanding Alzheimer's disease,3
265 | 11 EU countries prepare to introduce the Euro,19
266 | surge in donations to WTC victims leads to items sitting unused,16
267 | NJ senate race,20
268 | Bush looks elsewhere for oil and gas resources after facing opposition in Arctic,8
269 | debt in the apple industry after marketing new Red Delicious,4
270 | White House and Republican negotiators make push toward agreement on budget and tax cuts,1
271 | intelligence projections,16
272 | Medicaid system not adequately serving poor in the Bronx,3
273 | Federal government producing news clips for positive public relations,20
274 | Elections and violence directed toward election workers in Iraq,19
275 | catholic abuse scandal,12
276 | NJ senate race,20
277 | "Partial, de-facto transfer of powers to Iraqi authorities prior to date of formal devolution of sovereignty.",16
278 | nyc subway station renovations,10
279 | Two Dominicans extradited to the U.S. to face drug and murder charges,12
280 | supreme court reviews violence against women law,2
281 | Iraqis angry over Turkish role in reconstruction,19
282 | accidental chinese embassy bombing,16
283 | censorship of school drama material that religious community members call immoral,2
284 | airlines to inspect Boeing fuel pumps,10
285 | Hurricane Katrina victims in new unfamiliar areas,14
286 | cuban embargo,19
287 | Oneida Indian lawsuit against New York for unlawfully acquiring land,21
288 | Supreme court rules on death row prisoners' challenge rights,12
289 | Catholic priest abuse scandal,12
290 | Pakistani foreign bank corruption,19
291 | Book details Secretary of State Colin Powell's warnings to Bush regarding the invasion of Iraq.,16
292 | 9/11 inquiry,16
293 | Increasing mortality of elderly WWII veterans.,16
294 | Presidential election; voters that remain uncertain after final debate.,20
295 | election campaigns,20
296 | Maine's prescription drug plan approved by Supreme Court,3
297 | Terror Suspect freed of U.S. and allowed to go to Jordan,19
298 | GOP praises clinton trip to china,19
299 | healthcare,3
300 | senate kills nuclear test ban treaty,16
301 | Senator votes to keep bill that allows for overhauling of national political campaign financing,20
302 | Bush plan to change Medicare,3
303 | witness describes Bin Laden plotting against US,16
304 | Paula Jones and her accusations of sexual harassment against Clinton,20
305 | Pentagon trying to attack supreme leader of Taliban,16
306 | Tawana Brawley Trial; raped by white men; Rev. Al Sharpton stands as witness,2
307 | Vision,20
308 | Scientist suspected as China spy when hired fired for security breaches,16
309 | animal rights and chinese food,7
310 | Vodafone's AT&T Wireless bid.,15
311 | South Korea on terrorism alert; 2 men attempt to assassinate North Korean defector; Increase in Cold War Tension,19
312 | "Cruise Line, Sexual Assaults Disclosed",12
313 | pictures of combat in Afghanistan,16
314 | US suing intel for antitrust,15
315 | Iraqis change their names from Saddam to avoid being killed by Shiite militia,19
316 | italian election,19
317 | right to protest in hong kong will be curtailed,19
318 | Presidential election; Republican convention; further articles.,20
319 | Israel buries those killed in bus rampage,19
320 | new wtc tower,21
321 | SEC looking for new auditing head,15
322 | The U.S. brings Palestinian and Israeli leaders together for Mideast negotiations,19
323 | Role of lobbyists in Madison Square Garden stadium deal in New York,14
324 | wounded US army soldiers,16
325 | NYC plan to export trash will take more time and money than projected,7
326 | Militias battle for southeastern Iraqi city,19
327 | NATO presses Serbs to release 16 Bosnian citizens,16
328 | 2000 campaign; gun control,12
329 | House to review nation's intelligence agencies,16
330 | crackdown on abusive nursing homes,3
331 | candidates touting their experience,20
332 | Continuing corruption scandal at former Connecticut Governor's office.,24
333 | Chinese obtained U.S. technology and arms secrets,16
334 | WHO decides to kill the last smallpox viruses,3
335 | Three-day Jewish centennial conference in Switzerland confronts Holocaust,19
336 | American car makers plan for larger models,10
337 | China acting on fuel economy standards,19
338 | migrant smuggling route through Africa,19
339 | "downturn in telecommunications cutting jobs, affecting region",1
340 | sharon continues israeli assault in west bank,19
341 | story of two suicide bombers,19
342 | Clinton talks to TV meteorologists about global warming,7
343 | republicans urge a curb on gun sales,12
344 | last ditch efforts to collect soft money donations,20
345 | memorial day,20
346 | Bush trying to smooth relations with McCain,20
347 | New York's immigrations courts hurt by increased burden,9
348 | Assassination of Russian-backed Chechen president.,19
349 | Virginity Testing in Africa,19
350 | Traffic in body parts discovered at UCLA.,3
351 | Presidents of top universities to step down,6
352 | Improvement among America's elementary school students,6
353 | U.S. intelligence officials had received warning of coming attack on American Embassy,16
354 | terror insurance in Iraq,19
355 | Vietnam remembers Vietnam War,19
356 | Democratic primary; significance of Southern states.,20
357 | congressional resolution on Iraq,16
358 | Campaign has raised money for Clinton legal defense,20
359 | President Clinton; sex case,20
360 | Pentagon succeeds in shooting down an ICBM with an interceptor,16
361 | investigation into bombing of UN building in Iraq,16
362 | new york vacation spots and the clintons,20
363 | welfare caseworkers,13
364 | Iraq insurgency; assault by Marines on city held by Shiite militia.,16
365 | Clinton Impeachment Trial,20
366 | gun control debate and suburban districts,12
367 | new products with the Windows operating system.,15
368 | plans for WTC site,14
369 | Crew withdraws threat to resign after tensions with Giuliani over voucher issue eases,6
370 | AT&T to cut long-distance rates,17
371 | China's Need for Metal Keeps U.S. Scrap Dealers Scrounging,18
372 | gay men in military,2
373 | presidential election results,20
374 | "Iran drops Rushdie death threat, and Britain restores full diplomatic relation with Tehran",19
375 | Gore's debate style,20
376 | expressing doubt about document.,20
377 | Higher rate of AIDS incidence in African Americans; role of prison-time in spreading AIDS in black neighborhoods.,3
378 | chief executive of Ford Motor Company to resign,15
379 | stock market falls,1
380 | clinton to require welfare recipients to work,13
381 | suburbs struggling to keep elderly population,14
382 | speech and language gene found,3
383 | bad doctors,3
384 | Clinton Impeachment Trial,20
385 | NY senator will not run again,20
386 | Clinton Urges House to Settle Its Differences Over Gingrich,20
387 | Lott apologizes for Thurmond comments but won't resign,20
388 | Comcast's Disney bid; probable effect on the media industry.,15
389 | Nuclear proliferation; Pakistan gov. admits some of its citizens may have sold data on nuclear weapons.,16
390 | Two men revive inquiry on Waco,16
391 | cooling of real estate markets,14
392 | Clinton administration cover-up scandal,20
393 | Commission absolves Prime Minister Netanyahu of attack on Hamas official,19
394 | Presidential election; determination of African-Americans to avoid 2000 experience and make their ballots count.,20
395 | Clinton Impeachment Trial,20
396 | US starts direct combat with ground troops in Afghanistan,16
397 | Party change in Minnesota legislature mirrors what will happen to the nation's capitol in a few weeks,20
398 | Protest over killing of Amadou Diallo at memorial service,12
399 | Growing Iraqi Army; capture of men responsible for shooting down helicopter,16
400 | G 7 summit meeting,19
401 | pension changes,5
402 | UN weapons inspectors visit Iran,16
403 | Iraq insurgency; Falluja assault; re-capture of one-third of the city.,16
404 | Iraqi self-rule delayed,16
405 | Ford Explorer problems,15
406 | "Hurricane Katrina aftermath, Bush visits the area",15
407 | Man on a box in Abu Ghraib,19
408 | Order by Iraqi government for arrest of Ahmad Chalabi on charges of counterfeiting.,19
409 | No Child Left Behind Act requires annual testing in math and reading,6
410 | Clinton's defenders attack the credibility of Paula Jones,20
411 | Iraq Crisis: Plans for strike; diplomacy,16
412 | "Democrats gain power in Senate, but presents task of leading effectively",20
413 | Japan tells U.S. that their banking system is acutely short of capital,19
414 | Bush's appointment of the Deputy Director of National Intelligence,20
415 | arafat and the mideast violence,19
416 | MCI offers local residents telephone service,17
417 | clinton scandal,20
418 | Israel's ground war raises potential for casualties,19
419 | counterterrorism in Belgium,19
420 | Terri Schiavo case emboldens religious right,2
421 | Saddam Hussein trial,16
422 | Serbs continue to displace ethnic Albanians and force them to flee,19
423 | rape of a jogger in central park,12
424 | Growth of the richest class in the U.S.,1
425 | Petty Officers admits Japanese trawler was on radar before accident,16
426 | "Insurgency in Afghanistan, Navy Seal rescued",16
427 | Israeli Prime Minister Netanyahu begins aggressive campaign at home after returning from the U.S.,19
428 | Modernization in China threatens traditional tribes,19
429 | NY construction wall fell,10
430 | Iran President delivers inaugural address,19
431 | Peru election,19
432 | supreme court nominee John Roberts,20
433 | new york times wins pulitzers for 9/11 coverage,17
434 | Internet users losing initial draw to eclectic possibilities,17
435 | clinton scandal,20
436 | teenage brothers admit to killing their father,12
437 | lebanon and israel fighting,19
438 | Iraqi premier moves to establish regional talks,19
439 | Bush administration and problems with Middle East foreign policy,19
440 | Republicans split after impeachment issue,20
441 | gay and lesbian pride parade,2
442 | NYC budget woes; cuts in police force,12
443 | peru hostage crisis,19
444 | MTA to use video surveillance in subways,10
445 | heirs of song suing inspiration for use of song title,15
446 | a body of an everest climber found,12
447 | Republicans not getting any help from economy in reelection bids,20
448 | Schundler's campaign for NJ governor lagging behind opponent,24
449 | lobbyists and medicare,20
450 | class action sexual harassment lawsuit at smith barney,15
451 | How Clinton will be judged in Lewinsky scandal,20
452 | Japan-China economic ties,19
453 | Escape of a U.S. hostage; death of soldiers in attacks.,16
454 | meetings about iraq policy,16
455 | priest abuse scandal,12
456 | Commerce Department's regulation of satellites may harm American satellite makers in foreign markets,18
457 | Afghan government moving forward with efforts to get Osama Bin Laden to leave,16
458 | abortion vote,2
459 | Nationalist veterans demand more retirement benefits from Taiwan's Government,19
460 | NYC Mayor Michael Bloomberg's property-tax rebate proposal,24
461 | Congressional elections,20
462 | Migrants in Mexico endure poor conditions,19
463 | Prudential Insurance Company; Arthur Ryan asks NJ commissioner to investigate after customer complaints,15
464 | high school suffers with loss of loved ones in terrorist attacks,16
465 | End of assault weapons ban had little effect despite predictions,12
466 | Iraqis killed in suicide bomb attack,19
467 | IMF bailout of south korea,19
468 | massacre of Kosovo men by Serbs,19
469 | supreme court upholds new campaign finance law,20
470 | "Chileans hail death of Augusto Pinochet, but violence mars celebration",19
471 | Encephalitis virus in New York is much more serious,3
472 | justice investigation involves Clinton; campaign financing,20
473 | transit strike in NYC,10
474 | Giuliani criticized on city charter plan,24
475 | return of exiles to Iraq,16
476 | hawaiian estate controversy,24
477 | special military tribunals to try foreigners charged with terrorism,19
478 | news anchor injured in Iraq,16
479 | more job losses,1
480 | pentagon keeping helicopters away from NATO forces,16
481 | Gephardt's views on gays,2
482 | George Allen falters in U.S. Senate race in Virginia,20
483 | terrorism arrest,16
484 | areas offering sanctuary.,16
485 | Iraq blames US for market explosion,16
486 | losses from terrorist attacks could force major airline carriers to bankruptcy,10
487 | election reform bill passed,20
488 | Economic policy leaders from the U.S. and Japan fail to agree on global economic cure,19
489 | US wants defections from iraq,16
490 | Serbian war criminal,16
491 | American spacecraft lands on Mars,17
492 | textbook explaining how-to terrorist activities introduced in embassy bombing trials,16
493 | Presidential election/Democratic primary; further articles.,20
494 | Executive branch failures in preventing 9/11; NSC advisor Rice given memo about Al Qaeda,16
495 | peace in chechnya,19
496 | McCain's wife's new role,20
497 | Saddam Hussein was a regional terror for 30 years,16
498 | Senate races,20
499 | arson in northern ireland,19
500 | Death of Yassir Arafat,19
501 | Scientists using genes to enhance breeding of crops and livestock,4
502 | plea tossed in Iraqi abuse case,19
503 | Iraq reconstruction; large-scale billing fraud by American security company.,16
504 | China tightens rein on freedom of speech and press,19
505 | Iraq insurgency; assault on Falluja; Marines' experience.,16
506 | Board to determine which airlines to aid,10
507 | Clinton's Brother-in-law paid for lobbying pardons,20
508 | Britain: drug testing laws,19
509 | Madeleine Albright brings fighting Kosovo together for peace talks,16
510 | Britain: ruling to allow gay soldiers,19
511 | U.S. is debating talks with Iran over nukes,16
512 | U.S. may try new approach with North Korea,19
513 | New drug to prevent heart failure,3
514 | princess diana divorce,19
515 | Debate on what should be built on the site of the WTC.,14
516 | Quantum experts win Nobel Prizes,17
517 | pinochet can be extradited to stand trial,19
518 | 9/11 aftermath; behavior of office workers near Ground Zero.,16
519 | Mideast violence,19
520 | Bush administration's decision to oppose lawsuits of drug and medical device manufacturers for faulty products.,3
521 | Iraqi and American forces kill insurgents,16
522 | tobacco company damages,3
523 | Presidential election; Bush's National Guard service; investigation into Bush's activities during the period.,20
524 | Video of Jose Padilla reveals life of terror suspect,16
525 | Merger between Kmart and Sears.,15
526 | senate committee divided on clinton's air force secretary nomination,20
527 | Yemen and its foreign policy,19
528 | China: Taklimakan Desert to be cultivated,19
529 | Clinton promises veto in Republican tax cut,20
530 | editor admits crack expose was flawed,17
531 | Superintendent of New York City school district gets success in tough area,6
532 | market rally,1
533 | General says US is still at war in Iraq,16
534 | Shortage of nurses in African countries due to large-scale emigration to developed world.,19
535 | "Summer School wrongly ordered for 8,600 students",6
536 | Syrian president buried,19
537 | Other countries question fairness of international aid policies after Asian tsunami,19
538 | ny welfare reform,13
539 | Judge refuses postponement of McVeigh execution,12
540 | louima case,12
541 | child porn case,12
542 | livery cabs a growing problem in NYC,14
543 | Newark in worse shape now than in 1967,14
544 | Donor to Democratic Party accused of receiving foreigners' cash,20
545 | NYSE may move to new jersey,15
546 | Iraq elections; Sunni Arabs' statements that their followers could boycott the election.,19
547 | Congress tries to limit Drug Cartel Money Launderers from sending money to Colombia,12
548 | EPA to clean up homes poisoned by 9/11 dust and ash,7
549 | Bush vows to aid countries in war on terror,16
550 | ross perot barred from debates,20
551 | NJ senate race,20
552 | drunk driver sentence,12
553 | Question whether Viagra will improve the sex life of women,3
554 | Saddam Hussein war crimes trial,16
555 | Exxon and Mobil oil merger,15
556 | Federal court upholds law giving notice of sex offenders,12
557 | juror's education,12
558 | Nuclear proliferation; role of network organized by Pakistani; further revelations thereon.,16
559 | south africa gun law debate,19
560 | China Transitions Leadership Peacefully; Hu Jintao,19
561 | "Death toll rises in Lebanon, Lebanese Prime Minister calls for international involvement",19
562 | Eliot Spitzer and New York gubernatorial race,24
563 | "Roberts Confirmation, relationship between the courts and congress",20
564 | fish going extinct in the hudson,4
565 | Aftermath of meeting between President Clinton and nation's most powerful bankers,15
566 | 2000 campaign for vice president,20
567 | Efforts by Shiite leaders to persuade Moqtada al-Sadr to withdraw militia units and permit the deployment of Iraqi government forces.,16
568 | Trent Lott tries to fix consumer price index,1
569 | Security against a potential New Year's terrorist attack.,16
570 | bush demands israeli withdrawal,19
571 | piracy in mexico,15
572 | new land conservation effort,7
573 | Hezbollah and Israel both choose violence to resolve recent conflict,19
574 | UN allows Iraq to export oil to help civilian population,19
575 | Poor conditions in Russia impair the ability of figure skaters to practice,19
576 | War on terror; U.S. government claims that Osama Bin Laden is personally preparing an attack on U.S. soil.,16
577 | Dentists notice a rise in Meth use,3
578 | F.B.I investigates Democratic campaign money,20
579 | Haitian crisis; seizure of second-largest city by rebels.,19
580 | Google buys out YouTube,15
581 | some refusing to pay taxes,1
582 | air traffic controllers,10
583 | california governor and abortion,2
584 | Democrats hopeful of success in 2006 elections,20
585 | Photo: victims of 9/11 honored,16
586 | Female Condom: Important Weapon against AIDS,3
587 | white house says prewar Iraq intelligence was flawed,16
588 | Photo-Hillary Clinton visits Harriet Tubman Learning Center in Harlem,20
589 | Improving political style of NYC Mayor Michael Bloomberg.,24
590 | Turkey planning to occupy iraq in the event of war to prevent refugee entrance,19
591 | "Corruption, spying, and leaks in Silicon Valley",15
592 | Elizabeth Dole; Red Cross; Presidential Campaign,20
593 | NJ sprawl,14
594 | funeral for a sniper victim,12
595 | arab leaders to meet,19
596 | Enron scandal; plight of workers rendered unemployed by corporation's collapse.,15
597 | fringe parties in NY politics,24
598 | speculation that russian president is sick,19
599 | Tobacco industry to gain from settlement,3
600 | antiterror in europe,19
601 | Proposed Freedom center at ground zero,21
602 | Bush administration scaling back oil drilling in Gulf of Mexico,8
603 | South Korea wants longer range missiles,19
604 | recall probable in CA,24
605 | "Oklahoma City Bombing: trial, friend sticks to story in cross-examination",16
606 | Israel's Barak decides to quit politics,19
607 | indian politician,19
608 | plane crash,10
609 | Gore attacks Bush tax cut plan,1
610 | chinese trade bill,18
611 | astronomy satellite,17
612 | chaos in Liberia,19
613 | UN troops leaving haiti,19
614 | Indian economy,19
615 | Federal budget,1
616 | concern over doctor/investor relationships,3
617 | Suspects in Madrid attacks blow themselves up after being surrounded by police.,19
618 | hand recounts in florida can continue,20
619 | Milosevic trial will test international law,19
620 | Health expenditures in the United States as a proportion of GDP.,3
621 | Tobacco companies selling cigarettes to traders to funnel them into black markets,12
622 | Tough sentence for former WorldCom chairman,12
623 | gore campaign,20
624 | Free Trade Zone of the Americas given the go-ahead,18
625 | Former Private Secretary reveals information about deals made between narcotics traffickers and political leaders,12
626 | Al Qaeda defector used by prosecution in terrorism cases.,16
627 | 2000 campaign-cheney chosen,20
628 | Clinton Impeachment Trial,20
629 | religious practices vs. health concerns; New York City politics,2
630 | American spy plane lands in China after crashing with Chinese fighter jet,16
631 | Auto industry; reduce SUV emissions,7
632 | Senate and White House promoting measures that increase use of ethanol,8
633 | Promotion of John Edwards as running-mate of Kerry.,20
634 | Europeans debate US plan for UN involvement in Iraq,19
635 | E.P.A:air quality standards,7
636 | Abu Ghraib scandal; Bush apology combined with continued support for Rumsfeld.,19
637 | immigrants in suburbia,9
638 | inquiry into fraud by MCI,12
639 | Domestic surveillance,2
640 | chinese dissident sent to the US,19
641 | Bush's pick for secretary of defense,20
642 | Pataki barely breaking a sweat in race for second term as Governor,24
643 | New Jersey jail raid,12
644 | Kennedy relative sentenced in a murder trial,12
645 | criminal inquiry leads to raid in Marine unit,16
646 | takeover battle for sprint,15
647 | German leader warns about iraq war,16
648 | auto industry mileage plan,10
649 | Death of Arafat; analysis.,19
650 | State of the stock markets; analysis of signs of recovery.,1
651 | Standoff at Falluja; discussion in American command whether U.S. should pull out of the city.,16
652 | refugees in Kosovo need food,19
653 | CIA and FBI agree to truce,16
654 | panel says US should require insurance to pay for vaccines,3
655 | Martha Stewart trial; dismissal of most serious charge.,12
656 | cocaine fight in columbia,12
657 | Abu Ghraib scandal; order by U.S. commander in Iraq to halt use of all coercive interrogation techniques.,19
658 | jack kemp,20
659 | hummers,10
660 | Stalemated election,20
661 | Deaf Mexican immigrants held captive in North Carolina,9
662 | US near a trade deal with china,18
663 | Enron scandal,15
664 | UN resolution on Iraq's future,16
665 | Real estate broker completes largest transaction in U.S. history and buys property along East River,15
666 | U.S. has not been tracking weapons intended for Iraqi security forces,16
667 | clinton in bosnia,19
668 | reactions to the start of the Iraq war in America,16
669 | Bush and Cheney reaching out to democrats,20
670 | Hezbollah works to rebuild Lebanon to win popular support,19
671 | "peer-to-peer services being used for pornography, not just music",17
672 | in NYC.,21
673 | undecided washington sentate race,20
674 | France reveals evidence against Nazi war criminal to stand trial,19
675 | mentally ill health care,3
676 | Special report on use of wireless technology in America.,17
677 | 2000 campaign; bush after college,20
678 | first soldier killed in Afghanistan buried,16
679 | Washington State voters face affirmative action measure,2
680 | egyptians joining the palestinian cause,19
681 | IRA to help disarm ulster fighters,19
682 | Hussein rallies his troops,19
683 | bob dole challenges clinton's ethics,20
684 | Remembering Dr. Martin Luther King Jr,2
685 | Increasing nuclear proliferation despite diplomatic agreements; damage wrought by export of Pakistani nuclear expertise.,16
686 | Republicans questioning of Bush's ban on stem-cell research.,3
687 | new subpoenas over campaign finance violations,20
688 | Chinese trade bill,18
689 | Veteran health care,16
690 | women in India changing their roles,19
691 | new communications law,17
692 | poll on opinions of new yorkers,24
693 | Federal investigators link deaths to same suspect,12
694 | shootout in the bronx,12
695 | technological breakthrough in computing,17
696 | Revelation of prescient pre-war report about danger of post-Saddam Iraqi civil war; administration attempts to minimize its significance.,16
697 | Iraq's oil industry,19
698 | Britain: Prime Minister Campaigns,19
699 | Iraqi war casualties are up sharply,16
700 | US donations to jewish settlers,19
701 | "Bush announces government will take stronger role in airline security, will station troops",10
702 | hospital worker died of anthrax inhalation,16
703 | Shooting at the Empire State Building; the gunman,12
704 | limits to Putin's power,19
705 | graves uncovered in Sri Lanka have not lead to charges,19
706 | "Indian computer security is bad, files at risk",21
707 | fossils of second largest dinosaur found in Egypt,19
708 | naturalized citizens in NY,9
709 | NATO: Russia agrees to alliance expansion,16
710 | Virginia offer of scholarships to black students denied access to high school during segregation wars.,2
711 | greenspan says the economy is good,1
712 | FBI agent charged with spying for russia,16
713 | California plan for large-scale cutbacks in greenhouse-gas emissions.,7
714 | Microsoft antitrust case,15
715 | House approved bill to turn airport security to government,10
716 | irish peace referendum,19
717 | Trinity College and the revival of Hartford,6
718 | Clinton heart surgery.,20
719 | online prescription drug sales,3
720 | Sotheby's chairman convicted of price-fixing,12
721 | Key role of 9/11 widows in formation and activities of the 9/11 Comission.,16
722 | Rice reviews progress in Iraq; rejects exit strategy,16
723 | Al Qaeda in Karachi,16
724 | China to Protect Private Property Rights; Boon to Entreprenuerial Class,19
725 | welfare reform in Italy,19
726 | More on the Clinton inauguration,20
727 | South Carolina campaign,20
728 | Employment trends in the U.S.,1
729 | "stock slump hurts 401Ks, makes many rethink retirement plans",15
730 | European opinion,19
731 | Clinton denies Paula Jones' accusations,20
732 | exit polling from the presidential election,20
733 | U.S. shift to support cease fire in Lebanon started frantic round of negotiations in U.N.,19
734 | senate approves online contracts,15
735 | tax plans in campaign,1
736 | Bush calls for end to loans to buy stock,15
737 | Panama takes control of the Panama canal,19
738 | halliburton overcharging for fuel,16
739 | Foul Air and Water Part of Cost of Boom in China's Exports,19
740 | housing conditions for the poor,14
741 | drug review process,3
742 | Gates to create foundation to bring internet into public libraries,6
743 | Cubans still struggling to make ends meet,19
744 | Supreme Court nomination; partisan dispute,20
745 | Bali bombing,19
746 | Doctors' pay regains ground despite the effects of HMOs,3
747 | suicide scandal in Germany,19
748 | Cruise lines pay little income tax because of loophole in tax law,10
749 | More Palestinian and Israeli struggle,19
750 | Japanese elections,19
751 | burning of chemical weapons,7
752 | "New York law holding car owners liable for car accidents, whatever the driver, limiting car-leasing in the State.",10
753 | "weather data predicts years of frequent, stronger hurricanes",17
754 | tobacco settlement money held up by new york politics,3
755 | FTC ruling on doctors to let them band together,3
756 | israel and lebanon relations,19
757 | NYC Chancellor cuts school budget by cutting program spending,6
758 | I.B.M. guilty of selling advanced computers to Russian nuclear weapons laboratory,12
759 | Hugo Chavez opposition,19
760 | heightened terror alert,16
761 | Meningitis Epidemic in West Africa,19
762 | Iraqi constitution to be voted on,19
763 | uprising in Ivory Coast,19
764 | Research showing that aspirin use can help prevent breast cancer.,3
765 | more questions arising over Clinton pardons,20
766 | Prosecutors stop Haitian murder suspect's efforts to leave country,12
767 | fraud claim in Iraqi election,19
768 | Businesses use people's YouTube and MySpace videos as free advertising,17
769 | Cleanliness takes a back seat to financial survival in airline industry,10
770 | scores of top students rise while those of average students decline on reading tests,6
771 | mideast violence,19
772 | applying for college,6
773 | Israel requests shipment of U.S. artillery rockets,16
774 | "Bush threatens to veto Senate's patients' bill of rights, Democrats say he'll have to accept it",3
775 | Difficulties for NYC mayor Michael Bloomberg in playing host to the Republican convention.,20
776 | states propose reducing medicaid,3
777 | unauthorized wiretaps,2
778 | house races,20
779 | debate on war strategy disappears after advance on Baghdad,16
780 | British bracing for Bush protesters,19
781 | Bush rejects a quick pullout from Iraq,16
782 | interrogation of terror suspects,16
783 | NYC to pay overtime to police and fire chiefs despite concerns,12
784 | "France: parliament elections, little mention of economic plan",19
785 | campigner for US senate avoids SEC investigation,20
786 | Declining energy prices has caused the U.S. to give up all gains made in conserving energy,8
787 | first Latino becomes Los Angeles mayor in more than a century,24
788 | "New York City Budget, tax cuts",24
789 | Young blacks link tobacco use to marijuana,3
790 | U.S. to restore relations with Libya,19
791 | Bolivian Leader in Exile After Efforts to Eradicate Coca,19
792 | Chinese backlash against closing of news journal; censorship,19
793 | use of carbon fuel declining,7
794 | Sky scrapers advertising value,15
795 | Europeans begin to fear growing Muslim minorities in their countries,19
796 | bombing in jerusalem,19
797 | Clinton urges bipartisanship to get budget negotiations moving again,1
798 | Bush prepares for war,16
799 | Sunni militia force Shiite bakeries in Baghdad to close,19
800 | us peace efforts in the middle east,19
801 | Pressure grows on GOP House leadership over Foley scandal,20
802 | new organ transplant strategies,3
803 | Bankers Trust Company admits to diverting money to enhance financial performance,12
804 | Lebanon's ex-premier killed in car bomb attack; Syrian influence in Lebanon,19
805 | Roberts Confirmation hearings,20
806 | 3 rich kids moved around after mother kills their father,12
807 | copyright law changes,15
808 | Bush mideast speech,19
809 | auto industry agrees to design changes to improve safety,10
810 | soldier leaving for iraq,16
811 | "witness accuses former boss for price-fixing between Sotheby's, Christy's",12
812 | Law Enforcement fears that domestic terrorist attacks are linked by white supremacists,12
813 | Controversy over acquisition of U.S. ports by state-owned Middle-Eastern countries.,21
814 | White house dealings with enron,20
815 | wealthy taking year off before starting college,6
816 | Debate over who will pay for repairs of beach erosion in New Jersey,7
817 | 1996 Election; public opinion polls,20
818 | Israeli commandos raid Hezbollah stronghold despite truce,19
819 | Stricter elementary school standards,6
820 | Questionable legal practices in Checnya highlighted in torture of woman accused of adultery,19
821 | Saddam Hussein; Life in Iraq,19
822 | re-designed station wagons enter the auto market,10
823 | US tells citizens in India to leave,19
824 | "Internet message boards allows company employees to vent, sometimes ugly, conversations",5
825 | europe bans british beef,19
826 | priest abuse scandal,12
827 | terror suspect,16
828 | Iran blamed for the killing of Iranian dissidents in Germany,19
829 | Supreme court rules on recruiting at universities,16
830 | India Genral Election; Congress Party Losing Power,19
831 | Generic AIDS Drug makers want to sell in South Africa,19
832 | Taiwan election,19
833 | Juveniles punished for killing 5 people in school shooting,6
834 | calling up reserves in the iraq war,16
835 | Murders of women along the Mexican border,19
836 | the political future of Indonesia,19
837 | Clintom impeachment trial,20
838 | Fish market in NYC closing,15
839 | elderly people in philadelphia,3
840 | Bush won't continue plan to rid of weapons with plutonium,16
841 | Al Sharpton's bid for the Presidency,20
842 | kennedy and castro,19
843 | Markets Surge after Investor concers about inflation eased,1
844 | NJ doctors protest high insurance costs,3
845 | Bush seeks to expand NAFTA throughout Central and South America,18
846 | Bombings in London,19
847 | communications equipment maker announces lay-offs,5
848 | The difficuluties of scheduling a war; Olympics; Islamic holidays,16
849 | New York traffic court,12
850 | "Russian President fired Prime Minister and appointed a former KGB officer, Vladmir Putin",19
851 | conjoined twins separated,3
852 | Livestock testing at state fairs,4
853 | NJ budget problems,24
854 | NJ troopers using hotel staffs to stop drug smugglers,12
855 | Iraqi Prime Minister denounces Israeli attacks on Lebanon,19
856 | medicare-prescription drug benefits,3
857 | north korea has access to plutonium,16
858 | Comcast bid for Disney; probable strategy of Disney leader to counter it.,15
859 | surrogate mothers have babies for gay couples,2
860 | Civilian death toll in Iraq reaches new high,16
861 | culture war in Israel,19
862 | US plans for a palestinian state,19
863 | Guilty verdict on prominent investment banker.,12
864 | mourning a school shooting in scotland,19
865 | US attacks iraq,16
866 | Tactics of American unit against Shiite militia of Moqtada al-Sadr.,16
867 | Red Cross criticizes Guantanamo Bay,19
868 | Israeli Prime Minister; Isreali cabinet,19
869 | virus on a cruise ship,3
870 | Russian president's plan to tighten executive control over the legislative branch and local governents; stated rationale in terrorist threat.,19
871 | Chief executives see 22 percent raise in salary in last decade,5
872 | political terrorists kill Cambodians at democratic rally,19
873 | John Kerry and the Cambodian Swift Boat incident,20
874 | "Health care costs, Medicare",3
875 | recount analysis,20
876 | poll find NYC split over mayoral candidates,24
877 | Democratic primary; further articles.,20
878 | Human embryo cloning in South Korea; significance.,3
879 | NYC school board dispute,6
880 | American offensive against Shiite militia.,16
881 | supreme court ends a ban on ads for casino gambling,15
882 | Abortion doctor eulogized as killer is sought,2
883 | US reliance on Saudi oil,8
884 | School uniforms in Public School 7 in New York,6
885 | bob dole candidacy,20
886 | Bush talks about AIDS in South Africa,19
887 | Army veteran accuses her top-ranked Army boss with sexual assault,16
888 | nytimes neediest cases charity,9
889 | Abu Ghraib prisoner abuse scandal; interrogation unit alleges having supplied early reports to superior officers.,19
890 | big landlord,14
891 | Mad Cow disease in the US,4
892 | Reminder that standard time has resumed,15
893 | Iranian politics,19
894 | Afghani president's description of private armies as the principal threat to his country.,19
895 | mergers in europe,19
896 | Isreali curfew in nablus,19
897 | Immigration debate in Congress,9
898 | German election results,19
899 | man had hand cut off by Taliban on charge of theft,19
900 | israel and lebanon occupation,19
901 | Merging of American Companies,15
902 | Profile of American contractor beheaded in Iraq.,16
903 | Theory of gene flaw proposed to explain evolution of human beings.,17
904 | Life of John Roberts,20
905 | Inefficient evidence to look into Interior Secretary in his role in denial of Indian casino application,21
906 | peace in Angola,19
907 | bad week for Italian PM,19
908 | Senate rejects rival proposal to campaign finance law,20
909 | radio communication in NYC,12
910 | Re-development of WTO site.,21
911 | patents on drugs end,3
912 | nyc labor disputes,5
913 | change in laws causes deportations,9
914 | Chinese Poor Struggle with HIV/AIDS,19
915 | smallpox vaccine,16
916 | High cost of potentially effective anti-cancer drug,3
917 | airport security,10
918 | Police protest over pay,12
919 | Internet message boards facilitate dialogue on race,17
920 | negotiations on interim Afghan government stall,16
921 | captured spy chief returns to Peru,19
922 | Sheriffs department in columbine under scrutiny,6
923 | Supreme Court divided on interpretation of Clean Water Act,7
924 | Senate confirms Ashcroft,20
925 | campaign finance limits,20
926 | Chicago orders big retail stores to raise minimum wage,5
927 | Army recruiting helped by bad economy,16
928 | Federal Reserve considers cutting interest rates,1
929 | Conservative Christian politician hurt by ties to Jack Abramoff lobbying scandal,20
930 | Bush's Social Security reform plan,13
931 | osteoporosis gene found,3
932 | defect in heart devices,3
933 | US dispatches agents to Germany to uncover terrorist network,16
934 | bacteria in chickens,4
935 | people who live in homes where BTK murdered,12
936 | affirmative action poll,2
937 | suicide attack in Israel,19
938 | Space exploration; infrequent success therein and high costs thereof.,17
939 | politicians on parade,20
940 | whitewater scandal trial,20
941 | AT&T's withdrawal from the residential-phone business.,17
942 | ground zero cleanup nears the end,16
943 | large purchase of woods and wetlands for public use and preservation,7
944 | Danger to VA mental health services from large estimated numbers of future patients among veterans of Iraq war.,16
945 | Dow Chemical Company knowingly deceived women on breast implants,3
946 | Clinton scanal fallout,20
947 | lobbyists sway legislators on Chinese trade policy,18
948 | Standard time resumes,15
949 | second day of blackout continues as company makes accusations,8
950 | bob dole attacks clinton in the presidential debate,20
951 | PHOTO: Senator John Glenn at Brooks Air Force Base,20
952 | New York's hospitals receive Federal aid package,3
953 | East Germany's economic revival,19
954 | shady democratic party fund raising,20
955 | Bush aides defending budget in Congress,1
956 | New York State Legislature approves early retirement for most experienced teachers,6
957 | "fireworks explosions in Lima, Peru kill hundreds",19
958 | House Democrats choose Steny Hoyer as House majority leader over Nancy Pelosi's choice,20
959 | McCain endorses Bush,20
960 | fighting in the mideast,19
961 | reform party presidential candidate,20
962 | Iraq; Sadam Hussein; split in Arab nations,19
963 | inquiry into columbia breakup,17
964 | barak agrees to halt settlements in the west bank,19
965 | Comcast's Disney bid; rejection by Disney.,15
966 | welfare and the states,13
967 | NY mayor race,24
968 | Bush challenges Mideast to try democracy,16
969 | Swiss failed to payback Nazi payments after WWII,19
970 | North Korea fires ballistic missile over Japan,19
971 | US soldiers killed in Iraq,16
972 | Medpartners/Mullikin to Buy Caremark International; Creation of A Large Physician Management Company,15
973 | guilty verdict in a wendy's murder case,12
974 | Bosnian forces pull back troops and weapons from front line,19
975 | Iowa house race,20
976 | terrorism,19
977 | chief fundraiser for Senator may have received illegal campaign contributions,20
978 | Iraqis want more power to control themselves,16
979 | Prosecution of NYC school superintendent charged with $1m embezzlementt; difficulties in detecting embezzlement in districts with high per-student spending.,12
980 | "FDA proposes ban on importing blood from Europe, upsetting European suppliers",3
981 | indonesian cleric falls ill,19
982 | "Japan's Economy in Debt, despite Japanese ability to save",19
983 | GAO files suit over cheney energy meetings,20
984 | "Oxycontin sales grew, but at a cost",3
985 | costs on loans,15
986 | Bristol Myers Squibb to Yield Patent Rights Over Aids Drugs in Africa,19
987 | china trade deal,18
988 | "Former United States Housing Secretary pleads guilty, lied to FBI",20
989 | Middle East politics; Ariel Sharon,19
990 | Governor-elect Eliot Spitzer likely to ask for ouster of State Comptroller Alan Hevesi,24
991 | domestic violence courts in NY,12
992 | China Bans Text Messaging/E-mail in Protests Against Japan,19
993 | Lacking relief and repair effort in Tsunami ravaged Indonesia,19
994 | Bush declared winner by Florida,20
995 | Palestinians and Israels hold off peace talks despite Madeleine Albright's visit,19
996 | stock market,1
997 | china sending missiles to iran,16
998 | Bomb attacks in Iraq; U.S. soldiers killed.,16
999 | Tora Bora offers many hiding places for Al Qaeda fighters,16
1000 |
--------------------------------------------------------------------------------
/data/tutorial_train.csv:
--------------------------------------------------------------------------------
1 | text,label
2 | "AIDS in prison, treatment costs overwhelm prison budgets",12
3 | olympics security,19
4 | police brutality,12
5 | Iranian nuclear program; deal with European Union and its leaving of Iran free to develop plutonium.,16
6 | terror alert raised,16
7 | Job report shows unexpected vigor for US economy,5
8 | Clinton proposes West Bank Plan to Isreal's Prime Minister Netanyahu,19
9 | Senators debate Iraq War policy,16
10 | Myrtle Beach,14
11 | china visit,19
12 | elections in Rwanda,19
13 | Sudan tires of war between Arabs and Christians,19
14 | Enron scandal,12
15 | primaries - McCain,20
16 | US to sign a treaty banning land mines,16
17 | ross perot to run for president,20
18 | European law prohibits American-style buying and selling of personal data,19
19 | clinton's list of donors,20
20 | Guantanamo Bay opposition,2
21 | US demands Iraq disarm in a meeting with allies,16
22 | conservatives attack job training bill,5
23 | Underground rumors and popularity of supposed obesity miracle drug; doctors' call for caution.,3
24 | old style farm in Illinois,4
25 | Questions about whether NYC mayor Bloomberg's plan to impose strict requirements for passing the third grade would not be counter-productive.,6
26 | nyc schools chancellor ousted,6
27 | Fed Reserve Chairman and former treasury sec against dropping tax to stimulate economy,1
28 | supreme court ruling on redistricting,20
29 | Governor of California to sign welfare law to move aid recipients into jobs,13
30 | AT&T may split up,15
31 | immgration crackdown at walmart,9
32 | Urban development programs to replace high-rise public housing,14
33 | jungle trees hides ancient archetectual buildings,19
34 | Difficulties faced by prospective parents whose unborn child is diagnosed with serious conditions.,12
35 | NY school cirriculum,6
36 | witness against Al-Qaeda,19
37 | evidence from 1918 flu found in genetics,3
38 | ground zero reconstuction,21
39 | Growing field of geriatric care managers,3
40 | israel may release arafat,19
41 | billboards and smoking,3
42 | push for a nationwide product liability bill,15
43 | White House selection for new Treasury Secretary,20
44 | Internal pressure for reforms in China; use of centenary of Deng Xiaoping for their promotion.,19
45 | Afghan girls and education,19
46 | Growth in the number of uninsured in US,3
47 | Israel captures hebron,19
48 | plan to revamp Lower Manhatten transit network,10
49 | AIDS medication; AZT,3
50 | supreme court ruling on vouchers for private schools,6
51 | Canadian sues US for detaining and beating him for 10 months,2
52 | New spy gear used in Iraq,16
53 | US-French split over iraq,16
54 | get out the vote campaigns,20
55 | Two Congressmen and their recovery from alcoholism,3
56 | retail boom,1
57 | medicare drug plan backed by aarp,3
58 | Blue collar workers whose jobs have gone overseas find themselves abandoned by labor unions,5
59 | State Department annual report finds an increase in terrorist incidents.,16
60 | Japan struggles to adopt Western-style capitalism,19
61 | US and russia sign a pact on nuclear arms cuts,16
62 | New Orleans reconstruction,15
63 | legal battle begins over expansion of government's powers in fighting terrorism,19
64 | independent council investigating the clintons,20
65 | Thanksgiving in immigrant households.,9
66 | genetic engineering regulations,4
67 | E-bay auctions,15
68 | Yeltsin nominates Primakov as Prime Minister,19
69 | ken starr testimony,20
70 | Fire disrupts Brooklyn subway lines,10
71 | war in Iraq:US abuse of prioners in Afghanistan,19
72 | Clinton impeachment trial,20
73 | Afghanistan reconstruction; mine- and explosives-clearing in preparation for airport re-opening.,16
74 | bush legislative plan,20
75 | Britain's royal navy actively recruits homosexuals,19
76 | "Series on life of people with A.L.S., or Lou Gherig's disease.",3
77 | poll on americans' opinions on microsoft,15
78 | "Saudi leaders voice public support for US, but are sensitive to launching military operations",16
79 | Man on death row appeals for clemency,12
80 | NY senate debate,20
81 | more men are experiencing sexual harrassment from other males,2
82 | Chinese Sneak Into Taiwan in Hopes of Prosperity,19
83 | anniversary of waco fire,16
84 | "Clinton Retirement, Calls on Republicans",20
85 | "Damage done to Columbia University's Research, Result of Power Outage in New York City",8
86 | the number of dead and missing firefighters,16
87 | Israel studying formerly unthinkable proposals,19
88 | Profile of current population of drug felons in NYC prisons.,12
89 | National Governors' Association decide not to try to change new Federal welfare law; pressure from Congressional Republicans; creating a compromise over benefits for legal immigrants,13
90 | Elian Gonzalez,9
91 | stroke therapy,3
92 | Congressman sentenced in bribery case,20
93 | GOP primary,20
94 | rapid expansion of technology requires area codes in phone calls,17
95 | "US warns China to abstain from military action against Taiwan, Urge for peaceful resolve",19
96 | Kerry fires campaign chief,20
97 | cancer drugs in mice,3
98 | Saudi Arabia faces end of oil boom and beginning of recession,19
99 | sniper attacks,12
100 | Germany accuses U.S. of illegally kidnapping innocent terror suspect; CIA,16
101 | Home prices rise leading to bidding wars,14
102 | Terror's influence on the art world,19
103 | countries cracking down on cartels,18
104 | New discoveries about the nature of brain injury,3
105 | "despite depictions from credit card companies, most Americans bamkrupt due to hard luck",15
106 | US college students under quarantine from SARS,3
107 | murder charge,12
108 | Kofi Annan visits Iraq,19
109 | Tenet healthcare settles fraud accusations,3
110 | Coretta Scott King's funeral service,2
111 | Congressional Republicans and White House officials near budget agreement,1
112 | "Hurricane Katrina, New Orleans police force falling apart",12
113 | car bomb in pakistan outside American Consulate,16
114 | President Bush wants to send more troops to Iraq,16
115 | Democrat Senators-to-Governors; Corzine announcement of run for post of NJ Governor.,20
116 | death penalty debate,12
117 | clinton orders forces to the persian gulf,16
118 | NY times neediest cases fund,17
119 | Narrow approval of Central American free-trade deal.,18
120 | legacy of the los angeles riots,2
121 | USS cole returns to duty,16
122 | Chinese Government Publicizes Falun Gong Self-Immolation,19
123 | Ariel Sharon to found a new party,19
124 | U.S. troops capture high number of prisoners in Iraq not protected under Geneva convention,16
125 | water management in the west,21
126 | anti-american sentiment in Falluja,16
127 | black businessmen,15
128 | New York Stock Exchange C.E.O. accused of stealing money,15
129 | freed Hamas prisoners,19
130 | AOL accidentally releases the identity of one of its users to public,2
131 | Former President Bush promotes business interests of equity firm,18
132 | killings in kosovo,16
133 | Women giving birth in Africa,19
134 | Overview of the 1998 campaigns,20
135 | Dedication of WWII Memorial in Washington D.C..,21
136 | NASA experiment fails,17
137 | Federal Panel overrules science fraud charges against Thereza Imanishi- Kari,3
138 | mutual funds trying to assure their investors,12
139 | bickering over Iraq in European countries,16
140 | Shooting of Amadou Diallo results in citizens not trusting police,12
141 | Refusal of Sonia Ghandi to serve as prime minister.,19
142 | Giuliani upset by lesser charges in criminal death of a police officer,12
143 | election problems in duval county florida,20
144 | couriers serve immigrants,9
145 | China has Trouble Becoming Producer of Crocodile Goods,19
146 | Netanyahu becomes first Prime Minister to cross into Gaza to meet with Palestinians,19
147 | "Clinton and Dole fundraise in same areas, benefit from added attention",20
148 | "Presidential election; rise to prominence of Barak Obama, culminating in his selection as keynote speaker at the Democratic convention.",20
149 | NYC councilman killed at city hall,12
150 | Lieberman's presidential candidacy,20
151 | Child welfare in New Jersey,12
152 | Clark enters democratic race for president,20
153 | Liberians ask for US help,16
154 | US generals meet in a palace to discuss rebuilding,16
155 | father who killed at kids hockey game sentenced,12
156 | police killing youth in venezuela,19
157 | Bush warns Hussein to allow UN inspectors to search for wmds,16
158 | Catholic priest abuse scandal,12
159 | Scandal involving Tom DeLay's family being paid by his campaign committees,20
160 | death of 2 afghan inmates,19
161 | Bush calls for allies to help with transferring Iraq sovereignty,16
162 | Legal status of terrorist suspects; federal judge's halting of military trial of Osama Bin Laden's driver.,2
163 | soaring birthrates in NYC,3
164 | Berlin: Where it is today compared with the past,19
165 | Orchard owners negotiating with home developers rather than collecting apples,4
166 | Advice to investors and reflection on financial lessons,15
167 | corporation and the community its in,15
168 | Pataki calls for financing to promote environment,7
169 | "Zaire: looking into the new era, economics",19
170 | Philip Morris trying to win favor from New York Legislature,3
171 | people travelling for July 4 holiday,20
172 | New welfare programs allow welfare recipients to keep more benefits when they get jobs,13
173 | guerrillas in the phillipenes,19
174 | hiring of Foreign Service officers,19
175 | Growth in demand for services of individual taking down public pay-phone locations and numbers.,17
176 | Clinton impeachment trial,20
177 | states changing stringent anticrime measures,12
178 | Court awards billions of dollars to New York City schools,6
179 | Chinese trade bill,18
180 | Cleveland: No Cell Phones While Driving,10
181 | NYC Mayor Bloomberg announces large budget proposal and estimates there will be a large surplus.,24
182 | Bin Laden linked to kenya embassy bombing,16
183 | new accounting standards for local and state governments,24
184 | immigration laws,9
185 | information on bin laden whereabouts found,19
186 | U.S. presence in Iraq; U.S. propaganda efforts,16
187 | postwar chaos in Mosul leads to anger at US,16
188 | mediator fails to find settlement between California and power suppliers,8
189 | Defense Dept considers changes to military tribunals,2
190 | new building to be built in times square,14
191 | Federal Reserve unexpectedly cuts interest rates,1
192 | group of 14 senators averts showdown over judges,20
193 | democratic convention,20
194 | teenager burns a boat belonging to bush,12
195 | Giuliani defends police officers in confrontation with marchers at Million Youth march,12
196 | strikes in Yugoslavia,19
197 | Discovery of ancient ruins in eastern Utah.,17
198 | Bush administration expanding NSA and bypassing Congress,20
199 | nytimes neediest cases fund ad,17
200 | bosnian election,19
201 | louisiana caucuses,20
202 | congress and white house agree on budget,1
203 | economy trouble,1
204 | new elections in Peru,19
205 | US visa policy called unfair and arbitrary,9
206 | prosecutors want a genetic test of lewinsky's dress,20
207 | Federal appeals rules to continue with recall as scheduled,24
208 | congress to act on firestone tire problems,15
209 | rape of a jogger in central park,12
210 | war in chechnya,19
211 | "Carnival docks cruise ships in Brooklyn, revitalizing the port",14
212 | Kosovo: attack by Serbs ends cease-fire,19
213 | Hyde accedes to Democratic demands on some inquiry issues,20
214 | efforts to move people from welfare to work,13
215 | Iraq reconstruction; major creditors of Iraq agreement for large debt write-off.,16
216 | Gov. Pataki negotiates agreement with Consolidated Edison favoring businesses,8
217 | Supreme Court declines to hear Terri Schiavo case,2
218 | Cambodia: leader resists punishing Khmer Rouge,19
219 | US role in Afghan rebuilding,16
220 | Bush's campaign strategy,20
221 | bush proposes medicare overhaul,3
222 | clinton talks about sending peacekeepers in kosovo,16
223 | Negative impact of global economic crisis on Russia,19
224 | legal reform in Morocco,19
225 | 9/11 memorial service,21
226 | white house inquiry-hilary clinton,20
227 | Changing role of juries in the legal system,12
228 | Difficulties in procuring non-oil sources of energy.,8
229 | Senator blocks promotions to get planes for Idaho national guard,16
230 | UN treaty bans nuclear testing,16
231 | terrorism,19
232 | schools looking for principals,6
233 | investigation into anthrax mailings,16
234 | soldiers in Iraq and Afghanistan to have extended tours of duty,16
235 | Saddam Hussein captivity and trial; Iraqi officials expectation that U.S. will soon transfer him to their custody.,16
236 | Boeing Company Stock Plummets,15
237 | Stock Market Bubble burst for technology stocks,1
238 | new york legislature,24
239 | campaign finance reform,20
240 | NY times neediest cases fund,13
241 | Clinton impeachment trial,20
242 | vietnamese immigration to the US,9
243 | serbs attacks kosovo rebels,19
244 | louima case,12
245 | Bush lowers expectations of a quick war,16
246 | Afghan tribes come before government,19
247 | Clinton's state of the union,20
248 | Russian bank scandal,12
249 | Underground gambling games in New York City,15
250 | clinton puts sanctions on iran and libya,18
251 | reorganization of board of education HQ,6
252 | investigation into catholic preist abuse scandal made public,12
253 | Vermont sport of hunting fish with firearms.,12
254 | abortion pill,2
255 | bill bradley needs some primary victories,20
256 | investigation of hillary clinton,20
257 | Growth of federally financed tutoring industry as a result of No Child Left Behind,6
258 | Cyclospora outbreak in raspberries,4
259 | Israel-Palestine; death of children in the territories; conflicting accounts.,19
260 | Working for the Clintons; Margaret A. Williams,20
261 | Inquiry into problems with U.N.'s Iraq oil-for-food program,19
262 | Presidential elections; Primat der Innenpolitik in both candidates' campaigns.,20
263 | cuban exiles dying off,19
264 | Study of nuns helps with understanding Alzheimer's disease,3
265 | 11 EU countries prepare to introduce the Euro,19
266 | surge in donations to WTC victims leads to items sitting unused,16
267 | NJ senate race,20
268 | Bush looks elsewhere for oil and gas resources after facing opposition in Artic,8
269 | debt in the apple industry after marketing new Red Delicious,4
270 | White House and Rupublican negotiators make push toward agreement on budget and tax cuts,1
271 | intelligence projections,16
272 | Medicaid system not adequately serving poor in the Bronx,3
273 | Federal government producing news clips for positive public relations,20
274 | Elections and violence directed toward election workers in Iraq,19
275 | catholic abuse scandal,12
276 | NJ senate race,20
277 | "Partial, de-facto transfer of powers to Iraqi authorities prior to date of formal devolution of sovereignity.",16
278 | nyc subway station renovations,10
279 | Two Dominicans extradited to the U.S. to face drug and murder charges,12
280 | supreme court reviews violence against women law,2
281 | Iraqis angry over Turkish role in reconstruction,19
282 | accidental chinese embassy bombing,16
283 | censorship of school drama material that religious community members call immoral,2
284 | airlines to inspect Boeing fuel pumps,10
285 | Hurricane Katrina victims in new unfamiliar areas,14
286 | cuban embargo,19
287 | Oneida Indian lawsuit against New York for unlawfully acquiring land,21
288 | Supreme court rules on death row prisoners' challenge rights,12
289 | Catholic preist abuse scandal,12
290 | Pakistani foreign bank corrpution,19
291 | Book details Secretary of State Colin Powell's warnings to Bush regarding the invasion of Iraq.,16
292 | 9/11 inquiry,16
293 | Increasing mortality of elderly WWII veterans.,16
294 | Presidential election; voters that remain uncertain after final debate.,20
295 | election campaigns,20
296 | Maine's prescription drug plan approved by Supreme Court,3
297 | Terror Suspect freed of U.S. and allowed to go to Jordan,19
298 | GOP praises clinton trip to china,19
299 | healthcare,3
300 | senate kills nuclear test ban treaty,16
301 | Senator votes to keep bill that allows for overhaling of national political campaign financing,20
302 | Bush plan to change Medicare,3
303 | witness describes Bin Laden plotting against US,16
304 | Paula Jones and her accusations of sexual harassment against Clinton,20
305 | Pentagon trying to attack supreme leader of Taliban,16
306 | Tawana Brawley Trial; raped by white men; Rev. Al Sharpton stands as witness,2
307 | Vision,20
308 | Scientist suspected as China spy when hired fired for security breaches,16
309 | animal rights and chinese food,7
310 | Vodafone's AT&T Wireless bid.,15
311 | South Korea on terrorism alert; 2 men attempt to assasinate North Korean defector; Increase in Cold War Tension,19
312 | "Cruise Line, Sexual Assaults Disclosed",12
313 | pictures of combat in Afghanistan,16
314 | US suing intel for antitrust,15
315 | Iraqis change their names from Saddam to avoid being killed by Shiite militia,19
316 | italian election,19
317 | right to protest in hong kong will be curtailed,19
318 | Presidential election; Republican convention; further articles.,20
319 | Israel buries those killed in bus rampage,19
320 | new wtc tower,21
321 | SEC looking for new auidting head,15
322 | The U.S. brings Palestinian and Israeli leaders together for Mideast negotiations,19
323 | Role of lobbyists in Madison Square Garden stadium deal in New York,14
324 | wounded US army soldiers,16
325 | NYC plan to export trash will take more time and money than projected,7
326 | Militas battle for southeastern Iraqi city,19
327 | NATO presses Serbs to release 16 Bosnian citizens,16
328 | 2000 campaign; gun control,12
329 | House to review nation's intelligence agencies,16
330 | crackdown on abusive nursing homes,3
331 | candidates touting their experience,20
332 | Continuing corruption scandal at former Connecticut Governor's office.,24
333 | Chinese obtained U.S. technology and arms secrets,16
334 | WHO decides to kill the last smallpox viruses,3
335 | Three-day Jewish centennial conference in Switzerland confronts Holocaust,19
336 | American car makers plan for larger models,10
337 | China acting on fuel economy standards,19
338 | migrant smuggling route through Africa,19
339 | "downturn in telecommunications cutting jobs, affecting region",1
340 | sharon continues israeli assault in west bank,19
341 | story of two suicide bombers,19
342 | Clinton talks to TV meteorologists about global warming,7
343 | republicans urge a curb on gun sales,12
344 | last ditch efforts to collect soft money donations,20
345 | memorial day,20
346 | Bush trying to smooth relations with McCain,20
347 | New York's immigrations courts hurt by increased burden,9
348 | Assassination of Russian-backed Chechen president.,19
349 | Virginity Testing in Africa,19
350 | Traffic in body parts discovered at UCLA.,3
351 | Presidents of top universities to step down,6
352 | Improvement among America's elementary school students,6
353 | U.S. intelligence officials had received warning of coming attack on American Embassy,16
354 | terror insurance in Iraq,19
355 | Vietnam remembers Vietnam War,19
356 | Democratic primary; significance of Southern states.,20
357 | congressional resolution on Iraq,16
358 | Campaign has raised money for Clinton legal defense,20
359 | President Clinton; sex case,20
360 | Pentagon succeeds in shooting down an IBM with an interceptor,16
361 | investigation into bombing of UN building in Iraq,16
362 | new york vacation spots and the clintons,20
363 | welfare caseworkers,13
364 | Iraq insurgency; assault by Marines on city held by Shiite milita.,16
365 | Clinton Impeachment Trial,20
366 | gun control debate and suburban districts,12
367 | new products with the Windows operating system.,15
368 | plans for WTC site,14
369 | Crew withdraws threat to resign after tensions with Giuliani over voucher issue eases,6
370 | AT&T to cut long-distance rates,17
371 | China's Need for Metal Keeps U.S. Scrap Dealers Scrounging,18
372 | gay men in military,2
373 | presidential election results,20
374 | "Iran drops Rushdie death threat, and Britain restores full diplomatic relation with Tehran",19
375 | Gore's debate style,20
376 | expressing doubt about document.,20
377 | Higher rate of AIDS incidence in African Americans; role of prison-time in spreading AIDS in black neighborhoods.,3
378 | chief executive of Ford Motor Company to resign,15
379 | stock market falls,1
380 | clinton to require welfare recipients to work,13
381 | suburbs struggling to keep elderly population,14
382 | speech and language gene found,3
383 | bad doctors,3
384 | Clinton Impeachment Trial,20
385 | NY senator will not run again,20
386 | Clinton Urges House to Settle Its Differences Over Gingrich,20
387 | Lott apologizes for Thurmond comments but won't resign,20
388 | Comcast's Disney bid; probable effect on the media industry.,15
389 | Nuclear proliferation; Pakistan gov. admits some of its citizens may have sold data on nuclear weapons.,16
390 | Two men revive inquiry on Waco,16
391 | cooling of real estate merkets,14
392 | Clinton administration cover-up scandal,20
393 | Comission absolves Prime Minister Netanyahu of attack on Hamas official,19
394 | Presidential election; determination of African-Americans to avoid 2000 experience and make their ballots count.,20
395 | Clinton Impeachment Trial,20
396 | US starts direct combat with ground troops in Afghanistan,16
397 | Party change in Minnesota legislature mirrors what will happen to the nation's capitol in a few weeks,20
398 | Protest over killing of Amadou Diallo at memorial service,12
399 | Growing Iraqi Army; capture of men responsible for shooting down helicopter,16
400 | G 7 summit meeting,19
401 | pension changes,5
402 | UN weapons inspectors visit Iran,16
403 | Iraq insurgency; Falluja assault; re-capture of one-third of the city.,16
404 | Iraqi self-rule delayed,16
405 | Ford Explorer problems,15
406 | "Hurricane Katrina aftermath, Bush visits the area",15
407 | Man on a box in Abu Ghraib,19
408 | Order by Iraqi government for arrest of Ahmad Chalabi on charges of counterfeiting.,19
409 | No Child Left Behind Act requires annual testing in math and reading,6
410 | Clinton's defenders attack the credibility of Paula Jones,20
411 | Iraq Crisis: Plans for strike; diplomacy,16
412 | "Democrats gain power in Senate, but presents task of leading effectively",20
413 | Japan tells U.S. that their banking system is acutely short of capital,19
414 | Bush's appointment of the Deputy Director of National Intelligence,20
415 | arafat and the mideast violence,19
416 | MCI offers local residents telephone service,17
417 | clinton scandal,20
418 | Israel's ground war raises potential for casualties,19
419 | counterterrorism in Belgium,19
420 | Terri Schiavo case emboldens religious right,2
421 | Saddam Hussein trial,16
422 | Serbs continue to displace ethnic Albanians and force them to flee,19
423 | rape of a jogger in central park,12
424 | Growth of the richest class in the U.S.,1
425 | Petty Officers admits Japanese trawler was on radar before accident,16
426 | "Insurgency in Afghanistan, Navy Seal rescued",16
427 | Israeli Prime Minister Netanyahu begins aggressive campaign at home after returning from the U.S.,19
428 | Modernization in China threatens traditional tribes,19
429 | NY construction wall fell,10
430 | Iran President delivers inaugural address,19
431 | Peru election,19
432 | supreme court nominee John Roberts,20
433 | new york times wins pulitzers for 9/11 coverage,17
434 | Internet users loosing initial draw to eclectic possibilities,17
435 | clinton scandal,20
436 | teenage brothers admit to killing their father,12
437 | lebanon and israel fighting,19
438 | Iraqi premier moves to establish regional talks,19
439 | Bush administration and problems with Middle East foreign policy,19
440 | Republicans split after impeachment issue,20
441 | gay and lesbian pride parade,2
442 | NYC budget woes; cuts in police force,12
443 | peru hostage crisis,19
444 | MTA to use video surveillance in subways,10
445 | heirs of song suing inspiration for use of song title,15
446 | a body of an everest climber found,12
447 | Republicans not getting any help from economy in reelection bids,20
448 | Schundler's campaign for NJ governor lagging behind opponent,24
449 | lobbyists and medicare,20
450 | class action sexual harassment lawsuit at smith barney,15
451 | How Clinton will be judged in Lewinsky scandal,20
452 | Japan-China ecomonic ties,19
453 | Escape of a U.S. hostage; death of soldiers in attacks.,16
454 | meetings about iraq policy,16
455 | priest abuse scandal,12
456 | Commerce Department's regulation of satellites may harm American satellite makers in foreign markets,18
457 | Afghan government moving forward with efforts to get Osama Bin Laden to leave,16
458 | abortion vote,2
459 | Nationalist veterans demand more retirement benefits fromTaiwan's Government,19
460 | NYC Mayor Michael Bloomberg's property-tax rebate proposal,24
461 | Congressional elections,20
462 | Migrants in Mexico endure poor conditions,19
463 | Prudential Insurance Company; Arthur Ryan asks NJ comissioner to investigate after customer complaints,15
464 | high school suffers with loss of loved ones in terrorist attacks,16
465 | End of assault weapons ban had little effect despite predictions,12
466 | Iraqis killed in suicide bomb attack,19
467 | IMF bailout of south korea,19
468 | massacre of Kosovo men by Serbs,19
469 | supreme court upholds new campaign finance law,20
470 | "Chileans hail death of Augosto Pinochet, but violence mars celebration",19
471 | Encephalitis virus in New York is much more serious,3
472 | justice investigation involves Clinton; campaign financing,20
473 | transit strike in NYC,10
474 | Giuliani criticized on city charter plan,24
475 | return of exiles to Iraq,16
476 | hawaiian estate controversy,24
477 | special military tribunals to try foreigners charged with terrorism,19
478 | news anchor injured in Iraq,16
479 | more job losses,1
480 | pentagon keeping helicopters away from NATO forces,16
481 | Gephardt's views on gays,2
482 | George Allen falters in U.S. Senate race in Virginia,20
483 | terrorism arrest,16
484 | areas offering sanctuary.,16
485 | Iraq blames US for market explosion,16
486 | losses from terrorist attacks could force major airline carriers to bankruptcy,10
487 | election reform bill passed,20
488 | Economic policy leaders from the U.S. and Japan fail to agree on global economic cure,19
489 | US wants defections from iraq,16
490 | Serbian war criminal,16
491 | American spacecraft lands on Mars,17
492 | textbook explaining how-to terrorist activities introduced in embassy bombing trials,16
493 | Presidential election/Democratic primary; further articles.,20
494 | Executive branch failures in preventing 9/11; NSC advisor Rice given memo about Al Qaeda,16
495 | peace in chechnya,19
496 | McCain's wife's new role,20
497 | Saddam Hussein was a regional terror for 30 years,16
498 | Senate races,20
499 | arson in northern ireland,19
500 | Death of Yassir Arafat,19
501 | Scientists using genes to enhance breeding of crops and livestock,4
502 | plea tossed in Iraqi abuse case,19
503 | Iraq reconstruction; large-scale billing fraud by American security company.,16
504 | China tightens rein on freedom of speech and press,19
505 | Iraq insurgency; assault on Falluja; Marines' experience.,16
506 | Board to determine which airlines to aid,10
507 | Clinton's Brother-in-law paid for lobbying pardons,20
508 | Britain: drug testing laws,19
509 | Madeleine Albright brings fighting Kosovo together for peace talks,16
510 | Britain: ruling to allow gay soldiers,19
511 | U.S. is debating talks with Iran over nukes,16
512 | U.S. may try new approach with North Korea,19
513 | New drug to prevent heart failure,3
514 | princess diana divorce,19
515 | Debate on what should be built on the site of the WTC.,14
516 | Quantum experts win Nobel Prizes,17
517 | pinochet can be extradited to stand trial,19
518 | 9/11 aftermath; behavior of office workers near Ground Zero.,16
519 | Mideast violence,19
520 | Bush administration's decision to oppose lawsuits of drug and medical device manufacturers for faulty products.,3
521 | Iraqi and American forces kill insurgents,16
522 | tobacco company damages,3
523 | Presidential election; Bush's National Guard service; investigation into Bush's activities during the period.,20
524 | Video of Jose Padilla reveals life of terror suspect,16
525 | Merger between Kmart and Sears.,15
526 | senate committee divided on clinton's air force secretary nomination,20
527 | Yemen and its foreign policy,19
528 | China: Taklimakan Desert to be cultivated,19
529 | Clinton promises veto in Republican tax cut,20
530 | editor admits crack expose was flawed,17
531 | Superintendant of New York City school district get success in tough area,6
532 | market rally,1
533 | General says US is still at war in Iraq,16
534 | Shortage of nurses in African countries due to large-scale emigration to developed world.,19
535 | "Summer School wrongly ordered for 8,600 students",6
536 | Syrian president buried,19
537 | Other countries question fairness of international aid policies after Asian tsunami,19
538 | ny welfare reform,13
539 | Judge refuses postponement of McVeigh execution,12
540 | louima case,12
541 | child porn case,12
542 | livery cabs a growing problem in NYC,14
543 | Newark in worse shape now than in 1967,14
544 | Donor to Democratic Party accused of receiving foreigners' cash,20
545 | NYSE may move to new jersey,15
546 | Iraq elections; Sunni Arabs' statements that their followers could boycott the election.,19
547 | Congress tries to limit Drug Cartel Money Launderers from sending money to Columbia,12
548 | EPA to clean up homes poisoned by 9/11 dust and ash,7
549 | Bush vows to aid countries in war on terror,16
550 | ross perot barred from debates,20
551 | NJ senate race,20
552 | drunk driver sentence,12
553 | Question whether Viagra will improve the sex life of women,3
554 | Saddam Hussein war crimes trial,16
555 | Exxon and Mobil oil merger,15
556 | Federal court upholds law giving notice of sex offenders,12
557 | juror's education,12
558 | Nuclear proliferation; role of network organized by Pakistani; further revelations thereon.,16
559 | south africa gun law debate,19
560 | China Transitions Leadership Peacefully; Hu Jintao,19
561 | "Death toll rises in Lebanon, Lebanese Prime Minister calls for international involvement",19
562 | Eliot Spitzer and New York gubernatorial race,24
563 | "Roberts Confirmation, relationship between the courts and congress",20
564 | fish going extinct in the hudson,4
565 | Aftermath of meeting between President Clinton and nation's most powerful bankers,15
566 | 2000 campaign for vice president,20
567 | Efforts by Shiite leaders to persuade Moqtada al-Sadr to withdraw militia units and permit the deployment of Iraqi government forces.,16
568 | Trent Lott tries to fix consumer price index,1
569 | Security against a potential New Year's terrorist attack.,16
570 | bush demands israeli withdrawal,19
571 | piracy in mexico,15
572 | new land conservation effort,7
573 | Hezbollah and Israel both choose violence to resolve recent conflict,19
574 | UN allows Iraq to export oil to help civilian population,19
575 | Poor conditions in Russia impair the ability of figure skaters to practice,19
576 | War on terror; U.S. government claims that Osama Bin Laden is personally preparing an attack on U.S. soil.,16
577 | Dentists notice a rise in Meth use,3
578 | F.B.I investigates Democratic campaign money,20
579 | Haitian crisis; seizure of second-largest city by rebels.,19
580 | Google buys out YouTube,15
581 | some refusing to pay taxes,1
582 | air traffic controllers,10
583 | california governor and abortion,2
584 | Democrats hopeful of success in 2006 elections,20
585 | Photo: victims of 9/11 honored,16
586 | Female Condom: Important Weapon against AIDS,3
587 | white house says prewar Iraq intelligence was flawed,16
588 | Photo-Hilary Clinton visits Harrient Tubman Learing Center in Harlem,20
589 | Improving political style of NYC Mayor Michael Bloomberg.,24
590 | Turkey planning to occupy iraq in the event of war to prevent refugee entrance,19
591 | "Corruption, spying, and leaks in Silicon Valley",15
592 | Elizabeth Dole; Red Cross; Presidential Campiagn,20
593 | NJ sprawl,14
594 | funeral for a sniper victim,12
595 | arab leaders to meet,19
596 | Enron scandal; plight of workers rendered unemployed by corporation's collapse.,15
597 | fringe parties in NY politics,24
598 | speculation that russian president is sick,19
599 | Tobacco industry to gain from settlement,3
600 | antiterror in europe,19
601 | Proposed Freedom center at ground zero,21
602 | Bush administration scaling back oil drilling in Gulf of Mexico,8
603 | South Korea wants longer range missiles,19
604 | recall probable in CA,24
605 | "Oklahoma City Bombing: trial, friend sticks to story in cross-examination",16
606 | Israel's Barak decides to quit politics,19
607 | indian politician,19
608 | plane crash,10
609 | Gore attacks Bush tax cut plan,1
610 | chinese trade bill,18
611 | astronomy satellite,17
612 | chaos in Liberia,19
613 | UN troops leaving haiti,19
614 | Indian economy,19
615 | Federal budget,1
616 | concern over doctor/investor relationships,3
617 | Suspects in Madrid attacks blow themselves up after being surrounded by police.,19
618 | hand recounts in florida can continue,20
619 | Milosevic trial will test international law,19
620 | Health expenditures in the United States as a proportion of GDP.,3
621 | Tobacco companies selling cigarettes to traders to funnel them into black markets,12
622 | Tough sentence for former WorldCom chairman,12
623 | gore campaign,20
624 | Free Trade Zone of the Americas given the go-ahead,18
625 | Former Private Secretary reveals information about deals made between narcotics traffickers and political leaders,12
626 | Al Qaeda defector used by prosecution in terrorism cases.,16
627 | 2000 campaign-cheney chosen,20
628 | Clinton Impeachment Trial,20
629 | religious practices vs. health concerns; New York City politics,2
630 | American spy plane lands in China after crashing with Chinese fighter jet,16
631 | Auto industry; reduce SUV emissions,7
632 | Senate and White House promoting measures that increase use of ethanol,8
633 | Promotion of John Edwards as running-mate of Kerry.,20
634 | Europeans debate US plan for UN involvement in Iraq,19
635 | E.P.A:air quality standards,7
636 | Abu Ghraib scandal; Bush apology combined with continued support for Rumsfeld.,19
637 | immigrants in suburbia,9
638 | inquiry into fraud by MCI,12
639 | Domestic surveillance,2
640 | chinese dissident sent to the US,19
641 | Bush's pick for secretary of defense,20
642 | Pataki barely breaking a sweat in race for second term as Governor,24
643 | New Jersey jail raid,12
644 | Kennedy relative sentenced in a murder trial,12
645 | criminal inquiry leads to raid in Marine unit,16
646 | takeover battle for sprint,15
647 | German leader warns about iraq war,16
648 | auto industry mileage plan,10
649 | Death of Arafat; analysis.,19
650 | State of the stock markets; analysis of signs of recovery.,1
651 | Standoff at Falluja; discussion in American command whether U.S. should pull out of the city.,16
652 | refugees in Kosovo need food,19
653 | CIA and FBI agree to truce,16
654 | panel says US should require insurance to pay for vaccines,3
655 | Martha Stewart trial; dismissal of most serious charge.,12
656 | cocaine fight in columbia,12
657 | Abu Ghraib scandal; order by U.S. commander in Iraq to halt use of all coercive interrogation techniques.,19
658 | jack kemp,20
659 | hummers,10
660 | Stalemated election,20
661 | Deaf Mexican immigrants held captive in North Carolina,9
662 | US near a trade deal with china,18
663 | Enron scandal,15
664 | UN resolution on Iraq's future,16
665 | Real estate broker completes largest transaction in U.S. history and buys property along East River,15
666 | U.S. has not been tracking weapons intended for Iraqi security forces,16
667 | clinton in bosnia,19
668 | reactions to the start of the Iraq war in America,16
669 | Bush and Cheney reaching out to democrats,20
670 | Hezbollah works to rebuild Lebanon to win popular support,19
671 | "peer-to-peer services being used for pornography, not just music",17
672 | in NYC.,21
673 | undecided washington sentate race,20
674 | France reveals evidence against Nazi war criminal to stand trial,19
675 | mentally ill health care,3
676 | Special report on use of wireless technology in America.,17
677 | 2000 campaign; bush after college,20
678 | first soldier killed in Afghanistan buried,16
679 | Washington State voters face affirmative action measure,2
680 | egyptians joining the palestinian cause,19
681 | IRA to help disarm ulster fighters,19
682 | Hussein rallies his troops,19
683 | bob dole challenges clinton's ethics,20
684 | Remembering Dr. Martin Luther King Jr,2
685 | Increasing nuclear proliferation despite diplomatic agreements; damage wrought by export of Pakistani nuclear expertise.,16
686 | Republicans questioning of Bush's ban on stem-cell research.,3
687 | new subpoenas over campaign finance violations,20
688 | Chinese trade bill,18
689 | Veteran health care,16
690 | women in India changing their roles,19
691 | new communications law,17
692 | poll on opinions of new yorkers,24
693 | Federal investigators link deaths to same suspect,12
694 | shootout in the bronx,12
695 | technological breakthrough in computing,17
696 | Revelation of prescient pre-war report about danger of post-Saddam Iraqi civil war; administration attempts to minimize its significance.,16
697 | Iraq's oil industry,19
698 | Britain: Prime Minister Campaigns,19
699 | Iraqi war casualties are up sharply,16
700 | US donations to jewish settlers,19
701 | "Bush announces government will take stronger role in airline security, will station troops",10
702 | hospital worker died of anthrax inhalation,16
703 | Shooting at the Empire State Building; the gunman,12
704 | limits to Putin's power,19
705 | graves uncovered in Sri Lanka have not lead to charges,19
706 | "Indian computer security is bad, files at risk",21
707 | fossils of second largest dinosaur found in Egypt,19
708 | naturalized citizens in NY,9
709 | NATO: Russia agrees to alliance expansion,16
710 | Virginia offer of scholarships to black students denied access to high school during segregation wars.,2
711 | greenspan says the economy is good,1
712 | FBI agent charged with spying for russia,16
713 | California plan for large-scale cutbacks in greenhouse-gas emissions.,7
714 | Microsoft antitrust case,15
715 | House approved bill to turn airport security to government,10
716 | irish peace referendum,19
717 | Trinity College and the revival of Hartford,6
718 | Clinton heart surgery.,20
719 | online prescription drug sales,3
720 | Sotheby's chairman convicted of price-fixing,12
721 | Key role of 9/11 widows in formation and activities of the 9/11 Comission.,16
722 | Rice reviews progress in Iraq; rejects exit strategy,16
723 | Al Qaeda in Karachi,16
724 | China to Protect Private Property Rights; Boon to Entreprenuerial Class,19
725 | welfare reform in Italy,19
726 | More on the Clinton inauguration,20
727 | South Carolina campaign,20
728 | Employment trends in the U.S.,1
729 | "stock slump hurts 401Ks, makes many rethink retirement plans",15
730 | European opinion,19
731 | Clinton denies Paula Jones' accusations,20
732 | exit polling from the presidential election,20
733 | U.S. shift to support cease fire in Lebanon started frantic round of negotiations in U.N.,19
734 | senate approves online contracts,15
735 | tax plans in campaign,1
736 | Bush calls for end to loans to buy stock,15
737 | Panama takes control of the Panama canal,19
738 | halliburton overcharging for fuel,16
739 | Foul Air and Water Part of Cost of Boom in China's Exports,19
740 | housing conditions for the poor,14
741 | drug review process,3
742 | Gates to create foundation to bring internet into public libraries,6
743 | Cubans still struggling to make ends meet,19
744 | Supreme Court nomination; partisan dispute,20
745 | Bali bombing,19
746 | Doctors' pay regains ground despite the effects of HMOs,3
747 | suicide scandal in Germany,19
748 | Cruise lines pay little income tax because of loophole in tax law,10
749 | More Palestinian and Israeli struggle,19
750 | Japanese elections,19
751 | burning of chemical weapons,7
752 | "New York law holding car owners liable for car accidents, whatever the driver, limiting car-leasing in the State.",10
753 | "weather data predicts years of frequent, stronger hurricanes",17
754 | tobacco settlement money held up by new york politics,3
755 | FTC ruling on doctors to let them band together,3
756 | israel and lebanon relations,19
757 | NYC Chancellor cuts school budget by cutting program spending,6
758 | I.B.M. guilty of selling advanced computers to Russian nuclear weapons laboratory,12
759 | Hugo Chavez opposition,19
760 | heightened terror alert,16
761 | Meningitis Epidemic in West Africa,19
762 | Iraqi constitution to be voted on,19
763 | uprising in Ivory Coast,19
764 | Research showing that aspirin use can help prevent breast cancer.,3
765 | more questions arising over Clinton pardons,20
766 | Prosecutors stop Haitian murder suspect's efforts to leave country,12
767 | fraud claim in Iraqi election,19
768 | Businesses use people's YouTube and MySpace videos as free advertising,17
769 | Cleanliness takes a back seat to financial survival in airline industry,10
770 | scores of top students rise while those of average students decline on reading tests,6
771 | mideast violence,19
772 | applying for college,6
773 | Israel requests shipment of U.S. artillery rockets,16
774 | "Bush threatens to veto Senate's patients' bill of rights, Democrats say he'll have to accept it",3
775 | Difficulties for NYC mayor Michael Bloomberg in playing host to the Republican convention.,20
776 | states propose reducing medicaid,3
777 | unauthorized wiretaps,2
778 | house races,20
779 | debate on war strategy disappears after advance on Baghdad,16
780 | British bracing for Bush protesters,19
781 | Bush rejects a quick pullout from Iraq,16
782 | interrogation of terror suspects,16
783 | NYC to pay overtime to police and fire chiefs despite concerns,12
784 | "France: parliament elections, little mention of economic plan",19
785 | campigner for US senate avoids SEC investigation,20
786 | Declining energy prices has caused the U.S. to give up all gains made in conserving energy,8
787 | first Latino becomes Los Angeles mayor in more than a century,24
788 | "New York City Budget, tax cuts",24
789 | Young blacks link tobacco use to marijuana,3
790 | U.S. to restore relations with Libya,19
791 | Bolivian Leader in Exile After Efforts to Eradicate Coca,19
792 | Chinese backlash against closing of news journal; censorship,19
793 | use of carbon fuel declining,7
794 | Sky scrapers advertising value,15
795 | Europeans begin to fear growing Muslim minorities in their countries,19
796 | bombing in jerusalem,19
797 | Clinton urges bipartisanship to get budget negotiations moving again,1
798 | Bush prepares for war,16
799 | Sunni militia force Shiite bakeries in Baghdad to close,19
800 | us peace efforts in the middle east,19
801 | Pressure grows on GOP House leadership over Foley scandal,20
802 | new organ transplant strategies,3
803 | Bankers Trust Company admits to diverting money to enhance financial performance,12
804 | Lebanon's ex-premier killed in car bomb attack; Syrian influence in Lebanon,19
805 | Roberts Confirmation hearings,20
806 | 3 rich kids moved around after mother kills their father,12
807 | copyright law changes,15
808 | Bush mideast speech,19
809 | auto industry agrees to design changes to improve safety,10
810 | soldier leaving for iraq,16
811 | "witness accuses former boss for price-fixing between Sotheby's, Christy's",12
812 | Law Enforcement fears that domestic terrorist attacks are linked by white supremacists,12
813 | Controversy over acquisition of U.S. ports by state-owned Middle-Eastern countries.,21
814 | White house dealings with enron,20
815 | wealthy taking year off before starting college,6
816 | Debate over who will pay for repairs of beach erosion in New Jersey,7
817 | 1996 Election; public opinion polls,20
818 | Israeli commandos raid Hezbollah stronghold despite truce,19
819 | Stricter elementary school standards,6
820 | Questionable legal practices in Checnya highlighted in torture of woman accused of adultery,19
821 | Saddam Hussein; Life in Iraq,19
822 | re-designed station wagons enter the auto market,10
823 | US tells citizens in India to leave,19
824 | "Internet message boards allows company employees to vent, sometimes ugly, conversations",5
825 | europe bans british beef,19
826 | priest abuse scandal,12
827 | terror suspect,16
828 | Iran blamed for the killing of Iranian dissidents in Germany,19
829 | Supreme court rules on recruiting at universities,16
830 | India Genral Election; Congress Party Losing Power,19
831 | Generic AIDS Drug makers want to sell in South Africa,19
832 | Taiwan election,19
833 | Juveniles punished for killing 5 people in school shooting,6
834 | calling up reserves in the iraq war,16
835 | Murders of women along the Mexican border,19
836 | the political future of Indonesia,19
837 | Clintom impeachment trial,20
838 | Fish market in NYC closing,15
839 | elderly people in philadelphia,3
840 | Bush won't continue plan to rid of weapons with plutonium,16
841 | Al Sharpton's bid for the Presidency,20
842 | kennedy and castro,19
843 | Markets Surge after Investor concers about inflation eased,1
844 | NJ doctors protest high insurance costs,3
845 | Bush seeks to expand NAFTA throughout Central and South America,18
846 | Bombings in London,19
847 | communications equipment maker announces lay-offs,5
848 | The difficuluties of scheduling a war; Olympics; Islamic holidays,16
849 | New York traffic court,12
850 | "Russian President fired Prime Minister and appointed a former KGB officer, Vladmir Putin",19
851 | conjoined twins separated,3
852 | Livestock testing at state fairs,4
853 | NJ budget problems,24
854 | NJ troopers using hotel staffs to stop drug smugglers,12
855 | Iraqi Prime Minister denounces Israeli attacks on Lebanon,19
856 | medicare-prescription drug benefits,3
857 | north korea has access to plutonium,16
858 | Comcast bid for Disney; probable strategy of Disney leader to counter it.,15
859 | surrogate mothers have babies for gay couples,2
860 | Civilian death toll in Iraq reaches new high,16
861 | culture war in Israel,19
862 | US plans for a palestinian state,19
863 | Guilty verdict on prominent investment banker.,12
864 | mourning a school shooting in scotland,19
865 | US attacks iraq,16
866 | Tactics of American unit against Shiite militia of Moqtada al-Sadr.,16
867 | Red Cross criticizes Guantanamo Bay,19
868 | Israeli Prime Minister; Isreali cabinet,19
869 | virus on a cruise ship,3
870 | Russian president's plan to tighten executive control over the legislative branch and local governents; stated rationale in terrorist threat.,19
871 | Chief executives see 22 percent raise in salary in last decade,5
872 | political terrorists kill Cambodians at democratic rally,19
873 | John Kerry and the Cambodian Swift Boat incident,20
874 | "Health care costs, Medicare",3
875 | recount analysis,20
876 | poll find NYC split over mayoral candidates,24
877 | Democratic primary; further articles.,20
878 | Human embryo cloning in South Korea; significance.,3
879 | NYC school board dispute,6
880 | American offensive against Shiite militia.,16
881 | supreme court ends a ban on ads for casino gambling,15
882 | Abortion doctor eulogized as killer is sought,2
883 | US reliance on Saudi oil,8
884 | School uniforms in Public School 7 in New York,6
885 | bob dole candidacy,20
886 | Bush talks about AIDS in South Africa,19
887 | Army veteran accuses her top-ranked Army boss with sexual assault,16
888 | nytimes neediest cases charity,9
889 | Abu Ghraib prisoner abuse scandal; interrogation unit alleges having supplied early reports to superior officers.,19
890 | big landlord,14
891 | Mad Cow disease in the US,4
892 | Reminder that standard time has resumed,15
893 | Iranian politics,19
894 | Afghani president's description of private armies as the principal threat to his country.,19
895 | mergers in europe,19
896 | Isreali curfew in nablus,19
897 | Immigration debate in Congress,9
898 | German election results,19
899 | man had hand cut off by Taliban on charge of theft,19
900 | israel and lebanon occupation,19
901 | Merging of American Companies,15
902 | Profile of American contractor beheaded in Iraq.,16
903 | Theory of gene flaw proposed to explain evolution of human beings.,17
904 | Life of John Roberts,20
905 | Inefficient evidence to look into Interior Secretary in his role in denial of Indian casino application,21
906 | peace in Angola,19
907 | bad week for Italian PM,19
908 | Senate rejects rival proposal to campaign finance law,20
909 | radio communication in NYC,12
910 | Re-development of WTO site.,21
911 | patents on drugs end,3
912 | nyc labor disputes,5
913 | change in laws causes deportations,9
914 | Chinese Poor Struggle with HIV/AIDS,19
915 | smallpox vaccine,16
916 | High cost of potentially effective anti-cancer drug,3
917 | airport security,10
918 | Police protest over pay,12
919 | Internet message boards facilitate dialogue on race,17
920 | negotiations on interim Afghan government stall,16
921 | captured spy chief returns to Peru,19
922 | Sheriffs department in columbine under scrutiny,6
923 | Supreme Court divided on interpretation of Clean Water Act,7
924 | Senate confirms Ashcroft,20
925 | campaign finance limits,20
926 | Chicago orders big retail stores to raise minimum wage,5
927 | Army recruiting helped by bad economy,16
928 | Federal Reserve considers cutting interest rates,1
929 | Conservative Christian politician hurt by ties to Jack Abramoff lobbying scandal,20
930 | Bush's Social Security reform plan,13
931 | osteoporosis gene found,3
932 | defect in heart devices,3
933 | US dispatches agents to Germany to uncover terrorist network,16
934 | bacteria in chickens,4
935 | people who live in homes where BTK murdered,12
936 | affirmative action poll,2
937 | suicide attack in Israel,19
938 | Space exploration; infrequent success therein and high costs thereof.,17
939 | politicians on parade,20
940 | whitewater scandal trial,20
941 | AT&T's withdrawal from the residential-phone business.,17
942 | ground zero cleanup nears the end,16
943 | large purchase of woods and wetlands for public use and preservation,7
944 | Danger to VA mental health services from large estimated numbers of future patients among veterans of Iraq war.,16
945 | Dow Chemical Company knowingly deceived women on breast implants,3
946 | Clinton scanal fallout,20
947 | lobbyists sway legislators on Chinese trade policy,18
948 | Standard time resumes,15
949 | second day of blackout continues as company makes accusations,8
950 | bob dole attacks clinton in the presidential debate,20
951 | PHOTO: Senator John Glenn at Brooks Air Force Base,20
952 | New York's hospitals receive Federal aid package,3
953 | East Germany's economic revival,19
954 | shady democratic party fund raising,20
955 | Bush aides defending budget in Congress,1
956 | New York State Legislature approves early retirement for most experienced teachers,6
957 | "fireworks explosions in Lima, Peru kill hundreds",19
958 | House Democrats choose Steny Hoyer as House majority leader over Nancy Pelosi's choice,20
959 | McCain endorses Bush,20
960 | fighting in the mideast,19
961 | reform party presidential candidate,20
962 | Iraq; Sadam Hussein; split in Arab nations,19
963 | inquiry into columbia breakup,17
964 | barak agrees to halt settlements in the west bank,19
965 | Comcast's Disney bid; rejection by Disney.,15
966 | welfare and the states,13
967 | NY mayor race,24
968 | Bush challenges Mideast to try democracy,16
969 | Swiss failed to payback Nazi payments after WWII,19
970 | North Korea fires ballistic missile over Japan,19
971 | US soldiers killed in Iraq,16
972 | Medpartners/Mullikin to Buy Caremark International; Creation of A Large Physician Management Company,15
973 | guilty verdict in a wendy's murder case,12
974 | Bosnian forces pull back troops and weapons from front line,19
975 | Iowa house race,20
976 | terrorism,19
977 | chief fundraiser for Senator may have received illegal campaign contributions,20
978 | Iraqis want more power to control themselves,16
979 | Prosecution of NYC school superintendent charged with $1m embezzlementt; difficulties in detecting embezzlement in districts with high per-student spending.,12
980 | "FDA proposes ban on importing blood from Europe, upsetting European suppliers",3
981 | indonesian cleric falls ill,19
982 | "Japan's Economy in Debt, despite Japanese ability to save",19
983 | GAO files suit over cheney energy meetings,20
984 | "Oxycontin sales grew, but at a cost",3
985 | costs on loans,15
986 | Bristol Myers Squibb to Yield Patent Rights Over Aids Drugs in Africa,19
987 | china trade deal,18
988 | "Former United States Housing Secretary pleads guilty, lied to FBI",20
989 | Middle East politics; Ariel Sharon,19
990 | Governor-elect Eliot Spitzer likely to ask for ouster of State Comptroller Alan Hevesi,24
991 | domestic violence courts in NY,12
992 | China Bans Text Messaging/E-mail in Protests Against Japan,19
993 | Lacking relief and repair effort in Tsunami ravaged Indonesia,19
994 | Bush declared winner by Florida,20
995 | Palestinians and Israels hold off peace talks despite Madeleine Albright's visit,19
996 | stock market,1
997 | china sending missiles to iran,16
998 | Bomb attacks in Iraq; U.S. soldiers killed.,16
999 | Tora Bora offers many hiding places for Al Qaeda fighters,16
1000 | Russia and China willing to help curb Iran's nuclear ambitions,16
1001 |
--------------------------------------------------------------------------------