├── .gitignore ├── LICENSE ├── README.md ├── config ├── config_keyee_ace05e.json ├── config_keyee_ace05ep.json └── config_keyee_ere.json ├── keyee ├── dataset.py ├── eval.py ├── generate_data.py ├── model.py ├── template_ace.py ├── template_base.py ├── template_ere.py ├── train.py └── utils.py ├── preprocessing ├── get_doc_statistics.py ├── process_ace05e.py ├── process_ace05ep.py ├── process_ere.py ├── split_dataset.py └── split_dataset_dygie.py ├── requirements.txt ├── resource ├── low_resource_split │ ├── ace05e │ │ ├── doc_list_001 │ │ ├── doc_list_002 │ │ ├── doc_list_003 │ │ ├── doc_list_005 │ │ ├── doc_list_010 │ │ ├── doc_list_020 │ │ ├── doc_list_030 │ │ ├── doc_list_050 │ │ └── doc_list_075 │ ├── ace05ep │ │ ├── doc_list_001 │ │ ├── doc_list_002 │ │ ├── doc_list_003 │ │ ├── doc_list_005 │ │ ├── doc_list_010 │ │ ├── doc_list_020 │ │ ├── doc_list_030 │ │ ├── doc_list_050 │ │ └── doc_list_075 │ └── ere │ │ ├── doc_list_001 │ │ ├── doc_list_002 │ │ ├── doc_list_003 │ │ ├── doc_list_005 │ │ ├── doc_list_010 │ │ ├── doc_list_020 │ │ ├── doc_list_030 │ │ ├── doc_list_050 │ │ └── doc_list_075 └── splits │ ├── ACE05-EP │ ├── dev.doc.txt │ ├── test.doc.txt │ ├── train.doc.txt │ └── train_more.doc.txt │ └── ERE-EN │ ├── dev.doc.txt │ ├── test.doc.txt │ └── train.doc.txt └── scripts ├── eval.sh ├── process_ace05e.sh ├── process_ace05ep.sh ├── process_ere.sh └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .vscode/ 163 | data/ 164 | processed_data/ 165 | output/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KeyEE: Enhancing Low-resource Generative Event Extraction with Auxiliary Keyword Sub-Prompt 2 | Code repository for paper "KeyEE: Enhancing Low-resource Generative Event Extraction with Auxiliary Keyword Sub-Prompt". 3 | 4 | Our code is mainly based on [DEGREE](https://github.com/PlusLabNLP/DEGREE). We deeply thank the contribution from the authors of the paper. 5 | 6 | ## Environment 7 | - torch==1.8.0 8 | - transformers==4.25.1 9 | - protobuf==3.20.3 10 | - tensorboardx==2.6 11 | - lxml==4.9.2 12 | - beautifulsoup4==4.11.2 13 | - bs4==0.0.1 14 | - stanza==1.4.2 15 | - ipdb==0.13.11 16 | 17 | ```bash 18 | conda create -n keyee python=3.8 19 | conda activate keyee 20 | python -m pip install -r requirements.txt 21 | ``` 22 | 23 | ## Datasets 24 | 25 | We support `ace05e`, `ace05ep`, and `ere`. 
26 | 27 | ### Preprocessing 28 | Our preprocessing mainly adapts [DEGREE's](https://github.com/PlusLabNLP/DEGREE) and [OneIE's](https://blender.cs.illinois.edu/software/oneie/) released scripts with minor modifications. We sincerely thank the authors of those works for their contributions. We recommend creating a new virtual environment for the data preprocessing. 29 | 30 | #### `ace05e` 31 | 1. Prepare data processed from [DyGIE++](https://github.com/dwadden/dygiepp#ace05-event) 32 | 2. Put the processed data into the folder `processed_data/ace05e_dygieppformat` 33 | 3. Run `./scripts/process_ace05e.sh` 34 | 35 | #### `ace05ep` 36 | 1. Download ACE data from [LDC](https://catalog.ldc.upenn.edu/LDC2006T06) 37 | 2. Run `./scripts/process_ace05ep.sh` 38 | 39 | #### `ere` 40 | 1. Download ERE English data from LDC, specifically, "LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2", "LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2", "LDC2015E78_DEFT_Rich_ERE_Chinese_and_English_Parallel_Annotation_V2" 41 | 2. Collect all of this data under one directory with the following layout: 42 | ``` 43 | ERE 44 | ├── LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2 45 | │ ├── data 46 | │ ├── docs 47 | │ └── ... 48 | ├── LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2 49 | │ ├── data 50 | │ ├── docs 51 | │ └── ... 52 | └── LDC2015E78_DEFT_Rich_ERE_Chinese_and_English_Parallel_Annotation_V2 53 | ├── data 54 | ├── docs 55 | └── ... 56 | ``` 57 | 3. Run `./scripts/process_ere.sh` 58 | 59 | The above scripts will generate processed data (including the full training set and the low-resource sets) in `./processed_data`. 60 | 61 | 62 | ## Training 63 | 64 | All training configurations are listed in the `config` directory; check your configuration before running experiments. 
65 | 66 | Run `./scripts/train.sh` or use the following commands: 67 | 68 | Generate data 69 | ```bash 70 | python keyee/generate_data.py -c config/config_keyee_ace05e.json 71 | ``` 72 | 73 | Train 74 | ```bash 75 | python keyee/train.py -c config/config_keyee_ace05e.json 76 | ``` 77 | 78 | ## Evaluation 79 | 80 | To reduce training time, during the training phase we negatively sampled the sentences that were missing a certain event type, which means we did not use the full dev and test datasets during the training stage. It is therefore important to run an extra evaluation on the whole test dataset. 81 | 82 | To do this, you can run `./scripts/eval.sh` or use the following commands: 83 | 84 | ```bash 85 | python keyee/eval.py \ 86 | -c config/config_keyee_ace05e.json \ 87 | -e $OUTPUT_DIR/best_model.mdl \ 88 | --eval_batch_size 16 \ 89 | --write_file $OUTPUT_DIR/eval_result.json \ 90 | --no_dev 91 | ``` 92 | 93 | ## Citation 94 | 95 | If you find the code useful in your research, please consider citing our paper. 
96 | 97 | @ARTICLE{BDMA2024_KeyEE, 98 | author = {Duan, Junwen and Liao, Xincheng and An, Ying and Wang, Jianxin}, 99 | journal = {Big Data Mining and Analytics}, 100 | title = {KeyEE: Enhancing Low-Resource Generative Event Extraction with Auxiliary Keyword Sub-Prompt}, 101 | year = {2024}, 102 | volume = {7}, 103 | number = {2}, 104 | pages = {547-560}, 105 | doi = {10.26599/BDMA.2023.9020036} 106 | } 107 | 108 | 109 | ## Contact 110 | 111 | If you have any issues, please contact Xincheng Liao (ostars@csu.edu.cn). -------------------------------------------------------------------------------- /config/config_keyee_ace05e.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": "ace05e", 3 | "gpu_device": 1, 4 | "seed": 42, 5 | "train_file": "./processed_data/ace05e_bart/train.w1.oneie.json", 6 | "dev_file": "./processed_data/ace05e_bart/dev.w1.oneie.json", 7 | "test_file": "./processed_data/ace05e_bart/test.w1.oneie.json", 8 | "finetune_dir": "./data/ace05e/", 9 | "train_finetune_file": "./data/ace05e/train_all.pkl", 10 | "dev_finetune_file": "./data/ace05e/dev_all.pkl", 11 | "test_finetune_file": "./data/ace05e/test_all.pkl", 12 | "keyword_train_finetune_file": "./data/ace05e/train_keywords_all.pkl", 13 | "keyword_dev_finetune_file": "./data/ace05e/dev_keywords_all.pkl", 14 | "keyword_test_finetune_file": "./data/ace05e/test_keywords_all.pkl", 15 | "vocab_file": "./data/ace05e/vocab.json", 16 | "output_dir": "./output/ace05e_high_resources/", 17 | "cache_dir": "/shared_data/pretrained_models/", 18 | "model_name": "facebook/bart-large", 19 | "input_style": ["event_type_sent", "template"], 20 | "output_style": ["trigger:sentence", "argument:sentence"], 21 | "n_negative": 15, 22 | "max_epoch": 45, 23 | "warmup_epoch": 5, 24 | "train_batch_size": 16, 25 | "eval_batch_size": 8, 26 | "accumulate_step": 1, 27 | "learning_rate": 1e-05, 28 | "weight_decay": 1e-05, 29 | "grad_clipping": 5.0, 30 | "beam_size": 1, 31 | 
"max_length": 250, 32 | "max_output_length": 100, 33 | "ignore_first_header": true 34 | } 35 | -------------------------------------------------------------------------------- /config/config_keyee_ace05ep.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": "ace05ep", 3 | "gpu_device": 1, 4 | "seed": 42, 5 | "train_file": "./processed_data/ace05ep_bart/train.w1.oneie.json", 6 | "dev_file": "./processed_data/ace05ep_bart/dev.w1.oneie.json", 7 | "test_file": "./processed_data/ace05ep_bart/test.w1.oneie.json", 8 | "finetune_dir": "./data/ace05ep/", 9 | "train_finetune_file": "./data/ace05ep/train_all.pkl", 10 | "dev_finetune_file": "./data/ace05ep/dev_all.pkl", 11 | "test_finetune_file": "./data/ace05ep/test_all.pkl", 12 | "keyword_train_finetune_file": "./data/ace05ep/train_keywords_all.pkl", 13 | "keyword_dev_finetune_file": "./data/ace05ep/dev_keywords_all.pkl", 14 | "keyword_test_finetune_file": "./data/ace05ep/test_keywords_all.pkl", 15 | "vocab_file": "./data/ace05ep/vocab.json", 16 | "output_dir": "./output/ace05ep_high_resources/", 17 | "cache_dir": "/home/xcliao/pretrained_models/", 18 | "model_name": "facebook/bart-large", 19 | "input_style": ["event_type_sent", "template"], 20 | "output_style": ["trigger:sentence", "argument:sentence"], 21 | "n_negative": 15, 22 | "max_epoch": 45, 23 | "warmup_epoch": 5, 24 | "train_batch_size": 24, 25 | "eval_batch_size": 8, 26 | "accumulate_step": 1, 27 | "learning_rate": 1e-05, 28 | "weight_decay": 1e-05, 29 | "grad_clipping": 5.0, 30 | "beam_size": 1, 31 | "max_length": 250, 32 | "max_output_length": 100, 33 | "ignore_first_header": true 34 | } 35 | -------------------------------------------------------------------------------- /config/config_keyee_ere.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": "ere", 3 | "gpu_device": 1, 4 | "seed": 42, 5 | "train_file": 
"./processed_data/ere_bart/train.w1.oneie.json", 6 | "dev_file": "./processed_data/ere_bart/dev.w1.oneie.json", 7 | "test_file": "./processed_data/ere_bart/test.w1.oneie.json", 8 | "finetune_dir": "./data/ere/", 9 | "train_finetune_file": "./data/ere/train_all.pkl", 10 | "dev_finetune_file": "./data/ere/dev_all.pkl", 11 | "test_finetune_file": "./data/ere/test_all.pkl", 12 | "keyword_train_finetune_file": "./data/ere/train_keywords_all.pkl", 13 | "keyword_dev_finetune_file": "./data/ere/dev_keywords_all.pkl", 14 | "keyword_test_finetune_file": "./data/ere/test_keywords_all.pkl", 15 | "vocab_file": "./data/ere/vocab.json", 16 | "output_dir": "./output/ere_high_resources/", 17 | "cache_dir": "/home/xcliao/pretrained_models/", 18 | "model_name": "facebook/bart-large", 19 | "input_style": ["event_type_sent", "template"], 20 | "output_style": ["trigger:sentence", "argument:sentence"], 21 | "n_negative": 15, 22 | "max_epoch": 45, 23 | "warmup_epoch": 5, 24 | "train_batch_size": 16, 25 | "eval_batch_size": 8, 26 | "accumulate_step": 2, 27 | "learning_rate": 1e-05, 28 | "weight_decay": 1e-05, 29 | "grad_clipping": 5.0, 30 | "beam_size": 1, 31 | "max_length": 375, 32 | "max_output_length": 110, 33 | "ignore_first_header": true 34 | } 35 | -------------------------------------------------------------------------------- /keyee/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json, logging, pickle 3 | from tqdm import tqdm 4 | from torch.utils.data import Dataset 5 | from collections import namedtuple 6 | from utils import pad_sequence_to_length 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | ee_instance_fields = ['doc_id', 'wnd_id', 'tokens', 'pieces', 'piece_idxs', 'token_lens', 'token_start_idxs', 'triggers', 'roles'] 11 | ee_batch_fields = ['tokens', 'pieces', 'piece_idxs', 'token_lens', 'token_start_idxs', 'triggers', 'roles', 'wnd_ids'] 12 | EEInstance = namedtuple('EEInstance', 
# Field names of the record types built by EEDataset: one record per loaded
# example (EEInstance) and one per collated batch (EEBatch).
ee_instance_fields = ['doc_id', 'wnd_id', 'tokens', 'pieces', 'piece_idxs', 'token_lens', 'token_start_idxs', 'triggers', 'roles']
ee_batch_fields = ['tokens', 'pieces', 'piece_idxs', 'token_lens', 'token_start_idxs', 'triggers', 'roles', 'wnd_ids']
# Every field defaults to None, so a record can be built from a subset of fields.
EEInstance = namedtuple('EEInstance', field_names=ee_instance_fields, defaults=[None] * len(ee_instance_fields))
EEBatch = namedtuple('EEBatch', field_names=ee_batch_fields, defaults=[None] * len(ee_batch_fields))

gen_batch_fields = ['input_text', 'target_text', 'enc_idxs', 'enc_attn', 'dec_idxs', 'dec_attn', 'lbl_idxs', 'raw_lbl_idxs', 'infos', 'enc_type_idxs', 'offsets']
GenBatch = namedtuple('GenBatch', field_names=gen_batch_fields, defaults=[None] * len(gen_batch_fields))


def remove_overlap_entities(entities):
    """Drop entity mentions that overlap a mention kept earlier in the list.

    Mentions are visited in list order. A mention is kept only if none of the
    positions in ``range(start, end)`` is already claimed by a kept mention;
    otherwise it is dropped and its ID is mapped to the kept mention that
    claims the last overlapping position of its span.

    :param entities (list): a list of entity mentions, each a dict with the
        keys 'id', 'start' and 'end'.
    :return: the kept entity mentions and a dict mapping dropped IDs to kept IDs.
    """
    # position -> id of the kept mention covering it. A dict instead of a
    # fixed 1000-slot list, so spans past position 1000 no longer raise.
    owner = {}
    kept = []
    id_map = {}
    for entity in entities:
        span = range(entity['start'], entity['end'])
        # Truthiness test mirrors the slot check: a falsy ID never claims a position.
        clashes = [owner[pos] for pos in span if owner.get(pos)]
        if clashes:
            id_map[entity['id']] = clashes[-1]
            continue
        kept.append(entity)
        for pos in span:
            owner[pos] = entity['id']
    return kept, id_map


def get_role_list(entities, events, id_map):
    """Build the sorted list of (trigger, argument) pairs for one example.

    Each item has the form
    ``((trigger start, trigger end, event type), (argument start, argument end, role))``.
    Argument entity IDs are first translated through ``id_map``. Within one
    event an entity contributes at most one item (the first argument that
    refers to it), which removes multi-role cases.

    :param entities (list): entity mentions with 'id', 'start' and 'end'.
    :param events (list): event mentions with 'trigger', 'event_type' and 'arguments'.
    :param id_map (dict): maps removed entity IDs to the IDs that replace them.
    :return: list of pairs sorted by trigger start, then argument start.
    :raises KeyError: if an argument refers to an ID that is not in ``entities``
        after translation through ``id_map``.
    """
    index_of = {entity['id']: pos for pos, entity in enumerate(entities)}
    role_list = []
    for event in events:
        seen = set()  # entity positions already used by this event
        for arg in event['arguments']:
            entity_id = arg['entity_id']
            pos = index_of[id_map.get(entity_id, entity_id)]
            if pos in seen:
                continue
            seen.add(pos)
            entity = entities[pos]
            trigger = event['trigger']
            role_list.append((
                (trigger['start'], trigger['end'], event['event_type']),
                (entity['start'], entity['end'], arg['role']),
            ))
    role_list.sort(key=lambda x: (x[0][0], x[1][0]))
    return role_list


class EEDataset(Dataset):
    """Event-extraction examples read from a file with one JSON object per line.

    Loading happens in the constructor. ``self.insts`` holds the parsed JSON
    objects that passed the length filter, ``self.data`` the corresponding
    ``EEInstance`` records, and ``self.gold_triggers`` / ``self.gold_roles``
    the per-example trigger and role lists.
    """

    def __init__(self, tokenizer, path, max_length=128, fair_compare=True):
        """Store the settings and load the file at ``path``.

        :param tokenizer: object providing ``convert_tokens_to_ids(pieces)``.
        :param path: path of the JSON-lines file to read.
        :param max_length: examples with more than this many pieces are skipped.
        :param fair_compare: if true, overlapping entity mentions are removed
            with ``remove_overlap_entities`` before roles are built.
        """
        self.tokenizer = tokenizer
        self.path = path
        self.data = []
        self.insts = []
        self.max_length = max_length
        self.fair_compare = fair_compare
        self.gold_triggers = []
        self.gold_roles = []
        self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    @property
    def event_type_set(self):
        """Set of all event types appearing in the loaded examples."""
        return {
            event['event_type']
            for inst in self.insts
            for event in inst['event_mentions']
        }

    @property
    def role_type_set(self):
        """Set of all argument roles appearing in the loaded examples."""
        type_set = set()
        for inst in self.insts:
            for event in inst['event_mentions']:
                type_set.update(arg['role'] for arg in event['arguments'])
        return type_set

    def load_data(self):
        """Read ``self.path`` and fill ``insts``, ``data`` and the gold lists.

        Examples whose 'pieces' list is longer than ``self.max_length`` are
        skipped. ``data`` and the gold lists are appended to, not reset.
        """
        with open(self.path, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
        self.insts = [
            inst for inst in map(json.loads, lines)
            if len(inst['pieces']) <= self.max_length
        ]

        for inst in tqdm(self.insts):
            instance = self._build_instance(inst)
            self.data.append(instance)
            self.gold_triggers.append(instance.triggers)
            self.gold_roles.append(instance.roles)

        logger.info(f'Loaded {len(self)}/{len(lines)} instances from {self.path}')

    def _build_instance(self, inst):
        """Convert one parsed JSON example into an ``EEInstance``.

        Sorts ``inst['event_mentions']`` in place by trigger start.

        :raises AssertionError: if the token lengths do not add up to the
            number of pieces, or if two events share the same trigger.
        """
        entities = inst['entity_mentions']
        if self.fair_compare:
            entities, entity_id_map = remove_overlap_entities(entities)
        else:
            entity_id_map = {}

        events = inst['event_mentions']
        events.sort(key=lambda x: x['trigger']['start'])

        token_lens = inst['token_lens']
        piece_idxs = self.tokenizer.convert_tokens_to_ids(inst['pieces'])
        assert sum(token_lens) == len(piece_idxs)

        triggers = [(e['trigger']['start'], e['trigger']['end'], e['event_type']) for e in events]
        assert len(triggers) == len(set(triggers))
        roles = get_role_list(entities, events, entity_id_map)

        # Running sum: offset of each token's first piece, plus the total at
        # the end (len(token_lens) + 1 entries), computed in a single pass.
        token_start_idxs = [0]
        for length in token_lens:
            token_start_idxs.append(token_start_idxs[-1] + length)

        return EEInstance(
            doc_id=inst['doc_id'],
            wnd_id=inst['wnd_id'],
            tokens=inst['tokens'],
            pieces=inst['pieces'],
            piece_idxs=piece_idxs,
            token_lens=token_lens,
            token_start_idxs=token_start_idxs,
            triggers=triggers,
            roles=roles,
        )

    def collate_fn(self, batch):
        """Gather each field of a list of ``EEInstance`` into per-field lists.

        :param batch: list of ``EEInstance`` records.
        :return: an ``EEBatch`` whose fields are plain Python lists in batch order.
        """
        return EEBatch(
            tokens=[inst.tokens for inst in batch],
            pieces=[inst.pieces for inst in batch],
            piece_idxs=[inst.piece_idxs for inst in batch],
            token_lens=[inst.token_lens for inst in batch],
            token_start_idxs=[inst.token_start_idxs for inst in batch],
            triggers=[inst.triggers for inst in batch],
            roles=[inst.roles for inst in batch],
            wnd_ids=[inst.wnd_id for inst in batch],
        )


class GenDataset(Dataset):
    def __init__(self, tokenizer, max_length, path, max_output_length=None, unseen_types=[], no_bos=False):
        self.tokenizer = tokenizer
        self.max_length = self.max_output_length = max_length
        if max_output_length is not None:
            self.max_output_length = max_output_length
        self.path = path
        self.no_bos = no_bos # if you use bart, then this should be False; if you use t5, then this should be True
        # NOTE: the remaining statements of __init__ and the other GenDataset methods follow below.
| self.data = [] 182 | self.load_data(unseen_types) 183 | # self.data = self.data[:100] # FOR DEBUG 184 | 185 | def __len__(self): 186 | return len(self.data) 187 | 188 | def __getitem__(self, item): 189 | return self.data[item] 190 | 191 | def load_data(self, unseen_types): 192 | with open(self.path, 'rb') as f: 193 | data = pickle.load(f) 194 | 195 | for l_in, l_out, l_info in zip(data['input'], data['target'], data['all']): 196 | if len(unseen_types) > 0: 197 | if isinstance(l_info, tuple): 198 | # instance base 199 | if l_info[1] in unseen_types: 200 | continue 201 | else: 202 | # trigger base, used in argument model 203 | if l_info['event type'] in unseen_types: 204 | continue 205 | self.data.append({ 206 | 'input': l_in, 207 | 'target': l_out, 208 | 'info': l_info 209 | }) 210 | logger.info(f'Loaded {len(self)} instances from {self.path}') 211 | 212 | def collate_fn(self, batch): 213 | input_text = [x['input'] for x in batch] 214 | target_text = [x['target'] for x in batch] 215 | 216 | # encoder inputs 217 | inputs = self.tokenizer(input_text, return_tensors='pt', padding=True, max_length=self.max_length) 218 | enc_idxs = inputs['input_ids'] 219 | enc_attn = inputs['attention_mask'] 220 | 221 | # decoder inputs 222 | targets = self.tokenizer(target_text, return_tensors='pt', padding=True, max_length=self.max_output_length) 223 | dec_idxs = targets['input_ids'] 224 | batch_size = dec_idxs.size(0) 225 | dec_idxs[:, 0] = self.tokenizer.eos_token_id 226 | dec_attn = targets['attention_mask'] 227 | 228 | # labels 229 | padding = torch.ones((batch_size, 1), dtype=torch.long) 230 | padding[:] = self.tokenizer.pad_token_id 231 | raw_lbl_idxs = torch.cat((dec_idxs[:, 1:], padding), dim=1) 232 | lbl_attn = torch.cat((dec_attn[:, 1:], torch.zeros((batch_size, 1), dtype=torch.long)), dim=1) 233 | lbl_idxs = raw_lbl_idxs.masked_fill(lbl_attn==0, -100) # ignore padding 234 | 235 | enc_idxs = enc_idxs.cuda() 236 | enc_attn = enc_attn.cuda() 237 | dec_idxs = dec_idxs.cuda() 
238 | dec_attn = dec_attn.cuda() 239 | raw_lbl_idxs = raw_lbl_idxs.cuda() 240 | lbl_idxs = lbl_idxs.cuda() 241 | 242 | return GenBatch( 243 | input_text=input_text, 244 | target_text=target_text, 245 | enc_idxs=enc_idxs, 246 | enc_attn=enc_attn, 247 | dec_idxs=dec_idxs, 248 | dec_attn=dec_attn, 249 | lbl_idxs=lbl_idxs, 250 | raw_lbl_idxs=raw_lbl_idxs, 251 | infos=[x['info'] for x in batch] 252 | ) -------------------------------------------------------------------------------- /keyee/generate_data.py: -------------------------------------------------------------------------------- 1 | import os, json, pickle, logging, pprint, random 2 | import numpy as np 3 | from tqdm import tqdm 4 | from dataset import EEDataset 5 | from argparse import ArgumentParser, Namespace 6 | from utils import generate_vocabs 7 | from transformers import AutoTokenizer 8 | from template_base import event_template_generator 9 | import ipdb 10 | 11 | # configuration 12 | parser = ArgumentParser() 13 | parser.add_argument('-c', '--config', required=True) 14 | args = parser.parse_args() 15 | with open(args.config) as fp: 16 | config = json.load(fp) 17 | config.update(args.__dict__) 18 | config = Namespace(**config) 19 | 20 | if config.dataset == "ace05e" or config.dataset == "ace05ep": 21 | import template_ace 22 | template_file = "template_ace" 23 | elif config.dataset == "ere": 24 | import template_ere 25 | template_file = "template_ere" 26 | 27 | # fix random seed 28 | random.seed(config.seed) 29 | np.random.seed(config.seed) 30 | 31 | # logger 32 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]') 33 | logger = logging.getLogger(__name__) 34 | logger.info(f"\n{pprint.pformat(vars(config), indent=4)}") 35 | 36 | def generate_data(data_set, vocab, config): 37 | inputs = [] 38 | targets = [] 39 | events = [] 40 | 41 | keyword_inputs = [] 42 | keyword_targets = [] 43 | keywords = [] 44 | 45 | def organize_data(data, config): 46 
| inputs = [] 47 | targets = [] 48 | infos = [] 49 | 50 | pos_data_ = [dt for dt in data if dt[3]] 51 | neg_data_ = [dt for dt in data if not dt[3]] 52 | np.random.shuffle(neg_data_) 53 | 54 | # data => (input_str, output_str, self.gold_event, gold_sample, self.event_type, self.tokens) 55 | for data_ in pos_data_: 56 | inputs.append(data_[0]) 57 | targets.append(data_[1]) 58 | infos.append((data_[2], data_[4], data_[5])) 59 | 60 | neg_data_ = neg_data_[:config.n_negative] 61 | for data_ in neg_data_: 62 | inputs.append(data_[0]) 63 | targets.append(data_[1]) 64 | infos.append((data_[2], data_[4], data_[5])) 65 | 66 | return inputs, targets, infos 67 | 68 | for data in tqdm(data_set.data): 69 | event_template = event_template_generator(template_file, data.tokens, data.triggers, data.roles, config.input_style, config.output_style, vocab, True) 70 | 71 | event_data, keyword_data = event_template.get_training_data() 72 | inputs_, targets_, events_ = organize_data(event_data, config) 73 | inputs.extend(inputs_) 74 | targets.extend(targets_) 75 | events.extend(events_) 76 | 77 | inputs_, targets_, keywords_ = organize_data(keyword_data, config) 78 | keyword_inputs.extend(inputs_) 79 | keyword_targets.extend(targets_) 80 | keywords.extend(keywords_) 81 | 82 | return inputs, targets, events, keyword_inputs, keyword_targets, keywords 83 | 84 | # check valid styles 85 | assert np.all([style in ['event_type', 'event_type_sent', 'static_keywords', 'template'] for style in config.input_style]) 86 | assert np.all([style in ['trigger:sentence', 'argument:sentence'] for style in config.output_style]) 87 | 88 | # tokenizer 89 | tokenizer = AutoTokenizer.from_pretrained(config.model_name, cache_dir=config.cache_dir) 90 | special_tokens = ['', '', '', '', ''] 91 | tokenizer.add_tokens(special_tokens) 92 | 93 | if not os.path.exists(config.finetune_dir): 94 | os.makedirs(config.finetune_dir) 95 | 96 | # load data 97 | train_set = EEDataset(tokenizer, config.train_file, 
max_length=config.max_length) 98 | dev_set = EEDataset(tokenizer, config.dev_file, max_length=config.max_length) 99 | test_set = EEDataset(tokenizer, config.test_file, max_length=config.max_length) 100 | vocab = generate_vocabs([train_set, dev_set, test_set]) 101 | 102 | # save vocabulary 103 | with open('{}/vocab.json'.format(config.finetune_dir), 'w') as f: 104 | json.dump(vocab, f, indent=4) 105 | 106 | # generate finetune data 107 | train_inputs, train_targets, train_events, train_k_inputs, train_k_targets, train_keywords = generate_data(train_set, vocab, config) 108 | logger.info(f"Generated {len(train_inputs)} training examples from {len(train_set)} instance") 109 | 110 | with open('{}/train_input.json'.format(config.finetune_dir), 'w') as f: 111 | json.dump(train_inputs, f, indent=4) 112 | 113 | with open('{}/train_target.json'.format(config.finetune_dir), 'w') as f: 114 | json.dump(train_targets, f, indent=4) 115 | 116 | with open('{}/train_all.pkl'.format(config.finetune_dir), 'wb') as f: 117 | pickle.dump({ 118 | 'input': train_inputs, 119 | 'target': train_targets, 120 | 'all': train_events 121 | }, f) 122 | 123 | with open(os.path.join(config.finetune_dir, 'train_keywords_input.json'), 'w') as f: 124 | json.dump(train_k_inputs, f, indent=4) 125 | 126 | with open(os.path.join(config.finetune_dir, 'train_keywords_target.json'), 'w') as f: 127 | json.dump(train_k_targets, f, indent=4) 128 | 129 | with open(os.path.join(config.finetune_dir, 'train_keywords_all.pkl'), 'wb') as f: 130 | pickle.dump({ 131 | 'input': train_k_inputs, 132 | 'target': train_k_targets, 133 | 'all': train_keywords 134 | }, f) 135 | 136 | dev_inputs, dev_targets, dev_events, dev_k_inputs, dev_k_targets, dev_keywords = generate_data(dev_set, vocab, config) 137 | logger.info(f"Generated {len(dev_inputs)} dev examples from {len(dev_set)} instance") 138 | 139 | with open('{}/dev_input.json'.format(config.finetune_dir), 'w') as f: 140 | json.dump(dev_inputs, f, indent=4) 141 | 142 | with 
open('{}/dev_target.json'.format(config.finetune_dir), 'w') as f: 143 | json.dump(dev_targets, f, indent=4) 144 | 145 | with open('{}/dev_all.pkl'.format(config.finetune_dir), 'wb') as f: 146 | pickle.dump({ 147 | 'input': dev_inputs, 148 | 'target': dev_targets, 149 | 'all': dev_events 150 | }, f) 151 | 152 | with open(os.path.join(config.finetune_dir, 'dev_keywords_input.json'), 'w') as f: 153 | json.dump(dev_k_inputs, f, indent=4) 154 | 155 | with open(os.path.join(config.finetune_dir, 'dev_keywords_target.json'), 'w') as f: 156 | json.dump(dev_k_targets, f, indent=4) 157 | 158 | with open(os.path.join(config.finetune_dir, 'dev_keywords_all.pkl'), 'wb') as f: 159 | pickle.dump({ 160 | 'input': dev_k_inputs, 161 | 'target': dev_k_targets, 162 | 'all': dev_keywords 163 | }, f) 164 | 165 | test_inputs, test_targets, test_events, test_k_inputs, test_k_targets, test_keywords = generate_data(test_set, vocab, config) 166 | logger.info(f"Generated {len(test_inputs)} test examples from {len(test_set)} instance") 167 | 168 | with open('{}/test_input.json'.format(config.finetune_dir), 'w') as f: 169 | json.dump(test_inputs, f, indent=4) 170 | 171 | with open('{}/test_target.json'.format(config.finetune_dir), 'w') as f: 172 | json.dump(test_targets, f, indent=4) 173 | 174 | with open('{}/test_all.pkl'.format(config.finetune_dir), 'wb') as f: 175 | pickle.dump({ 176 | 'input': test_inputs, 177 | 'target': test_targets, 178 | 'all': test_events 179 | }, f) 180 | 181 | with open(os.path.join(config.finetune_dir, 'test_keywords_input.json'), 'w') as f: 182 | json.dump(test_k_inputs, f, indent=4) 183 | 184 | with open(os.path.join(config.finetune_dir, 'test_keywords_target.json'), 'w') as f: 185 | json.dump(test_k_targets, f, indent=4) 186 | 187 | with open(os.path.join(config.finetune_dir, 'test_keywords_all.pkl'), 'wb') as f: 188 | pickle.dump({ 189 | 'input': test_k_inputs, 190 | 'target': test_k_targets, 191 | 'all': test_keywords 192 | }, f) 
class GenerativeModel(nn.Module):
    """Wrapper around a pre-trained generation model and its tokenizer."""

    def __init__(self, config, tokenizer):
        """Load the pre-trained model named by ``config.model_name``.

        The embedding matrix is resized to ``len(tokenizer)`` so tokens added
        to the tokenizer have embeddings.
        """
        super().__init__()
        self.tokenizer = tokenizer
        logger.info(f'Loading pre-trained model {config.model_name}')
        self.model_config = AutoConfig.from_pretrained(config.model_name, cache_dir=config.cache_dir)
        self.model = AutoModelForPreTraining.from_pretrained(config.model_name, cache_dir=config.cache_dir, config=self.model_config)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def forward(self, batch):
        """Return the training loss of the wrapped model for ``batch``.

        ``batch`` must expose ``enc_idxs``, ``enc_attn``, ``dec_idxs``,
        ``dec_attn`` and ``lbl_idxs``.
        """
        outputs = self.model(input_ids=batch.enc_idxs,
                             attention_mask=batch.enc_attn,
                             decoder_input_ids=batch.dec_idxs,
                             decoder_attention_mask=batch.dec_attn,
                             labels=batch.lbl_idxs,
                             return_dict=True)

        loss = outputs['loss']

        return loss

    def predict(self, batch, num_beams=4, max_length=50):
        """Generate and decode one output string per example in ``batch``.

        Generation runs in eval mode without gradients.  The train/eval mode
        the module had before the call is restored afterwards (previously the
        module was always switched to training mode, which silently undid a
        caller's ``model.eval()``).

        Returns:
            list of decoded strings, in batch order.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                outputs = self.model.generate(input_ids=batch.enc_idxs,
                                              attention_mask=batch.enc_attn,
                                              num_beams=num_beams,
                                              max_length=max_length)

            final_output = []
            for bid in range(len(batch.enc_idxs)):
                output_sentence = self.tokenizer.decode(outputs[bid], skip_special_tokens=True, clean_up_tokenization_spaces=True)
                final_output.append(output_sentence)
        finally:
            # Restore the caller's mode even if generation fails.
            self.train(was_training)

        return final_output
from torch.utils.data import DataLoader 5 | from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup 6 | from model import GenerativeModel 7 | from dataset import GenDataset 8 | from utils import Summarizer, compute_f1 9 | from argparse import ArgumentParser, Namespace 10 | import ipdb 11 | 12 | # configuration 13 | parser = ArgumentParser() 14 | parser.add_argument('-c', '--config', required=True) 15 | args = parser.parse_args() 16 | with open(args.config) as fp: 17 | config = json.load(fp) 18 | config.update(args.__dict__) 19 | config = Namespace(**config) 20 | 21 | # fix random seed 22 | np.random.seed(config.seed) 23 | torch.manual_seed(config.seed) 24 | torch.backends.cudnn.enabled = False 25 | 26 | # logger and summarizer 27 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 28 | output_dir = os.path.join(config.output_dir, timestamp) 29 | if not os.path.exists(output_dir): 30 | os.makedirs(output_dir) 31 | log_path = os.path.join(output_dir, "train.log") 32 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]', 33 | handlers=[logging.FileHandler(os.path.join(output_dir, "train.log")), logging.StreamHandler()]) 34 | logger = logging.getLogger(__name__) 35 | logger.info(f"\n{pprint.pformat(vars(config), indent=4)}") 36 | summarizer = Summarizer(output_dir) 37 | 38 | # set GPU device 39 | torch.cuda.set_device(config.gpu_device) 40 | 41 | # check valid styles 42 | assert np.all([style in ['event_type', 'event_type_sent', 'static_keywords', 'template'] for style in config.input_style]) 43 | assert np.all([style in ['trigger:sentence', 'argument:sentence'] for style in config.output_style]) 44 | 45 | # output 46 | with open(os.path.join(output_dir, 'config.json'), 'w') as fp: 47 | json.dump(vars(config), fp, indent=4) 48 | best_model_path = os.path.join(output_dir, 'best_model.mdl') 49 | dev_prediction_path = os.path.join(output_dir, 'pred.dev.json') 50 | 
test_prediction_path = os.path.join(output_dir, 'pred.test.json') 51 | dev_keyword_prediction_path = os.path.join(output_dir, 'pred.keyword.dev.json') 52 | test_keyword_prediction_path = os.path.join(output_dir, 'pred.keyword.test.json') 53 | 54 | # tokenizer 55 | tokenizer = AutoTokenizer.from_pretrained(config.model_name, cache_dir=config.cache_dir) 56 | special_tokens = ['', '', '', '', ''] 57 | tokenizer.add_tokens(special_tokens) 58 | 59 | # load data 60 | train_set = GenDataset(tokenizer, config.max_length, config.train_finetune_file, config.max_output_length) 61 | dev_set = GenDataset(tokenizer, config.max_length, config.dev_finetune_file, config.max_output_length) 62 | test_set = GenDataset(tokenizer, config.max_length, config.test_finetune_file, config.max_output_length) 63 | keyword_train_set = GenDataset(tokenizer, config.max_length, config.keyword_train_finetune_file, config.max_output_length) 64 | keyword_dev_set = GenDataset(tokenizer, config.max_length, config.keyword_dev_finetune_file, config.max_output_length) 65 | keyword_test_set = GenDataset(tokenizer, config.max_length, config.keyword_test_finetune_file, config.max_output_length) 66 | train_batch_num = len(train_set) // config.train_batch_size + (len(train_set) % config.train_batch_size != 0) 67 | dev_batch_num = len(dev_set) // config.eval_batch_size + (len(dev_set) % config.eval_batch_size != 0) 68 | test_batch_num = len(test_set) // config.eval_batch_size + (len(test_set) % config.eval_batch_size != 0) 69 | 70 | # initialize the model 71 | model = GenerativeModel(config, tokenizer) 72 | model.cuda(device=config.gpu_device) 73 | 74 | # optimizer 75 | param_groups = [{'params': model.parameters(), 'lr': config.learning_rate, 'weight_decay': config.weight_decay}] 76 | optimizer = AdamW(params=param_groups) 77 | schedule = get_linear_schedule_with_warmup(optimizer, 78 | num_warmup_steps=train_batch_num*config.warmup_epoch, 79 | num_training_steps=train_batch_num*config.max_epoch) 80 | 81 | 82 | 
def evaluation(model, dataset, keyword_dataset, config, progress):
    """Run keyword and event prediction on a dataset pair and score them.

    ``dataset`` and ``keyword_dataset`` are iterated in lockstep (unshuffled,
    same batch size); example ``k`` of one is paired with example ``k`` of the
    other, and iteration stops at the shorter of the two.

    Args:
        model: object with ``eval()`` and ``predict(batch, num_beams, max_length)``.
        dataset: event dataset exposing ``collate_fn``.
        keyword_dataset: keyword dataset exposing ``collate_fn``.
        config: namespace providing ``dataset``, ``eval_batch_size``,
            ``beam_size``, ``max_output_length``, ``input_style`` and
            ``output_style``.
        progress: progress bar; ``update(1)`` is called once per batch.

    Returns:
        ``(eval_scores, write_output, keyword_write_output)`` where
        ``eval_scores`` maps ``keyword_id``/``tri_id``/``arg_id``/``arg_cls``
        to the result of ``compute_f1`` and the two lists hold one record per
        evaluated example.

    Raises:
        ValueError: if ``config.dataset`` is not a supported dataset name.
    """
    if config.dataset == "ace05e" or config.dataset == "ace05ep":
        import template_ace
        template_file = "template_ace"
    elif config.dataset == "ere":
        import template_ere
        template_file = "template_ere"
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(f"Unsupported dataset: {config.dataset!r}")
    template_module = sys.modules[template_file]

    def build_template(info):
        # Template classes are named after the event type with ':' and '-'
        # replaced by '_'.
        theclass = getattr(template_module, info[1].replace(':', '_').replace('-', '_'), False)
        assert theclass
        return theclass(config.input_style, config.output_style, info[2], info[1], info[0])

    model.eval()
    write_output = []
    keyword_write_output = []
    eval_gold_key_num, eval_pred_key_num, eval_match_key_num = 0, 0, 0
    eval_gold_tri_num, eval_pred_tri_num, eval_match_tri_num = 0, 0, 0
    eval_gold_arg_num, eval_pred_arg_num, eval_match_arg_id, eval_match_arg_cls = 0, 0, 0, 0

    event_loader = DataLoader(dataset, batch_size=config.eval_batch_size,
                              shuffle=False, collate_fn=dataset.collate_fn)
    keyword_loader = DataLoader(keyword_dataset, batch_size=config.eval_batch_size,
                                shuffle=False, collate_fn=keyword_dataset.collate_fn)

    for batch, keyword_batch in zip(event_loader, keyword_loader):
        progress.update(1)

        # ---- keyword prediction ----
        keyword_pred_text = model.predict(keyword_batch, num_beams=config.beam_size, max_length=config.max_output_length)
        keyword_gold_text = keyword_batch.target_text
        keyword_input_text = keyword_batch.input_text
        # The template is built from the *event* batch info; keyword_batch.infos
        # only takes part in the zip (it bounds the number of pairs).
        for i_text, g_text, p_text, info, _ in zip(keyword_input_text, keyword_gold_text, keyword_pred_text, batch.infos, keyword_batch.infos):
            template = build_template(info)

            # decode predictions
            pred_object = template.decode_keywords(p_text)

            # calculate scores
            sub_scores = template.evaluate_keywords(pred_object)
            eval_gold_key_num += sub_scores['gold_num']
            eval_pred_key_num += sub_scores['pred_num']
            eval_match_key_num += sub_scores['match_num']
            keyword_write_output.append({
                'input text': i_text,
                'gold text': g_text,
                'pred text': p_text,
                'gold keyword spans': template.get_keyword_spans(),
                'pred keyword spans': pred_object,
                'score': sub_scores,
            })

        # ---- trigger / argument prediction ----
        pred_text = model.predict(batch, num_beams=config.beam_size, max_length=config.max_output_length)
        gold_text = batch.target_text
        input_text = batch.input_text
        for i_text, g_text, p_text, info in zip(input_text, gold_text, pred_text, batch.infos):
            template = build_template(info)

            # decode predictions
            pred_object = template.decode(p_text)
            gold_object = template.trigger_span + [_ for _ in template.get_converted_gold()]

            # calculate scores
            sub_scores = template.evaluate(pred_object)
            eval_gold_tri_num += sub_scores['gold_tri_num']
            eval_pred_tri_num += sub_scores['pred_tri_num']
            eval_match_tri_num += sub_scores['match_tri_num']
            eval_gold_arg_num += sub_scores['gold_arg_num']
            eval_pred_arg_num += sub_scores['pred_arg_num']
            eval_match_arg_id += sub_scores['match_arg_id']
            eval_match_arg_cls += sub_scores['match_arg_cls']
            write_output.append({
                'input text': i_text,
                'gold text': g_text,
                'pred text': p_text,
                'gold triggers': gold_object,
                'pred triggers': pred_object,
                'score': sub_scores,
                'gold events': info[0]
            })

    eval_scores = {
        'keyword_id': compute_f1(eval_pred_key_num, eval_gold_key_num, eval_match_key_num),
        'tri_id': compute_f1(eval_pred_tri_num, eval_gold_tri_num, eval_match_tri_num),
        'arg_id': compute_f1(eval_pred_arg_num, eval_gold_arg_num, eval_match_arg_id),
        'arg_cls': compute_f1(eval_pred_arg_num, eval_gold_arg_num, eval_match_arg_cls)
    }

    # print scores
    separator = "---------------------------------------------------------------------"
    row = '{} - P: {:5.2f} ({:4d}/{:4d}), R: {:5.2f} ({:4d}/{:4d}), F: {:5.2f}'

    def log_row(label, key, match_num, pred_num, gold_num):
        # Indices 0/1/2 of a compute_f1 result are logged as P/R/F.
        scores = eval_scores[key]
        logger.info(row.format(label,
                               scores[0] * 100.0, match_num, pred_num,
                               scores[1] * 100.0, match_num, gold_num, scores[2] * 100.0))

    logger.info(separator)
    log_row('Keyword I', 'keyword_id', eval_match_key_num, eval_pred_key_num, eval_gold_key_num)
    logger.info(separator)
    log_row('Trigger I', 'tri_id', eval_match_tri_num, eval_pred_tri_num, eval_gold_tri_num)
    logger.info(separator)
    log_row('Role I', 'arg_id', eval_match_arg_id, eval_pred_arg_num, eval_gold_arg_num)
    log_row('Role C', 'arg_cls', eval_match_arg_cls, eval_pred_arg_num, eval_gold_arg_num)
    logger.info(separator)

    return eval_scores, write_output, keyword_write_output
logger.info(f"Epoch {epoch}") 202 | 203 | # training 204 | progress = tqdm.tqdm(total=train_batch_num, ncols=75, desc='Train {}'.format(epoch)) 205 | model.train() 206 | optimizer.zero_grad() 207 | for batch_idx, (batch, keyword_batch) in enumerate(zip(DataLoader(train_set, batch_size=config.train_batch_size // config.accumulate_step, 208 | shuffle=True, drop_last=False, collate_fn=train_set.collate_fn), 209 | DataLoader(keyword_train_set, batch_size=config.train_batch_size // config.accumulate_step, 210 | shuffle=True, drop_last=False, collate_fn=keyword_train_set.collate_fn))): 211 | # forard model 212 | ee_loss = model(batch) 213 | keyword_loss = model(keyword_batch) 214 | loss = ee_loss + keyword_loss 215 | 216 | # record loss 217 | summarizer.scalar_summary('train/loss', loss, summarizer_step) 218 | summarizer_step += 1 219 | 220 | loss = loss * (1 / config.accumulate_step) 221 | loss.backward() 222 | 223 | if (batch_idx + 1) % config.accumulate_step == 0: 224 | progress.update(1) 225 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clipping) 226 | optimizer.step() 227 | schedule.step() 228 | optimizer.zero_grad() 229 | progress.close() 230 | 231 | # eval dev set 232 | best_dev_flag = False 233 | progress = tqdm.tqdm(total=dev_batch_num, ncols=75, desc='Dev {}'.format(epoch)) 234 | dev_scores, write_output, keyword_write_output = evaluation(model, dev_set, keyword_dev_set, config, progress) 235 | progress.close() 236 | 237 | # check best dev model 238 | if dev_scores['arg_cls'][2] > best_dev_scores['arg_cls'][2]: 239 | best_dev_flag = True 240 | 241 | # if best dev, save model and evaluate test set 242 | # if best_dev_flag: 243 | if True: 244 | best_dev_scores = dev_scores 245 | best_dev_epoch = epoch 246 | 247 | # save best model 248 | logger.info('Saving best model') 249 | torch.save(model.state_dict(), best_model_path) 250 | 251 | # save dev result 252 | with open(dev_prediction_path, 'w') as fp: 253 | json.dump(write_output, fp, indent=4) 
class Info:
    """Running counts of sentences, entities, events and roles.

    ``entity``, ``event`` and ``role`` are ``Counter`` objects keyed by type
    name; the ``*_num`` attributes are the corresponding totals.
    """

    def __init__(self):
        self.sentence_num = 0
        self.entity = Counter()
        self.entity_num = 0
        self.event = Counter()
        self.event_num = 0
        self.role = Counter()
        self.role_num = 0

    def update(self, instance):
        """Add the counts of one instance (one sentence/window) in place.

        Args:
            instance: dict with ``'entity_mentions'`` (each having
                ``'entity_type'``) and ``'event_mentions'`` (each having
                ``'event_type'`` and ``'arguments'`` with a ``'role'``).
        """
        self.sentence_num += 1
        for entity in instance['entity_mentions']:
            self.entity[entity['entity_type']] += 1
            self.entity_num += 1

        for event in instance['event_mentions']:
            self.event[event['event_type']] += 1
            self.event_num += 1
            for argument in event['arguments']:
                self.role[argument['role']] += 1
                self.role_num += 1

    def __iadd__(self, obj):
        """Accumulate the counts of ``obj`` into this object (``a += b``)."""
        self.sentence_num += obj.sentence_num
        self.entity = self.entity + obj.entity
        self.entity_num += obj.entity_num
        self.event = self.event + obj.event
        self.event_num += obj.event_num
        self.role = self.role + obj.role
        self.role_num += obj.role_num
        return self

    def __add__(self, obj):
        """Return a new ``Info`` holding the combined counts.

        Neither operand is modified (previously ``a + b`` mutated and
        returned ``a``).
        """
        total = Info()
        total += self
        total += obj
        return total
-------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | from transformers import BertTokenizer, RobertaTokenizer, AutoTokenizer 4 | 5 | 6 | def map_index(pieces): 7 | idxs = [] 8 | for i, piece in enumerate(pieces): 9 | if i == 0: 10 | idxs.append([0, len(piece)]) 11 | else: 12 | _, last = idxs[-1] 13 | idxs.append([last, last + len(piece)]) 14 | return idxs 15 | 16 | def map_decode_back_pieces(encoded_input, ori_tokens, tokenizer): 17 | decoded = [tokenizer.decode(x) for x in encoded_input['input_ids']] 18 | pieces = [] 19 | ori_cnt = 0 20 | current = [] 21 | for d in decoded: 22 | if d == '': 23 | continue 24 | current.append(d) 25 | if ''.join(current) == ori_tokens[ori_cnt]: 26 | pieces.append(current) 27 | current = [] 28 | ori_cnt += 1 29 | assert len(pieces) == len(ori_tokens) 30 | return pieces 31 | 32 | def convert(input_file, output_file, tokenizer, window_size_=3): 33 | with open(input_file, 'r', encoding='utf-8') as r, \ 34 | open(output_file, 'w', encoding='utf-8') as w: 35 | for line in r: 36 | doc = json.loads(line) 37 | doc_id = doc['doc_key'] 38 | sentences = doc['sentences'] 39 | sent_num = len(sentences) 40 | total_tokens = sum([len(sent) for sent in sentences]) 41 | coref_entities = doc['clusters'] 42 | coref_events = doc['event_clusters'] 43 | # upper bound on token index for checking index in range 44 | sent_starts = doc['_sentence_start'] + [total_tokens] 45 | entities = doc.get('ner', [[] for _ in range(sent_num)]) 46 | relations = doc.get('relations', [[] for _ in range(sent_num)]) 47 | events = doc.get('events', [[] for _ in range(sent_num)]) 48 | 49 | if window_size_ > sent_num: 50 | window_size = sent_num 51 | else: 52 | window_size = window_size_ 53 | 54 | offset = 0 55 | for i in range(sent_num - window_size + 1): 56 | wnd_sent_starts = sent_starts[i:i+window_size+1] 57 | wnd_start, wnd_end = wnd_sent_starts[0], wnd_sent_starts[-1] 58 | 59 | def 
slice_fn(lst, ind, wnd): 60 | return [item for j in range(wnd) for item in lst[ind+j]] 61 | wnd_tokens, wnd_entities, wnd_relations, wnd_events = [slice_fn( 62 | lst, i, window_size) for lst in [sentences, entities, relations, events]] 63 | 64 | wnd_id = '{}-{}'.format(doc_id, i) 65 | pieces = [tokenizer.tokenize(t) for t in wnd_tokens] 66 | word_lens = [len(p) for p in pieces] 67 | 68 | wnd_entities_ = [] 69 | wnd_entity_map = {} 70 | for j, (start, end, entity_type) in enumerate(wnd_entities): 71 | start, end = start - offset, end - offset + 1 72 | entity_id = '{}-E{}'.format(wnd_id, j) 73 | entity = { 74 | 'id': entity_id, 75 | 'start': start, 'end': end, 76 | 'entity_type': entity_type, 77 | # Mention types are not included in DyGIE++'s format 78 | 'mention_type': 'UNK', 79 | 'text': ' '.join(wnd_tokens[start:end])} 80 | wnd_entities_.append(entity) 81 | wnd_entity_map[(start, end)] = entity 82 | 83 | wnd_relations_ = [] 84 | for j, (start1, end1, start2, end2, rel_type) in enumerate(wnd_relations): 85 | start1, end1 = start1 - offset, end1 - offset + 1 86 | start2, end2 = start2 - offset, end2 - offset + 1 87 | arg1 = wnd_entity_map[(start1, end1)] 88 | arg2 = wnd_entity_map[(start2, end2)] 89 | relation_id = '{}-R{}'.format(wnd_id, j) 90 | rel_type = rel_type.split('.')[0] 91 | relation = { 92 | 'relation_type': rel_type, 93 | 'id': relation_id, 94 | 'arguments': [ 95 | { 96 | 'entity_id': arg1['id'], 97 | 'text': arg1['text'], 98 | 'role': 'Arg-1' 99 | }, 100 | { 101 | 'entity_id': arg2['id'], 102 | 'text': arg2['text'], 103 | 'role': 'Arg-2' 104 | }, 105 | ] 106 | } 107 | wnd_relations_.append(relation) 108 | 109 | # parse coref entities 110 | # for each entity mention in a coref, only look up the obj in the dict if they are in the window 111 | wnd_coref_ents = [[wnd_entity_map[(ent[0]-offset, ent[1]-offset+1)] for ent in coref if wnd_start <= ent[0] 112 | and ent[1] < wnd_end and (ent[0]-offset, ent[1]-offset+1) in wnd_entity_map] for coref in 
coref_entities] 113 | wnd_coref_ents_ = [] 114 | for j, ent_list in enumerate(wnd_coref_ents): 115 | if len(ent_list) > 1: 116 | wnd_coref_ents_.append({ 117 | 'id': '{}-CE{}'.format(wnd_id, j), 118 | 'entities': ent_list 119 | }) 120 | 121 | wnd_events_ = [] 122 | wnd_event_map = {} 123 | for j, event in enumerate(wnd_events): 124 | event_id = '{}-EV{}'.format(wnd_id, j) 125 | if len(event[0]) == 3: 126 | trigger_start, trigger_end, event_type = event[0] 127 | elif len(event[0]) == 2: 128 | trigger_start, event_type = event[0] 129 | trigger_end = trigger_start 130 | trigger_start, trigger_end = trigger_start - offset, trigger_end - offset + 1 131 | event_type = event_type.replace('.', ':') 132 | args = event[1:] 133 | args_ = [] 134 | for arg_start, arg_end, role in args: 135 | arg_start, arg_end = arg_start - offset, arg_end - offset +1 136 | arg = wnd_entity_map[(arg_start, arg_end)] 137 | args_.append({ 138 | 'entity_id': arg['id'], 139 | 'text': arg['text'], 140 | 'role': role 141 | }) 142 | event_obj = { 143 | 'event_type': event_type, 144 | 'id': event_id, 145 | 'trigger': { 146 | 'start': trigger_start, 147 | 'end': trigger_end, 148 | 'text': ' '.join(wnd_tokens[trigger_start:trigger_end]) 149 | }, 150 | 'arguments': args_ 151 | } 152 | wnd_events_.append(event_obj) 153 | wnd_event_map[(trigger_start, trigger_end)] = event_obj 154 | 155 | # parse coref events 156 | wnd_coref_evts = [[wnd_event_map[(evt[0]-offset, evt[1]-offset+1)] 157 | for evt in coref if wnd_start <= evt[0] and evt[1] < wnd_end 158 | and (evt[0]-offset, evt[1]-offset+1) in wnd_event_map] for coref in coref_events] 159 | wnd_coref_evts_ = [] 160 | for j, evt_list in enumerate(wnd_coref_evts): 161 | if len(evt_list) > 1: 162 | wnd_coref_evts_.append({ 163 | 'id': '{}-CEV{}'.format(wnd_id, j), 164 | 'events': evt_list 165 | }) 166 | 167 | wnd_ = { 168 | 'doc_id': doc_id, 169 | 'wnd_id': wnd_id, 170 | 'entity_mentions': wnd_entities_, 171 | 'relation_mentions': wnd_relations_, 172 | 
if __name__ == '__main__':
    # Command-line entry point: pick a tokenizer matching the requested model
    # name and run `convert` on the input file.
    parser = ArgumentParser()
    # Input and output paths are mandatory: without them `convert` would be
    # called with None and fail far away from the real cause.
    parser.add_argument('-i', '--input', required=True,
                        help='Path to the input file')
    parser.add_argument('-o', '--output', required=True,
                        help='Path to the output file')
    parser.add_argument('-b', '--bert', help='BERT model name',
                        default='bert-large-cased')
    parser.add_argument('-w', '--window', default=1, type=int,
                        help='Integer for window size')
    args = parser.parse_args()

    # Choose the tokenizer class from the model-name prefix; any other name
    # falls back to AutoTokenizer with the slow (non-fast) implementation.
    model_name = args.bert
    if model_name.startswith('bert-'):
        bert_tokenizer = BertTokenizer.from_pretrained(args.bert,
                                                       do_lower_case=False)
    elif model_name.startswith('roberta-'):
        bert_tokenizer = RobertaTokenizer.from_pretrained(args.bert,
                                                          do_lower_case=False)
    else:
        bert_tokenizer = AutoTokenizer.from_pretrained(args.bert,
                                                       do_lower_case=False,
                                                       use_fast=False)

    convert(args.input, args.output, bert_tokenizer, args.window)
def main(argv=None):
    """Keep only the documents whose ``doc_key`` is listed in a split file.

    Reads a JSON-lines input file, selects the documents whose ``doc_key``
    appears in the split file (one key per line) and writes them, in input
    order, as JSON lines to the output path. Prints how many were written.

    Args:
        argv: Optional list of command-line arguments; ``None`` means the
            process arguments are used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_path", type=str, required=True)
    parser.add_argument("-s", "--split_path", type=str, required=True)
    parser.add_argument("-o", "--output_path", type=str, required=True)
    args = parser.parse_args(argv)

    # Read the whole input before the output is opened, so the output path
    # may safely be the same file as the input. `with` closes the handles.
    with open(args.input_path, 'r') as f:
        inp = [json.loads(line) for line in f]

    # A set gives constant-time membership tests instead of scanning the
    # whole split list once per document.
    with open(args.split_path, 'r') as f:
        split = {x.strip('\n') for x in f}

    counter = 0
    with open(args.output_path, 'w') as f:
        for doc in inp:
            if doc['doc_key'] in split:
                counter += 1
                f.write(json.dumps(doc) + '\n')

    print('Processed {} number of instances'.format(counter))


if __name__ == '__main__':
    main()
CNN_ENG_20030403_183513.1 5 | CNN_ENG_20030428_130651.4 6 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_002: -------------------------------------------------------------------------------- 1 | APW_ENG_20030408.0090 2 | CNN_ENG_20030605_065831.18 3 | CNN_ENG_20030513_160506.16 4 | CNNHL_ENG_20030609_133335.37 5 | CNN_ENG_20030401_233449.5 6 | CNN_ENG_20030421_120508.17 7 | APW_ENG_20030619.0383 8 | CNN_IP_20030329.1600.01-3 9 | MARKBACKER_20041217.1639 10 | AGGRESSIVEVOICEDAILY_20050116.2149 11 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_003: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030508_170552.18 2 | APW_ENG_20030410.0906 3 | CNN_ENG_20030418_063040.1 4 | CNN_ENG_20030424_073006.4 5 | CNN_ENG_20030617_193116.10 6 | CNN_ENG_20030306_070606.18 7 | MARKBACKER_20041206.0733 8 | CNN_ENG_20030612_173004.2 9 | MARKETVIEW_20050204.1736 10 | CNN_CF_20030303.1900.00 11 | CNN_ENG_20030421_120508.17 12 | MARKETVIEW_20041215.2128 13 | rec.music.phish_20041215.1554 14 | AGGRESSIVEVOICEDAILY_20041208.2133 15 | APW_ENG_20030416.0581 16 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_005: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030506_053020.14 2 | CNN_ENG_20030624_065843.24 3 | MARKETVIEW_20050226.1307 4 | APW_ENG_20030610.0010 5 | CNN_ENG_20030403_183513.1 6 | MARKETVIEW_20050225.0541 7 | soc.culture.china_20050203.0639 8 | CNN_ENG_20030426_160621.0 9 | MARKETVIEW_20050212.1607 10 | AGGRESSIVEVOICEDAILY_20050205.1954 11 | MARKBACKER_20041117.1107 12 | CNN_ENG_20030610_133041.17 13 | CNN_ENG_20030513_113501.6 14 | XIN_ENG_20030415.0379 15 | fsh_29786 16 | alt.obituaries_20041121.1339 17 | CNN_ENG_20030508_170552.18 18 | 
APW_ENG_20030327.0376 19 | APW_ENG_20030424.0698 20 | XIN_ENG_20030327.0202 21 | CNNHL_ENG_20030513_183907.5 22 | CNN_IP_20030405.1600.01-3 23 | MARKETVIEW_20050222.1919 24 | CNNHL_ENG_20030610_133347.6 25 | CNNHL_ENG_20030611_133445.24 26 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_010: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030422_083005.10 2 | FLOPPINGACES_20041113.1528.042 3 | BACONSREBELLION_20050205.1919 4 | MARKETVIEW_20050105.1901 5 | MARKBACKER_20041112.0707 6 | AGGRESSIVEVOICEDAILY_20050224.2252 7 | CNN_ENG_20030424_113549.11 8 | CNN_ENG_20030318_140851.8 9 | CNN_ENG_20030502_080020.7 10 | APW_ENG_20030527.0232 11 | CNN_ENG_20030509_090025.5 12 | MARKETVIEW_20050208.2059 13 | FLOPPINGACES_20041116.0833.027 14 | soc.culture.jewish_20050130.2105 15 | CNN_ENG_20030508_170552.18 16 | APW_ENG_20030520.0757 17 | CNNHL_ENG_20030416_193742.7 18 | FLOPPINGACES_20041114.1240.039 19 | CNNHL_ENG_20030416_133739.9 20 | CNNHL_ENG_20030429_220618.15 21 | XIN_ENG_20030314.0208 22 | rec.boats_20050130.1006 23 | CNN_ENG_20030530_130025.12 24 | rec.parks.theme_20050217.2019 25 | fsh_29505 26 | BACONSREBELLION_20050218.0848 27 | CNN_ENG_20030312_223733.14 28 | XIN_ENG_20030624.0085 29 | CNN_ENG_20030626_203133.11 30 | CNNHL_ENG_20030505_220734.25 31 | CNNHL_ENG_20030331_193419.9 32 | CNNHL_ENG_20030616_230155.28 33 | CNN_IP_20030409.1600.04 34 | misc.kids.pregnancy_20050120.0404 35 | XIN_ENG_20030523.0202 36 | CNN_ENG_20030416_160804.4 37 | CNN_ENG_20030426_160621.0 38 | BACONSREBELLION_20050206.1345 39 | fsh_29171 40 | MARKBACKER_20041128.1641 41 | MARKBACKER_20041119.1002 42 | CNN_ENG_20030610_123040.9 43 | CNN_IP_20030330.1600.05-2 44 | rec.travel.europe_20050101.1800 45 | CNNHL_ENG_20030618_230303.36 46 | fsh_29105 47 | fsh_29303 48 | CNN_ENG_20030327_163556.20 49 | CNN_IP_20030328.1600.07 50 | CNN_ENG_20030501_063017.15 51 | 
-------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_020: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030529_085826.10 2 | APW_ENG_20030610.0010 3 | CNNHL_ENG_20030610_230438.14 4 | CNN_CF_20030304.1900.02 5 | CNN_ENG_20030403_183513.1 6 | fsh_29344 7 | CNN_ENG_20030306_070606.18 8 | fsh_29226 9 | CNN_ENG_20030428_130651.4 10 | CNN_ENG_20030318_140851.8 11 | fsh_29581_1 12 | CNN_IP_20030329.1600.00-6 13 | CNN_IP_20030402.1600.02-1 14 | CNN_ENG_20030528_172957.18 15 | MARKETVIEW_20050127.0716 16 | CNN_ENG_20030403_060032.0 17 | FLOPPINGACES_20041230.1844.003 18 | MARKETVIEW_20041220.1537 19 | CNN_ENG_20030424_113549.11 20 | MARKETVIEW_20050208.2033 21 | CNN_ENG_20030603_133025.7 22 | MARKETVIEW_20050205.1358 23 | CNN_IP_20030417.1600.06 24 | fsh_29774 25 | APW_ENG_20030519.0548 26 | MARKETVIEW_20050210.2138 27 | rec.boats_20050130.1006 28 | CNN_ENG_20030408_153616.9 29 | CNN_ENG_20030515_073019.7 30 | CNN_ENG_20030320_153434.7 31 | MARKBACKER_20050217.0647 32 | AGGRESSIVEVOICEDAILY_20050114.1922 33 | CNN_ENG_20030408_200618.14 34 | CNN_ENG_20030627_130145.6 35 | CNN_ENG_20030621_115841.16 36 | misc.legal.moderated_20041202.1648 37 | BACONSREBELLION_20050210.0728 38 | CNN_ENG_20030617_112838.4 39 | BACONSREBELLION_20050205.1919 40 | fsh_29786 41 | CNN_IP_20030330.1600.06 42 | MARKBACKER_20050103.0829 43 | soc.history.what-if_20050129.1404 44 | fsh_29326 45 | CNN_ENG_20030529_130011.6 46 | CNN_IP_20030405.1600.01-3 47 | CNN_ENG_20030514_130518.5 48 | CNNHL_ENG_20030604_230238.5 49 | CNN_ENG_20030429_083016.5 50 | CNN_ENG_20030424_183556.7 51 | APW_ENG_20030326.0190 52 | BACONSREBELLION_20050125.1108 53 | CNN_ENG_20030407_170605.7 54 | CNN_IP_20030329.1600.00-3 55 | XIN_ENG_20030609.0118 56 | CNNHL_ENG_20030402_133449.22 57 | CNN_IP_20030402.1600.00-3 58 | CNN_ENG_20030625_210122.0 59 | CNN_ENG_20030430_160723.6 60 | soc.culture.iraq_20050211.0445 
61 | FLOPPINGACES_20041113.1528.042 62 | CNN_ENG_20030625_220123.3 63 | MARKBACKER_20041216.0656 64 | CNN_ENG_20030528_082823.9 65 | FLOPPINGACES_20041114.1240.039 66 | CNN_CF_20030304.1900.06-2 67 | APW_ENG_20030406.0191 68 | CNNHL_ENG_20030526_221156.39 69 | CNN_ENG_20030331_123648.4 70 | AGGRESSIVEVOICEDAILY_20050107.2012 71 | CNN_LE_20030504.1200.02-2 72 | MARKETVIEW_20050204.1337 73 | soc.culture.indian_20041104.2348 74 | MARKBACKER_20041117.1107 75 | APW_ENG_20030403.0862 76 | AGGRESSIVEVOICEDAILY_20050116.2149 77 | CNN_ENG_20030525_143522.8 78 | XIN_ENG_20030513.0002 79 | NYT_ENG_20030602.0074 80 | CNNHL_ENG_20030425_183518.12 81 | fsh_29586 82 | rec.travel.usa-canada_20050128.0121 83 | CNN_ENG_20030429_110706.7 84 | CNNHL_ENG_20030513_220910.11 85 | CNN_IP_20030330.1600.05-2 86 | BACONSREBELLION_20050209.0721 87 | FLOPPINGACES_20050101.2244.048 88 | CNN_ENG_20030430_063016.14 89 | AGGRESSIVEVOICEDAILY_20050203.1356 90 | CNN_ENG_20030401_233449.5 91 | MARKETVIEW_20050212.1717 92 | MARKBACKER_20041128.1641 93 | MARKBACKER_20041103.1300 94 | MARKETVIEW_20041212.1447 95 | CNN_CF_20030305.1900.06-1 96 | CNN_ENG_20030509_123601.13 97 | MARKETVIEW_20050208.2059 98 | CNN_ENG_20030630_075848.7 99 | AGGRESSIVEVOICEDAILY_20041101.1144 100 | CNN_ENG_20030401_073033.14 101 | CNN_ENG_20030506_053020.14 102 | OIADVANTAGE_20050204.1155 103 | CNN_ENG_20030429_190711.14 104 | XIN_ENG_20030523.0202 105 | APW_ENG_20030520.0081 106 | CNN_ENG_20030312_223733.14 107 | CNN_ENG_20030617_173115.22 108 | CNN_CF_20030303.1900.05 109 | fsh_29192 110 | CNN_IP_20030405.1600.02 111 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_030: -------------------------------------------------------------------------------- 1 | CNN_IP_20030405.1600.00-2 2 | BACONSREBELLION_20050204.1326 3 | CNN_ENG_20030501_063017.15 4 | CNN_ENG_20030501_160459.0 5 | fsh_29350 6 | CNN_ENG_20030610_095857.4 7 | fsh_29388 8 | 
CNN_ENG_20030626_193133.8 9 | CNN_IP_20030404.1600.00-2 10 | fsh_29139 11 | soc.culture.jewish_20050130.2105 12 | CNN_ENG_20030605_193002.8 13 | CNN_LE_20030504.1200.02-1 14 | MARKETVIEW_20050212.1607 15 | misc.invest.marketplace_20050208.2406 16 | APW_ENG_20030519.0367 17 | CNNHL_ENG_20030610_230438.14 18 | CNN_ENG_20030527_195948.3 19 | XIN_ENG_20030509.0137 20 | rec.arts.sf.written.robert-jordan_20050208.1350 21 | CNN_ENG_20030527_215946.12 22 | rec.boats_20050130.1006 23 | fsh_29592 24 | CNNHL_ENG_20030526_221156.39 25 | MARKBACKER_20041119.1002 26 | CNN_IP_20030330.1600.06 27 | CNN_ENG_20030516_090022.7 28 | CNN_IP_20030405.1600.01-2 29 | AGGRESSIVEVOICEDAILY_20041223.1449 30 | APW_ENG_20030327.0376 31 | MARKETVIEW_20050127.0716 32 | CNN_ENG_20030408_083034.11 33 | CNN_CF_20030305.1900.00-2 34 | CNNHL_ENG_20030604_230238.5 35 | misc.legal.moderated_20050129.2225 36 | OIADVANTAGE_20050203.2102 37 | NYT_ENG_20030630.0079 38 | fsh_29628 39 | CNN_ENG_20030617_112838.4 40 | AGGRESSIVEVOICEDAILY_20050224.1207 41 | alt.sys.pc-clone.dell_20050226.2350 42 | CNN_ENG_20030423_180539.2 43 | CNN_IP_20030408.1600.04 44 | BACONSREBELLION_20050226.1317 45 | fsh_29195 46 | APW_ENG_20030502.0470 47 | rec.travel.europe_20050101.1800 48 | AGGRESSIVEVOICEDAILY_20041226.1712 49 | BACONSREBELLION_20050222.1348 50 | CNN_ENG_20030403_060032.0 51 | MARKETVIEW_20041220.1537 52 | CNN_ENG_20030525_143522.8 53 | HEALINGIRAQ_20041108.1942.05 54 | GETTINGPOLITICAL_20050105.0127.001 55 | CNNHL_ENG_20030618_230303.6 56 | CNN_ENG_20030620_095840.4 57 | AFP_ENG_20030330.0211 58 | fsh_29105 59 | CNNHL_ENG_20030625_230351.4 60 | OIADVANTAGE_20050109.1947 61 | CNN_ENG_20030430_093016.0 62 | CNN_ENG_20030525_160525.13 63 | APW_ENG_20030422.0469 64 | CNN_ENG_20030612_173004.2 65 | CNN_ENG_20030408_200618.14 66 | CNN_ENG_20030416_160804.4 67 | CNN_ENG_20030627_065846.3 68 | MARKETVIEW_20041209.1401 69 | CNN_ENG_20030528_172957.18 70 | rec.travel.cruises_20050216.1636 71 | CNN_ENG_20030428_193655.2 72 
| MARKBACKER_20041217.1639 73 | CNN_IP_20030403.1600.00-1 74 | CNN_IP_20030402.1600.02-2 75 | CNN_ENG_20030421_120508.17 76 | fsh_29192 77 | XIN_ENG_20030324.0191 78 | rec.sport.disc_20050209.2202 79 | CNN_CF_20030305.1900.06-2 80 | CNN_ENG_20030526_133535.4 81 | fsh_29581_1 82 | APW_ENG_20030406.0191 83 | CNN_ENG_20030619_115954.4 84 | MARKBACKER_20050105.1632 85 | BACONSREBELLION_20050216.1618 86 | fsh_29505 87 | CNN_IP_20030330.1600.05-2 88 | AGGRESSIVEVOICEDAILY_20041208.2133 89 | CNN_ENG_20030415_183752.14 90 | OIADVANTAGE_20050110.1009 91 | MARKETVIEW_20050226.1444 92 | BACONSREBELLION_20050218.0848 93 | AGGRESSIVEVOICEDAILY_20041215.2302 94 | CNNHL_ENG_20030416_133739.9 95 | CNN_ENG_20030603_133025.7 96 | XIN_ENG_20030425.0184 97 | MARKETVIEW_20041211.1845 98 | APW_ENG_20030415.0742 99 | APW_ENG_20030519.0548 100 | CNN_ENG_20030312_083725.3 101 | AGGRESSIVEVOICEDAILY_20050114.1922 102 | APW_ENG_20030619.0383 103 | CNN_ENG_20030306_070606.18 104 | CNN_IP_20030405.1600.02 105 | CNN_ENG_20030507_160538.15 106 | MARKBACKER_20041216.0656 107 | fsh_29187 108 | MARKETVIEW_20050206.1951 109 | fsh_29520 110 | CNN_IP_20030404.1600.00-1 111 | CNNHL_ENG_20030416_193742.7 112 | alt.gossip.celebrities_20050218.0826 113 | CNN_ENG_20030528_195959.20 114 | BACONSREBELLION_20050127.1017 115 | CNN_ENG_20030403_080032.9 116 | CNN_CF_20030305.1900.06-1 117 | CNN_ENG_20030507_170539.0 118 | CNN_IP_20030329.1600.01-1 119 | alt.politics.economics_20041206.1835 120 | CNN_ENG_20030605_065831.18 121 | AGGRESSIVEVOICEDAILY_20041201.2313 122 | CNNHL_ENG_20030407_193547.5 123 | CNNHL_ENG_20030411_230640.38 124 | CNN_ENG_20030429_110706.7 125 | APW_ENG_20030412.0531 126 | CNN_ENG_20030619_115954.10 127 | fsh_29121 128 | CNNHL_ENG_20030611_133445.24 129 | APW_ENG_20030610.0010 130 | CNN_ENG_20030305_170125.1 131 | MARKETVIEW_20050126.0711 132 | BACONSREBELLION_20050222.0817 133 | CNNHL_ENG_20030624_230338.34 134 | MARKBACKER_20041117.0723 135 | CNN_CF_20030304.1900.04 136 | 
CNN_ENG_20030618_065839.11 137 | CNN_CF_20030305.1900.00-1 138 | BACONSREBELLION_20050123.1639 139 | CNN_ENG_20030610_130042.17 140 | MARKBACKER_20050217.0647 141 | XIN_ENG_20030513.0002 142 | fsh_29344 143 | CNN_ENG_20030507_060023.1 144 | CNN_ENG_20030331_123648.4 145 | CNNHL_ENG_20030402_133449.22 146 | CNN_ENG_20030408_153616.9 147 | CNN_ENG_20030508_170552.18 148 | MARKETVIEW_20050210.2138 149 | CNN_ENG_20030513_160506.16 150 | APW_ENG_20030326.0190 151 | MARKBACKER_20041220.0919 152 | XIN_ENG_20030624.0085 153 | CNN_ENG_20030528_082823.9 154 | misc.legal.moderated_20041202.1648 155 | MARKETVIEW_20050204.1322 156 | CNN_ENG_20030618_193127.17 157 | APW_ENG_20030409.0013 158 | CNN_ENG_20030618_150128.5 159 | CNN_ENG_20030605_223004.4 160 | rec.music.phish_20041215.1554 161 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_050: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030624_082841.12 2 | fsh_29581_1 3 | APW_ENG_20030602.0037 4 | APW_ENG_20030326.0190 5 | CNN_ENG_20030610_105832.1 6 | AGGRESSIVEVOICEDAILY_20041101.1806 7 | MARKBACKER_20041119.1002 8 | soc.culture.jewish_20050130.2105 9 | MARKETVIEW_20050127.0716 10 | MARKBACKER_20041202.0711 11 | CNN_IP_20030402.1600.00-3 12 | CNN_CF_20030303.1900.06-2 13 | CNN_ENG_20030621_160254.25 14 | CNN_ENG_20030526_133535.4 15 | MARKETVIEW_20050120.1641 16 | NYT_ENG_20030630.0079 17 | fsh_29344 18 | CNN_ENG_20030312_083725.3 19 | fsh_29302 20 | alt.obituaries_20041121.1339 21 | fsh_29395 22 | CNN_ENG_20030604_092828.7 23 | AGGRESSIVEVOICEDAILY_20041223.1449 24 | APW_ENG_20030502.0686 25 | CNN_ENG_20030607_170312.6 26 | fsh_29770 27 | MARKETVIEW_20041213.0722 28 | CNN_ENG_20030329_170349.7 29 | CNNHL_ENG_20030624_133331.33 30 | rec.boats_20050130.1006 31 | AGGRESSIVEVOICEDAILY_20050105.1344 32 | CNN_IP_20030402.1600.02-1 33 | CNNHL_ENG_20030505_220734.25 34 | CNN_ENG_20030402_190500.11 35 | 
AGGRESSIVEVOICEDAILY_20050224.2252 36 | AGGRESSIVEVOICEDAILY_20050107.2012 37 | CNN_ENG_20030415_103039.0 38 | CNN_ENG_20030507_170539.0 39 | MARKETVIEW_20050204.1337 40 | CNN_ENG_20030617_193116.10 41 | CNNHL_ENG_20030403_133453.21 42 | CNN_ENG_20030625_220123.3 43 | CNN_ENG_20030416_190806.4 44 | misc.legal.moderated_20050129.2225 45 | CNN_IP_20030402.1600.02-2 46 | CNN_CF_20030304.1900.06-2 47 | CNN_ENG_20030407_130604.10 48 | CNN_IP_20030402.1600.00-4 49 | CNN_ENG_20030514_130518.5 50 | CNN_ENG_20030618_150128.5 51 | APW_ENG_20030415.0742 52 | AGGRESSIVEVOICEDAILY_20041215.2302 53 | CNN_ENG_20030409_180633.8 54 | CNN_CF_20030304.1900.01 55 | CNN_ENG_20030626_193133.8 56 | CNN_ENG_20030424_183556.7 57 | FLOPPINGACES_20041230.1844.003 58 | CNN_ENG_20030624_065843.24 59 | AFP_ENG_20030323.0020 60 | CNN_ENG_20030622_173306.9 61 | APW_ENG_20030513.0139 62 | CNN_ENG_20030602_102826.13 63 | CNN_ENG_20030605_223004.4 64 | CNN_ENG_20030430_160723.6 65 | CNN_IP_20030405.1600.01-2 66 | CNN_IP_20030329.1600.00-3 67 | rec.travel.cruises_20050222.0313 68 | MARKETVIEW_20050207.0746 69 | CNN_ENG_20030428_130651.4 70 | soc.culture.indian_20041104.2348 71 | CNN_ENG_20030425_063006.5 72 | fsh_29505 73 | CNN_ENG_20030619_115954.10 74 | fsh_29336 75 | FLOPPINGACES_20041115.1613.032 76 | CNN_ENG_20030404_163526.10 77 | CNN_ENG_20030617_105836.4 78 | fsh_29195 79 | CNNHL_ENG_20030403_193455.30 80 | CNN_ENG_20030617_065838.21 81 | APW_ENG_20030520.0757 82 | soc.culture.china_20050203.0639 83 | rec.music.makers.guitar.acoustic_20041228.1628 84 | CNN_ENG_20030509_123601.13 85 | MARKBACKER_20041128.1641 86 | OIADVANTAGE_20050108.1323 87 | CNN_ENG_20030516_123543.8 88 | APW_ENG_20030412.0531 89 | CNN_ENG_20030426_160621.0 90 | CNN_ENG_20030610_085833.10 91 | CNNHL_ENG_20030312_150218.13 92 | MARKETVIEW_20050216.2120 93 | CNN_ENG_20030306_070606.18 94 | CNNHL_ENG_20030625_230351.4 95 | CNN_LE_20030504.1200.02-1 96 | BACONSREBELLION_20050127.1017 97 | CNN_LE_20030504.1200.01 98 | 
MARKETVIEW_20050105.1901 99 | CNN_ENG_20030306_083604.6 100 | MARKETVIEW_20041217.0801 101 | rec.music.phish_20041215.1554 102 | alt.atheism_20041104.2428 103 | XIN_ENG_20030624.0085 104 | MARKETVIEW_20050208.2059 105 | CNN_ENG_20030515_073019.7 106 | APW_ENG_20030423.0079 107 | CNN_ENG_20030403_183513.1 108 | fsh_29141 109 | CNN_IP_20030408.1600.04 110 | fsh_29774 111 | APW_ENG_20030408.0090 112 | MARKBACKER_20041103.1300 113 | AGGRESSIVEVOICEDAILY_20050109.1627 114 | XIN_ENG_20030423.0011 115 | AGGRESSIVEVOICEDAILY_20050116.2149 116 | CNN_ENG_20030331_193655.14 117 | CNN_CF_20030304.1900.02 118 | CNNHL_ENG_20030430_220712.37 119 | CNN_ENG_20030603_095830.17 120 | misc.kids.pregnancy_20050120.0404 121 | aus.cars_20041206.0903 122 | alt.corel_20041228.0503 123 | fsh_29139 124 | CNN_ENG_20030422_083005.10 125 | CNN_ENG_20030408_153616.9 126 | OIADVANTAGE_20050109.1947 127 | soc.culture.iraq_20050211.0445 128 | alt.books.tom-clancy_20050130.1848 129 | APW_ENG_20030327.0376 130 | CNN_ENG_20030424_073006.4 131 | APW_ENG_20030422.0469 132 | CNN_ENG_20030418_083040.11 133 | CNNHL_ENG_20030624_230338.34 134 | alt.collecting.autographs_20050224.2438 135 | MARKBACKER_20050105.1632 136 | MARKETVIEW_20041212.1447 137 | APW_ENG_20030424.0698 138 | CNN_IP_20030404.1600.00-2 139 | CNNHL_ENG_20030611_133445.24 140 | AGGRESSIVEVOICEDAILY_20041226.1712 141 | XIN_ENG_20030425.0184 142 | CNN_IP_20030409.1600.04 143 | CNNHL_ENG_20030425_183518.12 144 | CNN_ENG_20030403_090032.1 145 | AGGRESSIVEVOICEDAILY_20050114.1922 146 | CNNHL_ENG_20030513_183907.5 147 | CNN_ENG_20030417_063039.0 148 | CNN_ENG_20030429_190711.14 149 | fsh_29622 150 | alt.politics_20050124.0640 151 | CNN_LE_20030504.1200.02-2 152 | MARKETVIEW_20050208.2033 153 | CNN_IP_20030329.1600.01-3 154 | CNN_CF_20030303.1900.00 155 | CNN_IP_20030329.1600.00-6 156 | BACONSREBELLION_20050216.1618 157 | CNNHL_ENG_20030415_193729.5 158 | BACONSREBELLION_20050218.0848 159 | CNN_IP_20030406.1600.03 160 | CNN_ENG_20030403_080032.9 
161 | CNN_ENG_20030602_133012.9 162 | APW_ENG_20030610.0010 163 | CNN_IP_20030405.1600.00-3 164 | BACONSREBELLION_20050222.1348 165 | CNN_ENG_20030527_215946.12 166 | MARKETVIEW_20050201.0748 167 | NYT_ENG_20030602.0074 168 | soc.culture.hmong_20050210.1130 169 | fsh_29350 170 | MARKETVIEW_20050206.1951 171 | fsh_29226 172 | MARKBACKER_20050103.0829 173 | CNN_ENG_20030612_173004.10 174 | BACONSREBELLION_20050125.1108 175 | CNN_ENG_20030421_133510.6 176 | MARKETVIEW_20050226.1444 177 | CNN_ENG_20030614_173123.4 178 | CNN_ENG_20030430_063016.14 179 | MARKETVIEW_20041209.1401 180 | AGGRESSIVEVOICEDAILY_20041218.1004 181 | CNN_ENG_20030612_072835.2 182 | CNN_ENG_20030616_130059.25 183 | AGGRESSIVEVOICEDAILY_20050125.0136 184 | OIADVANTAGE_20041224.1007 185 | CNN_ENG_20030617_173115.22 186 | AGGRESSIVEVOICEDAILY_20041208.2133 187 | APW_ENG_20030404.0439 188 | fsh_29138 189 | CNN_ENG_20030525_160525.13 190 | AGGRESSIVEVOICEDAILY_20050124.1354 191 | CNNHL_ENG_20030513_220910.11 192 | Austin-Grad-Community_20050212.2454 193 | APW_ENG_20030410.0906 194 | CNN_ENG_20030416_100042.7 195 | CNN_ENG_20030403_180511.16 196 | alt.gossip.celebrities_20050218.0826 197 | FLOPPINGACES_20041116.0833.027 198 | APW_ENG_20030519.0548 199 | CNN_IP_20030329.1600.01-1 200 | CNN_IP_20030330.1600.06 201 | BACONSREBELLION_20050214.0944 202 | CNN_ENG_20030621_115841.16 203 | fsh_29272 204 | CNN_ENG_20030418_130831.5 205 | fsh_29388 206 | CNN_ENG_20030528_195959.20 207 | CNN_ENG_20030513_160506.16 208 | GETTINGPOLITICAL_20050105.0127.001 209 | CNN_IP_20030403.1600.00-1 210 | CNN_ENG_20030525_143522.8 211 | APW_ENG_20030416.0581 212 | AGGRESSIVEVOICEDAILY_20050208.1142 213 | XIN_ENG_20030408.0341 214 | CNN_IP_20030404.1600.00-1 215 | alt.gossip.celebrities_20041118.2331 216 | CNN_IP_20030403.1600.00-2 217 | AGGRESSIVEVOICEDAILY_20050205.1954 218 | MARKBACKER_20041220.0919 219 | CNN_ENG_20030605_085831.13 220 | AGGRESSIVEVOICEDAILY_20050203.1356 221 | CNN_ENG_20030428_193655.2 222 | 
CNN_ENG_20030430_093016.0 223 | CNN_ENG_20030506_160524.18 224 | OIADVANTAGE_20050105.0922 225 | CNN_ENG_20030415_180754.5 226 | CNN_ENG_20030507_160538.15 227 | CNN_CF_20030305.1900.06-2 228 | CNN_ENG_20030312_223733.14 229 | fsh_29520 230 | CNN_IP_20030403.1600.00-3 231 | MARKBACKER_20041112.0707 232 | fsh_29786 233 | CNN_ENG_20030411_193701.3 234 | CNN_ENG_20030417_073039.2 235 | AGGRESSIVEVOICEDAILY_20050106.1310 236 | fsh_29121 237 | BACONSREBELLION_20050206.1345 238 | CNN_IP_20030405.1600.02 239 | XIN_ENG_20030523.0202 240 | CNN_IP_20030410.1600.03-2 241 | MARKBACKER_20041216.0656 242 | fsh_29187 243 | CNN_ENG_20030506_163523.22 244 | soc.history.war.world-war-ii_20050127.2403 245 | APW_ENG_20030508.0772 246 | rec.sport.disc_20050209.2202 247 | CNN_ENG_20030620_170011.14 248 | OIADVANTAGE_20050204.1155 249 | CNN_ENG_20030421_120508.13 250 | CNNHL_ENG_20030609_133335.37 251 | CNN_ENG_20030529_130011.6 252 | CNN_ENG_20030528_172957.18 253 | fsh_29782_2 254 | CNN_ENG_20030325_220534.6 255 | MARKETVIEW_20050215.1858 256 | CNN_ENG_20030605_065831.18 257 | CNN_ENG_20030513_080020.2 258 | CNN_ENG_20030624_153103.17 259 | BACONSREBELLION_20050216.1632 260 | CNNHL_ENG_20030429_220618.15 261 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05e/doc_list_075: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030604_092828.7 2 | APW_ENG_20030502.0686 3 | MARKETVIEW_20050208.2059 4 | CNN_IP_20030329.1600.00-6 5 | CNN_ENG_20030610_105832.1 6 | CNN_ENG_20030407_130604.10 7 | CNNHL_ENG_20030430_220712.37 8 | CNN_IP_20030410.1600.03-1 9 | rec.travel.cruises_20050216.1636 10 | MARKETVIEW_20050222.0729 11 | CNN_ENG_20030607_170312.6 12 | MARKETVIEW_20050217.2115 13 | BACONSREBELLION_20050206.1345 14 | CNN_IP_20030405.1600.01-2 15 | MARKETVIEW_20050201.0748 16 | OIADVANTAGE_20041224.1007 17 | rec.sport.disc_20050209.2202 18 | MARKBACKER_20050105.1526 19 | 
alt.support.divorce_20050113.2451 20 | CNN_ENG_20030320_153434.7 21 | XIN_ENG_20030609.0118 22 | CNN_ENG_20030612_173004.10 23 | CNN_ENG_20030401_233449.5 24 | APW_ENG_20030422.0469 25 | CNN_ENG_20030612_173004.2 26 | rec.music.phish_20041215.1554 27 | CNN_ENG_20030630_085848.18 28 | CNN_IP_20030414.1600.04 29 | CNN_ENG_20030513_080020.2 30 | fsh_29586 31 | CNN_ENG_20030621_115841.16 32 | APW_ENG_20030519.0367 33 | CNN_ENG_20030305_170125.1 34 | BACONSREBELLION_20050216.1536 35 | CNN_ENG_20030515_063019.6 36 | CNN_IP_20030403.1600.00-3 37 | CNN_ENG_20030605_105831.11 38 | AGGRESSIVEVOICEDAILY_20050113.1400 39 | CNN_ENG_20030530_130025.12 40 | APW_ENG_20030502.0470 41 | CNN_ENG_20030407_080037.12 42 | AGGRESSIVEVOICEDAILY_20050213.2123 43 | CNN_ENG_20030516_123543.8 44 | CNNHL_ENG_20030416_193742.26 45 | fsh_29141 46 | BACONSREBELLION_20050222.1348 47 | CNN_ENG_20030421_090007.11 48 | CNN_ENG_20030408_153616.9 49 | fsh_29171 50 | CNN_ENG_20030602_133012.9 51 | CNN_ENG_20030627_130145.6 52 | alt.corel_20041228.0503 53 | BACONSREBELLION_20050218.0848 54 | OIADVANTAGE_20050110.1009 55 | CNN_ENG_20030421_120508.17 56 | CNN_IP_20030329.1600.01-3 57 | OIADVANTAGE_20050109.1947 58 | CNNHL_ENG_20030428_123600.14 59 | APW_ENG_20030424.0698 60 | CNN_ENG_20030624_082841.12 61 | CNN_ENG_20030607_173310.4 62 | CNN_ENG_20030408_200618.14 63 | CNNHL_ENG_20030416_193742.7 64 | CNN_ENG_20030404_163526.10 65 | APW_ENG_20030510.0228 66 | alt.books.tom-clancy_20050130.1848 67 | CNNHL_ENG_20030604_230238.5 68 | CNN_CF_20030303.1900.06-2 69 | MARKETVIEW_20050210.2138 70 | fsh_29628 71 | CNN_ENG_20030619_115954.4 72 | APW_ENG_20030403.0862 73 | OIADVANTAGE_20050203.1000 74 | MARKETVIEW_20041212.1447 75 | CNN_ENG_20030430_063016.14 76 | CNN_ENG_20030624_065843.24 77 | BACONSREBELLION_20050227.1238 78 | CNN_ENG_20030618_193127.17 79 | CNN_ENG_20030527_195948.3 80 | APW_ENG_20030423.0079 81 | CNNHL_ENG_20030402_133449.22 82 | CNN_ENG_20030513_113501.6 83 | 
Integritas-Group-Community-Forum_20050110.0557 84 | CNN_ENG_20030610_085833.10 85 | XIN_ENG_20030624.0085 86 | APW_ENG_20030414.0392 87 | CNN_ENG_20030605_153000.9 88 | CNNHL_ENG_20030416_133739.13 89 | fsh_29601 90 | MARKETVIEW_20041219.1509 91 | BACONSREBELLION_20050222.0817 92 | CNN_IP_20030417.1600.06 93 | CNN_IP_20030329.1600.00-3 94 | CNN_IP_20030403.1600.00-4 95 | alt.politics.economics_20041206.1835 96 | CNN_ENG_20030408_083034.11 97 | misc.legal.moderated_20050129.2225 98 | CNN_ENG_20030603_095830.17 99 | CNN_ENG_20030611_102832.3 100 | MARKETVIEW_20050228.2211 101 | CNN_ENG_20030506_160524.18 102 | CNN_ENG_20030403_180511.16 103 | rec.music.phish_20050217.1804 104 | BACONSREBELLION_20050125.1108 105 | CNN_ENG_20030509_090025.5 106 | CNN_ENG_20030602_102826.13 107 | CNN_ENG_20030617_065838.21 108 | misc.legal.moderated_20041202.1648 109 | CNN_ENG_20030502_093018.6 110 | AGGRESSIVEVOICEDAILY_20050124.1354 111 | CNN_ENG_20030416_100042.7 112 | rec.arts.sf.written.robert-jordan_20050208.1350 113 | BACONSREBELLION_20050214.0944 114 | CNNHL_ENG_20030415_193729.5 115 | CNNHL_ENG_20030603_230307.3 116 | AGGRESSIVEVOICEDAILY_20050203.1356 117 | CNN_ENG_20030612_072835.2 118 | MARKETVIEW_20050225.0541 119 | CNN_ENG_20030428_193655.2 120 | fsh_29526 121 | MARKBACKER_20050217.0647 122 | MARKETVIEW_20050204.1337 123 | MARKBACKER_20041108.1507 124 | CNN_LE_20030504.1200.02-2 125 | FLOPPINGACES_20041116.0833.027 126 | CNN_ENG_20030625_210122.0 127 | BACONSREBELLION_20050127.1017 128 | CNN_ENG_20030325_150531.10 129 | CNNHL_ENG_20030624_133331.33 130 | CNN_ENG_20030507_060023.1 131 | APW_ENG_20030619.0383 132 | CNN_ENG_20030512_190454.7 133 | MARKBACKER_20050105.1632 134 | MARKBACKER_20050103.0829 135 | CNN_ENG_20030411_070039.21 136 | FLOPPINGACES_20041113.1528.042 137 | CNN_ENG_20030407_170605.7 138 | CNN_ENG_20030626_193133.8 139 | APW_ENG_20030411.0304 140 | alt.atheism_20041104.2428 141 | FLOPPINGACES_20041115.1613.032 142 | APW_ENG_20030527.0232 143 | 
MARKETVIEW_20050212.1717 144 | CNN_ENG_20030610_123040.9 145 | CNN_ENG_20030429_190711.14 146 | fsh_29191 147 | AGGRESSIVEVOICEDAILY_20050116.2149 148 | APW_ENG_20030409.0013 149 | MARKETVIEW_20050212.1607 150 | MARKETVIEW_20050226.1444 151 | CNN_CF_20030305.1900.06-1 152 | CNN_IP_20030328.1600.07 153 | APW_ENG_20030326.0190 154 | CNN_ENG_20030414_130735.7 155 | MARKBACKER_20041217.1639 156 | CNN_ENG_20030612_160005.13 157 | CNN_ENG_20030329_170349.7 158 | MARKBACKER_20041117.1107 159 | talk.politics.misc_20050216.1337 160 | MARKETVIEW_20041209.1401 161 | APW_ENG_20030603.0303 162 | MARKETVIEW_20050215.1858 163 | XIN_ENG_20030324.0191 164 | misc.kids.pregnancy_20050120.0404 165 | MARKBACKER_20041202.0711 166 | OIADVANTAGE_20050105.0922 167 | CNNHL_ENG_20030625_193346.7 168 | CNN_ENG_20030602_072826.1 169 | AGGRESSIVEVOICEDAILY_20050224.1207 170 | CNN_ENG_20030506_163523.22 171 | rec.parks.theme_20050217.2019 172 | CNN_ENG_20030617_173115.14 173 | soc.culture.jewish_20050130.2105 174 | CNNHL_ENG_20030416_230741.33 175 | MARKETVIEW_20041211.1845 176 | CNN_ENG_20030525_160525.13 177 | MARKETVIEW_20050206.1951 178 | CNN_ENG_20030430_093016.0 179 | soc.culture.china_20050203.0639 180 | CNN_ENG_20030620_095840.4 181 | AGGRESSIVEVOICEDAILY_20050106.1310 182 | CNN_ENG_20030528_165958.16 183 | CNN_ENG_20030415_180754.5 184 | CNN_ENG_20030331_123648.4 185 | CNN_ENG_20030515_073019.7 186 | CNN_CF_20030305.1900.00-3 187 | MARKBACKER_20041112.0707 188 | APW_ENG_20030508.0772 189 | AGGRESSIVEVOICEDAILY_20041201.2313 190 | APW_ENG_20030412.0531 191 | MARKBACKER_20041103.1300 192 | AGGRESSIVEVOICEDAILY_20050107.2012 193 | CNN_ENG_20030327_163556.20 194 | CNN_ENG_20030306_083604.6 195 | FLOPPINGACES_20041228.0927.010 196 | FLOPPINGACES_20041114.1240.039 197 | AGGRESSIVEVOICEDAILY_20041215.2302 198 | CNN_ENG_20030619_115954.10 199 | fsh_29395 200 | CNN_CF_20030303.1900.05 201 | MARKETVIEW_20050214.2115 202 | CNNHL_ENG_20030609_133335.37 203 | CNN_ENG_20030325_220534.6 204 | 
CNN_ENG_20030508_210555.5 205 | fsh_29592 206 | APW_ENG_20030416.0581 207 | BACONSREBELLION_20050210.0728 208 | MARKETVIEW_20050226.1307 209 | MARKBACKER_20041216.0656 210 | fsh_29786 211 | MARKBACKER_20041128.1641 212 | soc.culture.indian_20041104.2348 213 | CNNHL_ENG_20030523_221118.14 214 | MARKETVIEW_20050216.2120 215 | uk.gay-lesbian-bi_20050127.0311 216 | CNNHL_ENG_20030312_150218.13 217 | CNNHL_ENG_20030513_183907.5 218 | CNN_ENG_20030429_170710.4 219 | APW_ENG_20030419.0358 220 | CNN_ENG_20030425_063006.5 221 | APW_ENG_20030422.0485 222 | BACONSREBELLION_20050204.1326 223 | MARKETVIEW_20050208.2033 224 | APW_ENG_20030417.0555 225 | fsh_29505 226 | CNN_ENG_20030411_193701.3 227 | CNN_ENG_20030527_215946.12 228 | CNN_CF_20030304.1900.06-2 229 | CNN_ENG_20030611_175950.5 230 | CNN_ENG_20030502_080020.7 231 | OIADVANTAGE_20050204.1155 232 | XIN_ENG_20030314.0208 233 | CNN_ENG_20030617_112838.4 234 | GETTINGPOLITICAL_20050105.0127.001 235 | APW_ENG_20030602.0037 236 | AGGRESSIVEVOICEDAILY_20050205.1954 237 | CNN_ENG_20030618_065839.11 238 | CNN_ENG_20030428_130651.4 239 | CNN_ENG_20030528_125956.8 240 | MARKETVIEW_20041217.0801 241 | CNN_ENG_20030403_090032.1 242 | CNN_ENG_20030617_193116.10 243 | APW_ENG_20030424.0532 244 | APW_ENG_20030610.0554 245 | APW_ENG_20030520.0081 246 | CNN_CF_20030304.1900.04 247 | CNN_IP_20030330.1600.06 248 | CNN_ENG_20030515_193533.6 249 | NYT_ENG_20030630.0079 250 | CNNHL_ENG_20030410_193626.13 251 | CNNHL_ENG_20030403_193455.30 252 | CNN_ENG_20030429_143706.14 253 | CNN_ENG_20030514_130518.5 254 | soc.history.what-if_20050129.1404 255 | CNN_IP_20030408.1600.04 256 | BACONSREBELLION_20050216.1618 257 | CNN_ENG_20030622_173306.9 258 | MARKETVIEW_20041213.0722 259 | CNN_IP_20030407.1600.05 260 | fsh_29361 261 | CNN_IP_20030329.1600.00-4 262 | rec.arts.mystery_20050219.1126 263 | CNN_ENG_20030424_173553.8 264 | CNNHL_ENG_20030611_133445.24 265 | APW_ENG_20030408.0090 266 | CNN_ENG_20030626_203133.11 267 | CNN_IP_20030412.1600.03 268 
| BACONSREBELLION_20050123.1639 269 | MARKETVIEW_20050204.1322 270 | CNN_ENG_20030401_073033.14 271 | MARKETVIEW_20050120.1641 272 | FLOPPINGACES_20041230.1844.003 273 | CNN_ENG_20030423_180539.2 274 | CNN_ENG_20030528_195959.20 275 | AGGRESSIVEVOICEDAILY_20041101.1806 276 | fsh_29302 277 | CNNHL_ENG_20030610_230438.14 278 | CNNHL_ENG_20030519_124020.23 279 | CNNHL_ENG_20030616_230155.28 280 | alt.sys.pc-clone.dell_20050226.2350 281 | APW_ENG_20030610.0010 282 | alt.obituaries_20041121.1339 283 | fsh_29139 284 | fsh_29336 285 | CNN_CF_20030305.1900.02 286 | alt.gossip.celebrities_20050218.0826 287 | CNN_ENG_20030426_160621.0 288 | fsh_29326 289 | fsh_29770 290 | fsh_29774 291 | AGGRESSIVEVOICEDAILY_20050109.1627 292 | CNN_ENG_20030529_085826.10 293 | CNN_IP_20030402.1600.02-1 294 | soc.culture.iraq_20050211.0445 295 | CNN_IP_20030412.1600.05 296 | CNN_ENG_20030429_083016.5 297 | misc.taxes_20050218.1250 298 | fsh_29350 299 | AGGRESSIVEVOICEDAILY_20041203.1959 300 | MARKETVIEW_20050204.1736 301 | CNN_ENG_20030403_060032.0 302 | fsh_29520 303 | fsh_29195 304 | CNNHL_ENG_20030425_183518.12 305 | CNNHL_ENG_20030403_133453.21 306 | CNNHL_ENG_20030618_230303.36 307 | CNN_IP_20030403.1600.00-1 308 | CNN_ENG_20030525_143522.8 309 | MARKBACKER_20041220.0919 310 | CNN_ENG_20030610_133041.17 311 | fsh_29097 312 | XIN_ENG_20030327.0202 313 | CNNHL_ENG_20030624_230338.34 314 | CNN_IP_20030329.1600.02 315 | CNN_IP_20030408.1600.03 316 | Austin-Grad-Community_20050212.2454 317 | APW_ENG_20030406.0191 318 | soc.history.war.world-war-ii_20050127.2403 319 | CNN_IP_20030409.1600.04 320 | CNN_ENG_20030528_082823.9 321 | CNN_ENG_20030506_053020.14 322 | CNN_ENG_20030507_160538.15 323 | alt.collecting.autographs_20050224.2438 324 | CNN_IP_20030402.1600.00-2 325 | APW_ENG_20030418.0084 326 | CNNHL_ENG_20030513_220910.11 327 | CNN_IP_20030405.1600.01-3 328 | XIN_ENG_20030610.0299 329 | CNN_ENG_20030424_073006.4 330 | BACONSREBELLION_20050205.1919 331 | MARKBACKER_20041119.1002 332 | 
CNN_LE_20030504.1200.02-1 333 | CNN_ENG_20030409_180633.8 334 | misc.survivalism_20050210.0232 335 | MARKETVIEW_20050207.0746 336 | CNN_ENG_20030306_070606.18 337 | alt.politics_20050124.0640 338 | seattle.politics_20050122.2412 339 | BACONSREBELLION_20050217.0744 340 | AGGRESSIVEVOICEDAILY_20041218.1004 341 | MARKETVIEW_20050205.1358 342 | CNN_ENG_20030505_090022.1 343 | CNN_CF_20030304.1900.01 344 | BACONSREBELLION_20050209.0721 345 | CNN_ENG_20030610_130042.17 346 | aus.cars_20041206.0903 347 | CNN_IP_20030405.1600.00-3 348 | APW_ENG_20030513.0139 349 | MARKETVIEW_20050105.1901 350 | CNN_LE_20030504.1200.01 351 | fsh_29121 352 | MARKETVIEW_20050222.1919 353 | rec.music.makers.guitar.acoustic_20041228.1628 354 | CNN_IP_20030330.1600.05-2 355 | fsh_29187 356 | CNNHL_ENG_20030505_220734.25 357 | CNN_ENG_20030418_063040.1 358 | CNN_ENG_20030605_065831.18 359 | CNN_ENG_20030614_173123.4 360 | OIADVANTAGE_20050203.2102 361 | CNN_ENG_20030611_102832.4 362 | XIN_ENG_20030513.0002 363 | CNN_ENG_20030418_163834.14 364 | fsh_29521 365 | CNN_IP_20030329.1600.00-5 366 | CNNHL_ENG_20030402_193443.5 367 | CNN_ENG_20030624_153103.16 368 | fsh_29783 369 | CNN_ENG_20030602_105829.2 370 | CNN_ENG_20030408_123613.0 371 | APW_ENG_20030404.0439 372 | CNN_ENG_20030516_090022.7 373 | CNN_IP_20030422.1600.05 374 | AGGRESSIVEVOICEDAILY_20041101.1144 375 | BACONSREBELLION_20050218.1214 376 | CNN_ENG_20030509_123601.13 377 | AFP_ENG_20030323.0020 378 | CNN_IP_20030405.1600.00-2 379 | MARKETVIEW_20050126.0711 380 | MARKETVIEW_20041215.2128 381 | AGGRESSIVEVOICEDAILY_20050114.1922 382 | CNN_ENG_20030625_220123.3 383 | CNN_ENG_20030418_130831.5 384 | CNNHL_ENG_20030331_193419.9 385 | CNN_IP_20030402.1600.00-3 386 | CNN_CF_20030305.1900.00-2 387 | CNN_ENG_20030428_173654.13 388 | fsh_29581_1 389 | CNN_ENG_20030526_183538.3 390 | CNN_ENG_20030624_140104.22 391 | CNN_ENG_20030415_103039.0 392 | APW_ENG_20030407.0030 393 | CNN_IP_20030405.1600.02 394 | CNN_ENG_20030501_160459.0 395 | 
MARKETVIEW_20050127.0716 396 | CNN_ENG_20030411_233701.11 397 | XIN_ENG_20030423.0011 398 | fsh_29348 399 | MARKETVIEW_20041220.1537 400 | CNN_CF_20030305.1900.06-2 401 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_001: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030605_065831.18 2 | APW_ENG_20030510.0228 3 | FLOPPINGACES_20050101.2244.048 4 | CNN_ENG_20030403_183513.1 5 | CNN_ENG_20030428_130651.4 6 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_002: -------------------------------------------------------------------------------- 1 | APW_ENG_20030408.0090 2 | CNN_ENG_20030605_065831.18 3 | CNN_ENG_20030513_160506.16 4 | CNNHL_ENG_20030609_133335.37 5 | CNN_ENG_20030401_233449.5 6 | CNN_ENG_20030421_120508.17 7 | APW_ENG_20030619.0383 8 | CNN_IP_20030329.1600.01-3 9 | MARKBACKER_20041217.1639 10 | AGGRESSIVEVOICEDAILY_20050116.2149 11 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_003: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030508_170552.18 2 | APW_ENG_20030410.0906 3 | CNN_ENG_20030418_063040.1 4 | CNN_ENG_20030424_073006.4 5 | CNN_ENG_20030617_193116.10 6 | CNN_ENG_20030306_070606.18 7 | MARKBACKER_20041206.0733 8 | CNN_ENG_20030612_173004.2 9 | MARKETVIEW_20050204.1736 10 | CNN_CF_20030303.1900.00 11 | CNN_ENG_20030421_120508.17 12 | MARKETVIEW_20041215.2128 13 | rec.music.phish_20041215.1554 14 | AGGRESSIVEVOICEDAILY_20041208.2133 15 | APW_ENG_20030416.0581 16 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_005: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030506_053020.14 2 | 
CNN_ENG_20030624_065843.24 3 | MARKETVIEW_20050226.1307 4 | APW_ENG_20030610.0010 5 | CNN_ENG_20030403_183513.1 6 | MARKETVIEW_20050225.0541 7 | soc.culture.china_20050203.0639 8 | CNN_ENG_20030426_160621.0 9 | MARKETVIEW_20050212.1607 10 | AGGRESSIVEVOICEDAILY_20050205.1954 11 | MARKBACKER_20041117.1107 12 | CNN_ENG_20030610_133041.17 13 | CNN_ENG_20030513_113501.6 14 | XIN_ENG_20030415.0379 15 | fsh_29786 16 | alt.obituaries_20041121.1339 17 | CNN_ENG_20030508_170552.18 18 | APW_ENG_20030327.0376 19 | APW_ENG_20030424.0698 20 | XIN_ENG_20030327.0202 21 | CNNHL_ENG_20030513_183907.5 22 | CNN_IP_20030405.1600.01-3 23 | MARKETVIEW_20050222.1919 24 | CNNHL_ENG_20030610_133347.6 25 | CNNHL_ENG_20030611_133445.24 26 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_010: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030422_083005.10 2 | FLOPPINGACES_20041113.1528.042 3 | BACONSREBELLION_20050205.1919 4 | MARKETVIEW_20050105.1901 5 | MARKBACKER_20041112.0707 6 | AGGRESSIVEVOICEDAILY_20050224.2252 7 | CNN_ENG_20030424_113549.11 8 | CNN_ENG_20030318_140851.8 9 | CNN_ENG_20030502_080020.7 10 | APW_ENG_20030527.0232 11 | CNN_ENG_20030509_090025.5 12 | MARKETVIEW_20050208.2059 13 | FLOPPINGACES_20041116.0833.027 14 | soc.culture.jewish_20050130.2105 15 | CNN_ENG_20030508_170552.18 16 | APW_ENG_20030520.0757 17 | CNNHL_ENG_20030416_193742.7 18 | FLOPPINGACES_20041114.1240.039 19 | CNNHL_ENG_20030416_133739.9 20 | CNNHL_ENG_20030429_220618.15 21 | XIN_ENG_20030314.0208 22 | rec.boats_20050130.1006 23 | CNN_ENG_20030530_130025.12 24 | rec.parks.theme_20050217.2019 25 | fsh_29505 26 | BACONSREBELLION_20050218.0848 27 | CNN_ENG_20030312_223733.14 28 | XIN_ENG_20030624.0085 29 | CNN_ENG_20030626_203133.11 30 | CNNHL_ENG_20030505_220734.25 31 | CNNHL_ENG_20030331_193419.9 32 | CNNHL_ENG_20030616_230155.28 33 | CNN_IP_20030409.1600.04 34 | 
misc.kids.pregnancy_20050120.0404 35 | XIN_ENG_20030523.0202 36 | CNN_ENG_20030416_160804.4 37 | CNN_ENG_20030426_160621.0 38 | BACONSREBELLION_20050206.1345 39 | fsh_29171 40 | MARKBACKER_20041128.1641 41 | MARKBACKER_20041119.1002 42 | CNN_ENG_20030610_123040.9 43 | CNN_IP_20030330.1600.05-2 44 | rec.travel.europe_20050101.1800 45 | CNNHL_ENG_20030618_230303.36 46 | fsh_29105 47 | fsh_29303 48 | CNN_ENG_20030327_163556.20 49 | CNN_IP_20030328.1600.07 50 | CNN_ENG_20030501_063017.15 51 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_020: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030529_085826.10 2 | APW_ENG_20030610.0010 3 | CNNHL_ENG_20030610_230438.14 4 | CNN_CF_20030304.1900.02 5 | CNN_ENG_20030403_183513.1 6 | fsh_29344 7 | CNN_ENG_20030306_070606.18 8 | fsh_29226 9 | CNN_ENG_20030428_130651.4 10 | CNN_ENG_20030318_140851.8 11 | fsh_29581_1 12 | CNN_IP_20030329.1600.00-6 13 | CNN_IP_20030402.1600.02-1 14 | CNN_ENG_20030528_172957.18 15 | MARKETVIEW_20050127.0716 16 | CNN_ENG_20030403_060032.0 17 | FLOPPINGACES_20041230.1844.003 18 | MARKETVIEW_20041220.1537 19 | CNN_ENG_20030424_113549.11 20 | MARKETVIEW_20050208.2033 21 | CNN_ENG_20030603_133025.7 22 | MARKETVIEW_20050205.1358 23 | CNN_IP_20030417.1600.06 24 | fsh_29774 25 | APW_ENG_20030519.0548 26 | MARKETVIEW_20050210.2138 27 | rec.boats_20050130.1006 28 | CNN_ENG_20030408_153616.9 29 | CNN_ENG_20030515_073019.7 30 | CNN_ENG_20030320_153434.7 31 | MARKBACKER_20050217.0647 32 | AGGRESSIVEVOICEDAILY_20050114.1922 33 | CNN_ENG_20030408_200618.14 34 | CNN_ENG_20030627_130145.6 35 | CNN_ENG_20030621_115841.16 36 | misc.legal.moderated_20041202.1648 37 | BACONSREBELLION_20050210.0728 38 | CNN_ENG_20030617_112838.4 39 | BACONSREBELLION_20050205.1919 40 | fsh_29786 41 | CNN_IP_20030330.1600.06 42 | MARKBACKER_20050103.0829 43 | soc.history.what-if_20050129.1404 44 | fsh_29326 45 | 
CNN_ENG_20030529_130011.6 46 | CNN_IP_20030405.1600.01-3 47 | CNN_ENG_20030514_130518.5 48 | CNNHL_ENG_20030604_230238.5 49 | CNN_ENG_20030429_083016.5 50 | CNN_ENG_20030424_183556.7 51 | APW_ENG_20030326.0190 52 | BACONSREBELLION_20050125.1108 53 | CNN_ENG_20030407_170605.7 54 | CNN_IP_20030329.1600.00-3 55 | XIN_ENG_20030609.0118 56 | CNNHL_ENG_20030402_133449.22 57 | CNN_IP_20030402.1600.00-3 58 | CNN_ENG_20030625_210122.0 59 | CNN_ENG_20030430_160723.6 60 | soc.culture.iraq_20050211.0445 61 | FLOPPINGACES_20041113.1528.042 62 | CNN_ENG_20030625_220123.3 63 | MARKBACKER_20041216.0656 64 | CNN_ENG_20030528_082823.9 65 | FLOPPINGACES_20041114.1240.039 66 | CNN_CF_20030304.1900.06-2 67 | APW_ENG_20030406.0191 68 | CNNHL_ENG_20030526_221156.39 69 | CNN_ENG_20030331_123648.4 70 | AGGRESSIVEVOICEDAILY_20050107.2012 71 | CNN_LE_20030504.1200.02-2 72 | MARKETVIEW_20050204.1337 73 | soc.culture.indian_20041104.2348 74 | MARKBACKER_20041117.1107 75 | APW_ENG_20030403.0862 76 | AGGRESSIVEVOICEDAILY_20050116.2149 77 | CNN_ENG_20030525_143522.8 78 | XIN_ENG_20030513.0002 79 | NYT_ENG_20030602.0074 80 | CNNHL_ENG_20030425_183518.12 81 | fsh_29586 82 | rec.travel.usa-canada_20050128.0121 83 | CNN_ENG_20030429_110706.7 84 | CNNHL_ENG_20030513_220910.11 85 | CNN_IP_20030330.1600.05-2 86 | BACONSREBELLION_20050209.0721 87 | FLOPPINGACES_20050101.2244.048 88 | CNN_ENG_20030430_063016.14 89 | AGGRESSIVEVOICEDAILY_20050203.1356 90 | CNN_ENG_20030401_233449.5 91 | MARKETVIEW_20050212.1717 92 | MARKBACKER_20041128.1641 93 | MARKBACKER_20041103.1300 94 | MARKETVIEW_20041212.1447 95 | CNN_CF_20030305.1900.06-1 96 | CNN_ENG_20030509_123601.13 97 | MARKETVIEW_20050208.2059 98 | CNN_ENG_20030630_075848.7 99 | AGGRESSIVEVOICEDAILY_20041101.1144 100 | CNN_ENG_20030401_073033.14 101 | CNN_ENG_20030506_053020.14 102 | OIADVANTAGE_20050204.1155 103 | CNN_ENG_20030429_190711.14 104 | XIN_ENG_20030523.0202 105 | APW_ENG_20030520.0081 106 | CNN_ENG_20030312_223733.14 107 | 
CNN_ENG_20030617_173115.22 108 | CNN_CF_20030303.1900.05 109 | fsh_29192 110 | CNN_IP_20030405.1600.02 111 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_030: -------------------------------------------------------------------------------- 1 | CNN_IP_20030405.1600.00-2 2 | BACONSREBELLION_20050204.1326 3 | CNN_ENG_20030501_063017.15 4 | CNN_ENG_20030501_160459.0 5 | fsh_29350 6 | CNN_ENG_20030610_095857.4 7 | fsh_29388 8 | CNN_ENG_20030626_193133.8 9 | CNN_IP_20030404.1600.00-2 10 | fsh_29139 11 | soc.culture.jewish_20050130.2105 12 | CNN_ENG_20030605_193002.8 13 | CNN_LE_20030504.1200.02-1 14 | MARKETVIEW_20050212.1607 15 | misc.invest.marketplace_20050208.2406 16 | APW_ENG_20030519.0367 17 | CNNHL_ENG_20030610_230438.14 18 | CNN_ENG_20030527_195948.3 19 | XIN_ENG_20030509.0137 20 | rec.arts.sf.written.robert-jordan_20050208.1350 21 | CNN_ENG_20030527_215946.12 22 | rec.boats_20050130.1006 23 | fsh_29592 24 | CNNHL_ENG_20030526_221156.39 25 | MARKBACKER_20041119.1002 26 | CNN_IP_20030330.1600.06 27 | CNN_ENG_20030516_090022.7 28 | CNN_IP_20030405.1600.01-2 29 | AGGRESSIVEVOICEDAILY_20041223.1449 30 | APW_ENG_20030327.0376 31 | MARKETVIEW_20050127.0716 32 | CNN_ENG_20030408_083034.11 33 | CNN_CF_20030305.1900.00-2 34 | CNNHL_ENG_20030604_230238.5 35 | misc.legal.moderated_20050129.2225 36 | OIADVANTAGE_20050203.2102 37 | NYT_ENG_20030630.0079 38 | fsh_29628 39 | CNN_ENG_20030617_112838.4 40 | AGGRESSIVEVOICEDAILY_20050224.1207 41 | alt.sys.pc-clone.dell_20050226.2350 42 | CNN_ENG_20030423_180539.2 43 | CNN_IP_20030408.1600.04 44 | BACONSREBELLION_20050226.1317 45 | fsh_29195 46 | APW_ENG_20030502.0470 47 | rec.travel.europe_20050101.1800 48 | AGGRESSIVEVOICEDAILY_20041226.1712 49 | BACONSREBELLION_20050222.1348 50 | CNN_ENG_20030403_060032.0 51 | MARKETVIEW_20041220.1537 52 | CNN_ENG_20030525_143522.8 53 | HEALINGIRAQ_20041108.1942.05 54 | GETTINGPOLITICAL_20050105.0127.001 55 | 
CNNHL_ENG_20030618_230303.6 56 | CNN_ENG_20030620_095840.4 57 | AFP_ENG_20030330.0211 58 | fsh_29105 59 | CNNHL_ENG_20030625_230351.4 60 | OIADVANTAGE_20050109.1947 61 | CNN_ENG_20030430_093016.0 62 | CNN_ENG_20030525_160525.13 63 | APW_ENG_20030422.0469 64 | CNN_ENG_20030612_173004.2 65 | CNN_ENG_20030408_200618.14 66 | CNN_ENG_20030416_160804.4 67 | CNN_ENG_20030627_065846.3 68 | MARKETVIEW_20041209.1401 69 | CNN_ENG_20030528_172957.18 70 | rec.travel.cruises_20050216.1636 71 | CNN_ENG_20030428_193655.2 72 | MARKBACKER_20041217.1639 73 | CNN_IP_20030403.1600.00-1 74 | CNN_IP_20030402.1600.02-2 75 | CNN_ENG_20030421_120508.17 76 | fsh_29192 77 | XIN_ENG_20030324.0191 78 | rec.sport.disc_20050209.2202 79 | CNN_CF_20030305.1900.06-2 80 | CNN_ENG_20030526_133535.4 81 | fsh_29581_1 82 | APW_ENG_20030406.0191 83 | CNN_ENG_20030619_115954.4 84 | MARKBACKER_20050105.1632 85 | BACONSREBELLION_20050216.1618 86 | fsh_29505 87 | CNN_IP_20030330.1600.05-2 88 | AGGRESSIVEVOICEDAILY_20041208.2133 89 | CNN_ENG_20030415_183752.14 90 | OIADVANTAGE_20050110.1009 91 | MARKETVIEW_20050226.1444 92 | BACONSREBELLION_20050218.0848 93 | AGGRESSIVEVOICEDAILY_20041215.2302 94 | CNNHL_ENG_20030416_133739.9 95 | CNN_ENG_20030603_133025.7 96 | XIN_ENG_20030425.0184 97 | MARKETVIEW_20041211.1845 98 | APW_ENG_20030415.0742 99 | APW_ENG_20030519.0548 100 | CNN_ENG_20030312_083725.3 101 | AGGRESSIVEVOICEDAILY_20050114.1922 102 | APW_ENG_20030619.0383 103 | CNN_ENG_20030306_070606.18 104 | CNN_IP_20030405.1600.02 105 | CNN_ENG_20030507_160538.15 106 | MARKBACKER_20041216.0656 107 | fsh_29187 108 | MARKETVIEW_20050206.1951 109 | fsh_29520 110 | CNN_IP_20030404.1600.00-1 111 | CNNHL_ENG_20030416_193742.7 112 | alt.gossip.celebrities_20050218.0826 113 | CNN_ENG_20030528_195959.20 114 | BACONSREBELLION_20050127.1017 115 | CNN_ENG_20030403_080032.9 116 | CNN_CF_20030305.1900.06-1 117 | CNN_ENG_20030507_170539.0 118 | CNN_IP_20030329.1600.01-1 119 | alt.politics.economics_20041206.1835 120 | 
CNN_ENG_20030605_065831.18 121 | AGGRESSIVEVOICEDAILY_20041201.2313 122 | CNNHL_ENG_20030407_193547.5 123 | CNNHL_ENG_20030411_230640.38 124 | CNN_ENG_20030429_110706.7 125 | APW_ENG_20030412.0531 126 | CNN_ENG_20030619_115954.10 127 | fsh_29121 128 | CNNHL_ENG_20030611_133445.24 129 | APW_ENG_20030610.0010 130 | CNN_ENG_20030305_170125.1 131 | MARKETVIEW_20050126.0711 132 | BACONSREBELLION_20050222.0817 133 | CNNHL_ENG_20030624_230338.34 134 | MARKBACKER_20041117.0723 135 | CNN_CF_20030304.1900.04 136 | CNN_ENG_20030618_065839.11 137 | CNN_CF_20030305.1900.00-1 138 | BACONSREBELLION_20050123.1639 139 | CNN_ENG_20030610_130042.17 140 | MARKBACKER_20050217.0647 141 | XIN_ENG_20030513.0002 142 | fsh_29344 143 | CNN_ENG_20030507_060023.1 144 | CNN_ENG_20030331_123648.4 145 | CNNHL_ENG_20030402_133449.22 146 | CNN_ENG_20030408_153616.9 147 | CNN_ENG_20030508_170552.18 148 | MARKETVIEW_20050210.2138 149 | CNN_ENG_20030513_160506.16 150 | APW_ENG_20030326.0190 151 | MARKBACKER_20041220.0919 152 | XIN_ENG_20030624.0085 153 | CNN_ENG_20030528_082823.9 154 | misc.legal.moderated_20041202.1648 155 | MARKETVIEW_20050204.1322 156 | CNN_ENG_20030618_193127.17 157 | APW_ENG_20030409.0013 158 | CNN_ENG_20030618_150128.5 159 | CNN_ENG_20030605_223004.4 160 | rec.music.phish_20041215.1554 161 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_050: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030624_082841.12 2 | fsh_29581_1 3 | APW_ENG_20030602.0037 4 | APW_ENG_20030326.0190 5 | CNN_ENG_20030610_105832.1 6 | AGGRESSIVEVOICEDAILY_20041101.1806 7 | MARKBACKER_20041119.1002 8 | soc.culture.jewish_20050130.2105 9 | MARKETVIEW_20050127.0716 10 | MARKBACKER_20041202.0711 11 | CNN_IP_20030402.1600.00-3 12 | CNN_CF_20030303.1900.06-2 13 | CNN_ENG_20030621_160254.25 14 | CNN_ENG_20030526_133535.4 15 | MARKETVIEW_20050120.1641 16 | NYT_ENG_20030630.0079 17 | 
fsh_29344 18 | CNN_ENG_20030312_083725.3 19 | fsh_29302 20 | alt.obituaries_20041121.1339 21 | fsh_29395 22 | CNN_ENG_20030604_092828.7 23 | AGGRESSIVEVOICEDAILY_20041223.1449 24 | APW_ENG_20030502.0686 25 | CNN_ENG_20030607_170312.6 26 | fsh_29770 27 | MARKETVIEW_20041213.0722 28 | CNN_ENG_20030329_170349.7 29 | CNNHL_ENG_20030624_133331.33 30 | rec.boats_20050130.1006 31 | AGGRESSIVEVOICEDAILY_20050105.1344 32 | CNN_IP_20030402.1600.02-1 33 | CNNHL_ENG_20030505_220734.25 34 | CNN_ENG_20030402_190500.11 35 | AGGRESSIVEVOICEDAILY_20050224.2252 36 | AGGRESSIVEVOICEDAILY_20050107.2012 37 | CNN_ENG_20030415_103039.0 38 | CNN_ENG_20030507_170539.0 39 | MARKETVIEW_20050204.1337 40 | CNN_ENG_20030617_193116.10 41 | CNNHL_ENG_20030403_133453.21 42 | CNN_ENG_20030625_220123.3 43 | CNN_ENG_20030416_190806.4 44 | misc.legal.moderated_20050129.2225 45 | CNN_IP_20030402.1600.02-2 46 | CNN_CF_20030304.1900.06-2 47 | CNN_ENG_20030407_130604.10 48 | CNN_IP_20030402.1600.00-4 49 | CNN_ENG_20030514_130518.5 50 | CNN_ENG_20030618_150128.5 51 | APW_ENG_20030415.0742 52 | AGGRESSIVEVOICEDAILY_20041215.2302 53 | CNN_ENG_20030409_180633.8 54 | CNN_CF_20030304.1900.01 55 | CNN_ENG_20030626_193133.8 56 | CNN_ENG_20030424_183556.7 57 | FLOPPINGACES_20041230.1844.003 58 | CNN_ENG_20030624_065843.24 59 | AFP_ENG_20030323.0020 60 | CNN_ENG_20030622_173306.9 61 | APW_ENG_20030513.0139 62 | CNN_ENG_20030602_102826.13 63 | CNN_ENG_20030605_223004.4 64 | CNN_ENG_20030430_160723.6 65 | CNN_IP_20030405.1600.01-2 66 | CNN_IP_20030329.1600.00-3 67 | rec.travel.cruises_20050222.0313 68 | MARKETVIEW_20050207.0746 69 | CNN_ENG_20030428_130651.4 70 | soc.culture.indian_20041104.2348 71 | CNN_ENG_20030425_063006.5 72 | fsh_29505 73 | CNN_ENG_20030619_115954.10 74 | fsh_29336 75 | FLOPPINGACES_20041115.1613.032 76 | CNN_ENG_20030404_163526.10 77 | CNN_ENG_20030617_105836.4 78 | fsh_29195 79 | CNNHL_ENG_20030403_193455.30 80 | CNN_ENG_20030617_065838.21 81 | APW_ENG_20030520.0757 82 | 
soc.culture.china_20050203.0639 83 | rec.music.makers.guitar.acoustic_20041228.1628 84 | CNN_ENG_20030509_123601.13 85 | MARKBACKER_20041128.1641 86 | OIADVANTAGE_20050108.1323 87 | CNN_ENG_20030516_123543.8 88 | APW_ENG_20030412.0531 89 | CNN_ENG_20030426_160621.0 90 | CNN_ENG_20030610_085833.10 91 | CNNHL_ENG_20030312_150218.13 92 | MARKETVIEW_20050216.2120 93 | CNN_ENG_20030306_070606.18 94 | CNNHL_ENG_20030625_230351.4 95 | CNN_LE_20030504.1200.02-1 96 | BACONSREBELLION_20050127.1017 97 | CNN_LE_20030504.1200.01 98 | MARKETVIEW_20050105.1901 99 | CNN_ENG_20030306_083604.6 100 | MARKETVIEW_20041217.0801 101 | rec.music.phish_20041215.1554 102 | alt.atheism_20041104.2428 103 | XIN_ENG_20030624.0085 104 | MARKETVIEW_20050208.2059 105 | CNN_ENG_20030515_073019.7 106 | APW_ENG_20030423.0079 107 | CNN_ENG_20030403_183513.1 108 | fsh_29141 109 | CNN_IP_20030408.1600.04 110 | fsh_29774 111 | APW_ENG_20030408.0090 112 | MARKBACKER_20041103.1300 113 | AGGRESSIVEVOICEDAILY_20050109.1627 114 | XIN_ENG_20030423.0011 115 | AGGRESSIVEVOICEDAILY_20050116.2149 116 | CNN_ENG_20030331_193655.14 117 | CNN_CF_20030304.1900.02 118 | CNNHL_ENG_20030430_220712.37 119 | CNN_ENG_20030603_095830.17 120 | misc.kids.pregnancy_20050120.0404 121 | aus.cars_20041206.0903 122 | alt.corel_20041228.0503 123 | fsh_29139 124 | CNN_ENG_20030422_083005.10 125 | CNN_ENG_20030408_153616.9 126 | OIADVANTAGE_20050109.1947 127 | soc.culture.iraq_20050211.0445 128 | alt.books.tom-clancy_20050130.1848 129 | APW_ENG_20030327.0376 130 | CNN_ENG_20030424_073006.4 131 | APW_ENG_20030422.0469 132 | CNN_ENG_20030418_083040.11 133 | CNNHL_ENG_20030624_230338.34 134 | alt.collecting.autographs_20050224.2438 135 | MARKBACKER_20050105.1632 136 | MARKETVIEW_20041212.1447 137 | APW_ENG_20030424.0698 138 | CNN_IP_20030404.1600.00-2 139 | CNNHL_ENG_20030611_133445.24 140 | AGGRESSIVEVOICEDAILY_20041226.1712 141 | XIN_ENG_20030425.0184 142 | CNN_IP_20030409.1600.04 143 | CNNHL_ENG_20030425_183518.12 144 | 
CNN_ENG_20030403_090032.1 145 | AGGRESSIVEVOICEDAILY_20050114.1922 146 | CNNHL_ENG_20030513_183907.5 147 | CNN_ENG_20030417_063039.0 148 | CNN_ENG_20030429_190711.14 149 | fsh_29622 150 | alt.politics_20050124.0640 151 | CNN_LE_20030504.1200.02-2 152 | MARKETVIEW_20050208.2033 153 | CNN_IP_20030329.1600.01-3 154 | CNN_CF_20030303.1900.00 155 | CNN_IP_20030329.1600.00-6 156 | BACONSREBELLION_20050216.1618 157 | CNNHL_ENG_20030415_193729.5 158 | BACONSREBELLION_20050218.0848 159 | CNN_IP_20030406.1600.03 160 | CNN_ENG_20030403_080032.9 161 | CNN_ENG_20030602_133012.9 162 | APW_ENG_20030610.0010 163 | CNN_IP_20030405.1600.00-3 164 | BACONSREBELLION_20050222.1348 165 | CNN_ENG_20030527_215946.12 166 | MARKETVIEW_20050201.0748 167 | NYT_ENG_20030602.0074 168 | soc.culture.hmong_20050210.1130 169 | fsh_29350 170 | MARKETVIEW_20050206.1951 171 | fsh_29226 172 | MARKBACKER_20050103.0829 173 | CNN_ENG_20030612_173004.10 174 | BACONSREBELLION_20050125.1108 175 | CNN_ENG_20030421_133510.6 176 | MARKETVIEW_20050226.1444 177 | CNN_ENG_20030614_173123.4 178 | CNN_ENG_20030430_063016.14 179 | MARKETVIEW_20041209.1401 180 | AGGRESSIVEVOICEDAILY_20041218.1004 181 | CNN_ENG_20030612_072835.2 182 | CNN_ENG_20030616_130059.25 183 | AGGRESSIVEVOICEDAILY_20050125.0136 184 | OIADVANTAGE_20041224.1007 185 | CNN_ENG_20030617_173115.22 186 | AGGRESSIVEVOICEDAILY_20041208.2133 187 | APW_ENG_20030404.0439 188 | fsh_29138 189 | CNN_ENG_20030525_160525.13 190 | AGGRESSIVEVOICEDAILY_20050124.1354 191 | CNNHL_ENG_20030513_220910.11 192 | Austin-Grad-Community_20050212.2454 193 | APW_ENG_20030410.0906 194 | CNN_ENG_20030416_100042.7 195 | CNN_ENG_20030403_180511.16 196 | alt.gossip.celebrities_20050218.0826 197 | FLOPPINGACES_20041116.0833.027 198 | APW_ENG_20030519.0548 199 | CNN_IP_20030329.1600.01-1 200 | CNN_IP_20030330.1600.06 201 | BACONSREBELLION_20050214.0944 202 | CNN_ENG_20030621_115841.16 203 | fsh_29272 204 | CNN_ENG_20030418_130831.5 205 | fsh_29388 206 | CNN_ENG_20030528_195959.20 
207 | CNN_ENG_20030513_160506.16 208 | GETTINGPOLITICAL_20050105.0127.001 209 | CNN_IP_20030403.1600.00-1 210 | CNN_ENG_20030525_143522.8 211 | APW_ENG_20030416.0581 212 | AGGRESSIVEVOICEDAILY_20050208.1142 213 | XIN_ENG_20030408.0341 214 | CNN_IP_20030404.1600.00-1 215 | alt.gossip.celebrities_20041118.2331 216 | CNN_IP_20030403.1600.00-2 217 | AGGRESSIVEVOICEDAILY_20050205.1954 218 | MARKBACKER_20041220.0919 219 | CNN_ENG_20030605_085831.13 220 | AGGRESSIVEVOICEDAILY_20050203.1356 221 | CNN_ENG_20030428_193655.2 222 | CNN_ENG_20030430_093016.0 223 | CNN_ENG_20030506_160524.18 224 | OIADVANTAGE_20050105.0922 225 | CNN_ENG_20030415_180754.5 226 | CNN_ENG_20030507_160538.15 227 | CNN_CF_20030305.1900.06-2 228 | CNN_ENG_20030312_223733.14 229 | fsh_29520 230 | CNN_IP_20030403.1600.00-3 231 | MARKBACKER_20041112.0707 232 | fsh_29786 233 | CNN_ENG_20030411_193701.3 234 | CNN_ENG_20030417_073039.2 235 | AGGRESSIVEVOICEDAILY_20050106.1310 236 | fsh_29121 237 | BACONSREBELLION_20050206.1345 238 | CNN_IP_20030405.1600.02 239 | XIN_ENG_20030523.0202 240 | CNN_IP_20030410.1600.03-2 241 | MARKBACKER_20041216.0656 242 | fsh_29187 243 | CNN_ENG_20030506_163523.22 244 | soc.history.war.world-war-ii_20050127.2403 245 | APW_ENG_20030508.0772 246 | rec.sport.disc_20050209.2202 247 | CNN_ENG_20030620_170011.14 248 | OIADVANTAGE_20050204.1155 249 | CNN_ENG_20030421_120508.13 250 | CNNHL_ENG_20030609_133335.37 251 | CNN_ENG_20030529_130011.6 252 | CNN_ENG_20030528_172957.18 253 | fsh_29782_2 254 | CNN_ENG_20030325_220534.6 255 | MARKETVIEW_20050215.1858 256 | CNN_ENG_20030605_065831.18 257 | CNN_ENG_20030513_080020.2 258 | CNN_ENG_20030624_153103.17 259 | BACONSREBELLION_20050216.1632 260 | CNNHL_ENG_20030429_220618.15 261 | -------------------------------------------------------------------------------- /resource/low_resource_split/ace05ep/doc_list_075: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030604_092828.7 2 | 
APW_ENG_20030502.0686 3 | MARKETVIEW_20050208.2059 4 | CNN_IP_20030329.1600.00-6 5 | CNN_ENG_20030610_105832.1 6 | CNN_ENG_20030407_130604.10 7 | CNNHL_ENG_20030430_220712.37 8 | CNN_IP_20030410.1600.03-1 9 | rec.travel.cruises_20050216.1636 10 | MARKETVIEW_20050222.0729 11 | CNN_ENG_20030607_170312.6 12 | MARKETVIEW_20050217.2115 13 | BACONSREBELLION_20050206.1345 14 | CNN_IP_20030405.1600.01-2 15 | MARKETVIEW_20050201.0748 16 | OIADVANTAGE_20041224.1007 17 | rec.sport.disc_20050209.2202 18 | MARKBACKER_20050105.1526 19 | alt.support.divorce_20050113.2451 20 | CNN_ENG_20030320_153434.7 21 | XIN_ENG_20030609.0118 22 | CNN_ENG_20030612_173004.10 23 | CNN_ENG_20030401_233449.5 24 | APW_ENG_20030422.0469 25 | CNN_ENG_20030612_173004.2 26 | rec.music.phish_20041215.1554 27 | CNN_ENG_20030630_085848.18 28 | CNN_IP_20030414.1600.04 29 | CNN_ENG_20030513_080020.2 30 | fsh_29586 31 | CNN_ENG_20030621_115841.16 32 | APW_ENG_20030519.0367 33 | CNN_ENG_20030305_170125.1 34 | BACONSREBELLION_20050216.1536 35 | CNN_ENG_20030515_063019.6 36 | CNN_IP_20030403.1600.00-3 37 | CNN_ENG_20030605_105831.11 38 | AGGRESSIVEVOICEDAILY_20050113.1400 39 | CNN_ENG_20030530_130025.12 40 | APW_ENG_20030502.0470 41 | CNN_ENG_20030407_080037.12 42 | AGGRESSIVEVOICEDAILY_20050213.2123 43 | CNN_ENG_20030516_123543.8 44 | CNNHL_ENG_20030416_193742.26 45 | fsh_29141 46 | BACONSREBELLION_20050222.1348 47 | CNN_ENG_20030421_090007.11 48 | CNN_ENG_20030408_153616.9 49 | fsh_29171 50 | CNN_ENG_20030602_133012.9 51 | CNN_ENG_20030627_130145.6 52 | alt.corel_20041228.0503 53 | BACONSREBELLION_20050218.0848 54 | OIADVANTAGE_20050110.1009 55 | CNN_ENG_20030421_120508.17 56 | CNN_IP_20030329.1600.01-3 57 | OIADVANTAGE_20050109.1947 58 | CNNHL_ENG_20030428_123600.14 59 | APW_ENG_20030424.0698 60 | CNN_ENG_20030624_082841.12 61 | CNN_ENG_20030607_173310.4 62 | CNN_ENG_20030408_200618.14 63 | CNNHL_ENG_20030416_193742.7 64 | CNN_ENG_20030404_163526.10 65 | APW_ENG_20030510.0228 66 | 
alt.books.tom-clancy_20050130.1848 67 | CNNHL_ENG_20030604_230238.5 68 | CNN_CF_20030303.1900.06-2 69 | MARKETVIEW_20050210.2138 70 | fsh_29628 71 | CNN_ENG_20030619_115954.4 72 | APW_ENG_20030403.0862 73 | OIADVANTAGE_20050203.1000 74 | MARKETVIEW_20041212.1447 75 | CNN_ENG_20030430_063016.14 76 | CNN_ENG_20030624_065843.24 77 | BACONSREBELLION_20050227.1238 78 | CNN_ENG_20030618_193127.17 79 | CNN_ENG_20030527_195948.3 80 | APW_ENG_20030423.0079 81 | CNNHL_ENG_20030402_133449.22 82 | CNN_ENG_20030513_113501.6 83 | Integritas-Group-Community-Forum_20050110.0557 84 | CNN_ENG_20030610_085833.10 85 | XIN_ENG_20030624.0085 86 | APW_ENG_20030414.0392 87 | CNN_ENG_20030605_153000.9 88 | CNNHL_ENG_20030416_133739.13 89 | fsh_29601 90 | MARKETVIEW_20041219.1509 91 | BACONSREBELLION_20050222.0817 92 | CNN_IP_20030417.1600.06 93 | CNN_IP_20030329.1600.00-3 94 | CNN_IP_20030403.1600.00-4 95 | alt.politics.economics_20041206.1835 96 | CNN_ENG_20030408_083034.11 97 | misc.legal.moderated_20050129.2225 98 | CNN_ENG_20030603_095830.17 99 | CNN_ENG_20030611_102832.3 100 | MARKETVIEW_20050228.2211 101 | CNN_ENG_20030506_160524.18 102 | CNN_ENG_20030403_180511.16 103 | rec.music.phish_20050217.1804 104 | BACONSREBELLION_20050125.1108 105 | CNN_ENG_20030509_090025.5 106 | CNN_ENG_20030602_102826.13 107 | CNN_ENG_20030617_065838.21 108 | misc.legal.moderated_20041202.1648 109 | CNN_ENG_20030502_093018.6 110 | AGGRESSIVEVOICEDAILY_20050124.1354 111 | CNN_ENG_20030416_100042.7 112 | rec.arts.sf.written.robert-jordan_20050208.1350 113 | BACONSREBELLION_20050214.0944 114 | CNNHL_ENG_20030415_193729.5 115 | CNNHL_ENG_20030603_230307.3 116 | AGGRESSIVEVOICEDAILY_20050203.1356 117 | CNN_ENG_20030612_072835.2 118 | MARKETVIEW_20050225.0541 119 | CNN_ENG_20030428_193655.2 120 | fsh_29526 121 | MARKBACKER_20050217.0647 122 | MARKETVIEW_20050204.1337 123 | MARKBACKER_20041108.1507 124 | CNN_LE_20030504.1200.02-2 125 | FLOPPINGACES_20041116.0833.027 126 | CNN_ENG_20030625_210122.0 127 | 
BACONSREBELLION_20050127.1017 128 | CNN_ENG_20030325_150531.10 129 | CNNHL_ENG_20030624_133331.33 130 | CNN_ENG_20030507_060023.1 131 | APW_ENG_20030619.0383 132 | CNN_ENG_20030512_190454.7 133 | MARKBACKER_20050105.1632 134 | MARKBACKER_20050103.0829 135 | CNN_ENG_20030411_070039.21 136 | FLOPPINGACES_20041113.1528.042 137 | CNN_ENG_20030407_170605.7 138 | CNN_ENG_20030626_193133.8 139 | APW_ENG_20030411.0304 140 | alt.atheism_20041104.2428 141 | FLOPPINGACES_20041115.1613.032 142 | APW_ENG_20030527.0232 143 | MARKETVIEW_20050212.1717 144 | CNN_ENG_20030610_123040.9 145 | CNN_ENG_20030429_190711.14 146 | fsh_29191 147 | AGGRESSIVEVOICEDAILY_20050116.2149 148 | APW_ENG_20030409.0013 149 | MARKETVIEW_20050212.1607 150 | MARKETVIEW_20050226.1444 151 | CNN_CF_20030305.1900.06-1 152 | CNN_IP_20030328.1600.07 153 | APW_ENG_20030326.0190 154 | CNN_ENG_20030414_130735.7 155 | MARKBACKER_20041217.1639 156 | CNN_ENG_20030612_160005.13 157 | CNN_ENG_20030329_170349.7 158 | MARKBACKER_20041117.1107 159 | talk.politics.misc_20050216.1337 160 | MARKETVIEW_20041209.1401 161 | APW_ENG_20030603.0303 162 | MARKETVIEW_20050215.1858 163 | XIN_ENG_20030324.0191 164 | misc.kids.pregnancy_20050120.0404 165 | MARKBACKER_20041202.0711 166 | OIADVANTAGE_20050105.0922 167 | CNNHL_ENG_20030625_193346.7 168 | CNN_ENG_20030602_072826.1 169 | AGGRESSIVEVOICEDAILY_20050224.1207 170 | CNN_ENG_20030506_163523.22 171 | rec.parks.theme_20050217.2019 172 | CNN_ENG_20030617_173115.14 173 | soc.culture.jewish_20050130.2105 174 | CNNHL_ENG_20030416_230741.33 175 | MARKETVIEW_20041211.1845 176 | CNN_ENG_20030525_160525.13 177 | MARKETVIEW_20050206.1951 178 | CNN_ENG_20030430_093016.0 179 | soc.culture.china_20050203.0639 180 | CNN_ENG_20030620_095840.4 181 | AGGRESSIVEVOICEDAILY_20050106.1310 182 | CNN_ENG_20030528_165958.16 183 | CNN_ENG_20030415_180754.5 184 | CNN_ENG_20030331_123648.4 185 | CNN_ENG_20030515_073019.7 186 | CNN_CF_20030305.1900.00-3 187 | MARKBACKER_20041112.0707 188 | 
APW_ENG_20030508.0772 189 | AGGRESSIVEVOICEDAILY_20041201.2313 190 | APW_ENG_20030412.0531 191 | MARKBACKER_20041103.1300 192 | AGGRESSIVEVOICEDAILY_20050107.2012 193 | CNN_ENG_20030327_163556.20 194 | CNN_ENG_20030306_083604.6 195 | FLOPPINGACES_20041228.0927.010 196 | FLOPPINGACES_20041114.1240.039 197 | AGGRESSIVEVOICEDAILY_20041215.2302 198 | CNN_ENG_20030619_115954.10 199 | fsh_29395 200 | CNN_CF_20030303.1900.05 201 | MARKETVIEW_20050214.2115 202 | CNNHL_ENG_20030609_133335.37 203 | CNN_ENG_20030325_220534.6 204 | CNN_ENG_20030508_210555.5 205 | fsh_29592 206 | APW_ENG_20030416.0581 207 | BACONSREBELLION_20050210.0728 208 | MARKETVIEW_20050226.1307 209 | MARKBACKER_20041216.0656 210 | fsh_29786 211 | MARKBACKER_20041128.1641 212 | soc.culture.indian_20041104.2348 213 | CNNHL_ENG_20030523_221118.14 214 | MARKETVIEW_20050216.2120 215 | uk.gay-lesbian-bi_20050127.0311 216 | CNNHL_ENG_20030312_150218.13 217 | CNNHL_ENG_20030513_183907.5 218 | CNN_ENG_20030429_170710.4 219 | APW_ENG_20030419.0358 220 | CNN_ENG_20030425_063006.5 221 | APW_ENG_20030422.0485 222 | BACONSREBELLION_20050204.1326 223 | MARKETVIEW_20050208.2033 224 | APW_ENG_20030417.0555 225 | fsh_29505 226 | CNN_ENG_20030411_193701.3 227 | CNN_ENG_20030527_215946.12 228 | CNN_CF_20030304.1900.06-2 229 | CNN_ENG_20030611_175950.5 230 | CNN_ENG_20030502_080020.7 231 | OIADVANTAGE_20050204.1155 232 | XIN_ENG_20030314.0208 233 | CNN_ENG_20030617_112838.4 234 | GETTINGPOLITICAL_20050105.0127.001 235 | APW_ENG_20030602.0037 236 | AGGRESSIVEVOICEDAILY_20050205.1954 237 | CNN_ENG_20030618_065839.11 238 | CNN_ENG_20030428_130651.4 239 | CNN_ENG_20030528_125956.8 240 | MARKETVIEW_20041217.0801 241 | CNN_ENG_20030403_090032.1 242 | CNN_ENG_20030617_193116.10 243 | APW_ENG_20030424.0532 244 | APW_ENG_20030610.0554 245 | APW_ENG_20030520.0081 246 | CNN_CF_20030304.1900.04 247 | CNN_IP_20030330.1600.06 248 | CNN_ENG_20030515_193533.6 249 | NYT_ENG_20030630.0079 250 | CNNHL_ENG_20030410_193626.13 251 | 
CNNHL_ENG_20030403_193455.30 252 | CNN_ENG_20030429_143706.14 253 | CNN_ENG_20030514_130518.5 254 | soc.history.what-if_20050129.1404 255 | CNN_IP_20030408.1600.04 256 | BACONSREBELLION_20050216.1618 257 | CNN_ENG_20030622_173306.9 258 | MARKETVIEW_20041213.0722 259 | CNN_IP_20030407.1600.05 260 | fsh_29361 261 | CNN_IP_20030329.1600.00-4 262 | rec.arts.mystery_20050219.1126 263 | CNN_ENG_20030424_173553.8 264 | CNNHL_ENG_20030611_133445.24 265 | APW_ENG_20030408.0090 266 | CNN_ENG_20030626_203133.11 267 | CNN_IP_20030412.1600.03 268 | BACONSREBELLION_20050123.1639 269 | MARKETVIEW_20050204.1322 270 | CNN_ENG_20030401_073033.14 271 | MARKETVIEW_20050120.1641 272 | FLOPPINGACES_20041230.1844.003 273 | CNN_ENG_20030423_180539.2 274 | CNN_ENG_20030528_195959.20 275 | AGGRESSIVEVOICEDAILY_20041101.1806 276 | fsh_29302 277 | CNNHL_ENG_20030610_230438.14 278 | CNNHL_ENG_20030519_124020.23 279 | CNNHL_ENG_20030616_230155.28 280 | alt.sys.pc-clone.dell_20050226.2350 281 | APW_ENG_20030610.0010 282 | alt.obituaries_20041121.1339 283 | fsh_29139 284 | fsh_29336 285 | CNN_CF_20030305.1900.02 286 | alt.gossip.celebrities_20050218.0826 287 | CNN_ENG_20030426_160621.0 288 | fsh_29326 289 | fsh_29770 290 | fsh_29774 291 | AGGRESSIVEVOICEDAILY_20050109.1627 292 | CNN_ENG_20030529_085826.10 293 | CNN_IP_20030402.1600.02-1 294 | soc.culture.iraq_20050211.0445 295 | CNN_IP_20030412.1600.05 296 | CNN_ENG_20030429_083016.5 297 | misc.taxes_20050218.1250 298 | fsh_29350 299 | AGGRESSIVEVOICEDAILY_20041203.1959 300 | MARKETVIEW_20050204.1736 301 | CNN_ENG_20030403_060032.0 302 | fsh_29520 303 | fsh_29195 304 | CNNHL_ENG_20030425_183518.12 305 | CNNHL_ENG_20030403_133453.21 306 | CNNHL_ENG_20030618_230303.36 307 | CNN_IP_20030403.1600.00-1 308 | CNN_ENG_20030525_143522.8 309 | MARKBACKER_20041220.0919 310 | CNN_ENG_20030610_133041.17 311 | fsh_29097 312 | XIN_ENG_20030327.0202 313 | CNNHL_ENG_20030624_230338.34 314 | CNN_IP_20030329.1600.02 315 | CNN_IP_20030408.1600.03 316 | 
Austin-Grad-Community_20050212.2454 317 | APW_ENG_20030406.0191 318 | soc.history.war.world-war-ii_20050127.2403 319 | CNN_IP_20030409.1600.04 320 | CNN_ENG_20030528_082823.9 321 | CNN_ENG_20030506_053020.14 322 | CNN_ENG_20030507_160538.15 323 | alt.collecting.autographs_20050224.2438 324 | CNN_IP_20030402.1600.00-2 325 | APW_ENG_20030418.0084 326 | CNNHL_ENG_20030513_220910.11 327 | CNN_IP_20030405.1600.01-3 328 | XIN_ENG_20030610.0299 329 | CNN_ENG_20030424_073006.4 330 | BACONSREBELLION_20050205.1919 331 | MARKBACKER_20041119.1002 332 | CNN_LE_20030504.1200.02-1 333 | CNN_ENG_20030409_180633.8 334 | misc.survivalism_20050210.0232 335 | MARKETVIEW_20050207.0746 336 | CNN_ENG_20030306_070606.18 337 | alt.politics_20050124.0640 338 | seattle.politics_20050122.2412 339 | BACONSREBELLION_20050217.0744 340 | AGGRESSIVEVOICEDAILY_20041218.1004 341 | MARKETVIEW_20050205.1358 342 | CNN_ENG_20030505_090022.1 343 | CNN_CF_20030304.1900.01 344 | BACONSREBELLION_20050209.0721 345 | CNN_ENG_20030610_130042.17 346 | aus.cars_20041206.0903 347 | CNN_IP_20030405.1600.00-3 348 | APW_ENG_20030513.0139 349 | MARKETVIEW_20050105.1901 350 | CNN_LE_20030504.1200.01 351 | fsh_29121 352 | MARKETVIEW_20050222.1919 353 | rec.music.makers.guitar.acoustic_20041228.1628 354 | CNN_IP_20030330.1600.05-2 355 | fsh_29187 356 | CNNHL_ENG_20030505_220734.25 357 | CNN_ENG_20030418_063040.1 358 | CNN_ENG_20030605_065831.18 359 | CNN_ENG_20030614_173123.4 360 | OIADVANTAGE_20050203.2102 361 | CNN_ENG_20030611_102832.4 362 | XIN_ENG_20030513.0002 363 | CNN_ENG_20030418_163834.14 364 | fsh_29521 365 | CNN_IP_20030329.1600.00-5 366 | CNNHL_ENG_20030402_193443.5 367 | CNN_ENG_20030624_153103.16 368 | fsh_29783 369 | CNN_ENG_20030602_105829.2 370 | CNN_ENG_20030408_123613.0 371 | APW_ENG_20030404.0439 372 | CNN_ENG_20030516_090022.7 373 | CNN_IP_20030422.1600.05 374 | AGGRESSIVEVOICEDAILY_20041101.1144 375 | BACONSREBELLION_20050218.1214 376 | CNN_ENG_20030509_123601.13 377 | AFP_ENG_20030323.0020 378 | 
CNN_IP_20030405.1600.00-2 379 | MARKETVIEW_20050126.0711 380 | MARKETVIEW_20041215.2128 381 | AGGRESSIVEVOICEDAILY_20050114.1922 382 | CNN_ENG_20030625_220123.3 383 | CNN_ENG_20030418_130831.5 384 | CNNHL_ENG_20030331_193419.9 385 | CNN_IP_20030402.1600.00-3 386 | CNN_CF_20030305.1900.00-2 387 | CNN_ENG_20030428_173654.13 388 | fsh_29581_1 389 | CNN_ENG_20030526_183538.3 390 | CNN_ENG_20030624_140104.22 391 | CNN_ENG_20030415_103039.0 392 | APW_ENG_20030407.0030 393 | CNN_IP_20030405.1600.02 394 | CNN_ENG_20030501_160459.0 395 | MARKETVIEW_20050127.0716 396 | CNN_ENG_20030411_233701.11 397 | XIN_ENG_20030423.0011 398 | fsh_29348 399 | MARKETVIEW_20041220.1537 400 | CNN_CF_20030305.1900.06-2 401 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_001: -------------------------------------------------------------------------------- 1 | APW_ENG_20101231.0037 2 | 11c906f2f798abb05f143b206edf77a5 3 | c8930568f1175e8bb0bff9b932a5c2d4 4 | 2ee2377e5d4ae6f5922ea2af11f9d4e1 5 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_002: -------------------------------------------------------------------------------- 1 | 84828469f40b28161c559e3d01526039 2 | 2d7d6761aad911a63a235a571fa7862f 3 | NYT_ENG_20130828.0147 4 | e37cfedb8a3a32769a12262eaef9ee0d 5 | 38cd9b530a5be18dbad52400da435934 6 | 1473ea2ded50c05b29b4f55f1b83ada3 7 | NYT_ENG_20130506.0130 8 | 3eb834d9a5d9c9fcad258087b5c2794a 9 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_003: -------------------------------------------------------------------------------- 1 | 97655df62dd4a176b65cf8a2c2a6e82d 2 | b49eee97fd373efbb4cb41926e60e385 3 | 3ae6760a860a33cb90af23596fac475c 4 | 11c906f2f798abb05f143b206edf77a5 5 | 376c304800b734b2a5a2c87b19eddc2a 6 | 087f58983ef5e94e54024bc9f0f009ae 7 | 
2ac3b55a10d5395ded9e8e54c345553b 8 | NYT_ENG_20131003.0269 9 | 5f3a6a4c39c15d7382c2cafe64ae898d 10 | 33c71a5cec78e7d766d75c9a73b327b8 11 | 542d2b2755c23b22e9747d8a3b020bf2 12 | NYT_ENG_20131220.0283 13 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_005: -------------------------------------------------------------------------------- 1 | aa54ac32868c5de9b05b65a8ee7a4329 2 | 08ebdc5f0ec8588af38ab1684318d99c 3 | 9777919d54ccbb7810bd1c73df91fa4a 4 | NYT_ENG_20131115.0084 5 | 39280a4d31d81837e17469e18a854116 6 | NYT_ENG_20130508.0098 7 | 3f0e2f2fb9b773bc178522a6535a9651 8 | 0f316bb245762eedec6682acbecf2822 9 | 824610c87232d345dcc130521f20f72a 10 | 459bc8b09f4dd2e1fec7c77d26193b01 11 | b9109877820d90dbc5efcdda02e6d450 12 | 290e2643c2f91c108b206c5edb7a1c0f 13 | 2a54459212636289034af844f8634e37 14 | 5fa0f2a7f323a781640b126978ca8a42 15 | 3c9fb643a48360935c1044efca570514 16 | NYT_ENG_20131025.0190 17 | cb824da90723fed309217c6e28b1c7cd 18 | 21dbe23f56aaef87fd0980234895b321 19 | c397ecd66789b905c6b1c5ef21af03ec 20 | cb156ad2a5458fabc9e093b6b5e0f97f 21 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_010: -------------------------------------------------------------------------------- 1 | 30eadb19db9f0db62cba7be66862920d 2 | 644706e2d97c9a9a1f9874510180f136 3 | 130a86739522ab7c56232e798d04cbf9 4 | ca2a6fbf721ca102c149ad6a90d5b00a 5 | d5825f99faec1ae48589b98560a98d61 6 | 0fe5904ced20c20537fe29c1db11cd28 7 | NYT_ENG_20130619.0092 8 | 17f98f0c6cda0227e732e6761f396d1f 9 | NYT_ENG_20130508.0098 10 | 4eb58398a5c2ef35b16d885c5573b3d4 11 | 0f947223d04c10118b523cfeec5d231e 12 | 477135a713d07aafe00d5e86648ea408 13 | 4743a10c1d5f1ad35c31646049acb9db 14 | 5d0b5755e212a88afbbb8b29c34c4f13 15 | NYT_ENG_20131210.0203 16 | 5bb3c2b1094912a6df7e862bb2981481 17 | 3322caacf140c92366a639ee004560ce 18 | 081fede2fca345dce82bf6b2355d4ae5 19 | 
087f58983ef5e94e54024bc9f0f009ae 20 | 0eb03fc279066b84ed49d44b2405469a 21 | 018fb4e59ac5474167ffc5940d7e55e7 22 | 63dca285201d1fcda72a54f4302b2c3e 23 | 4deb48e2b0ab194ce37c1bd31c73586a 24 | NYT_ENG_20130813.0006 25 | 8073c89ca4fdbe3b1eba0352bfe15d78 26 | 0a421343005f3241376fa01e1cb3c6fb 27 | 57026b7bcb8f855de3e26d572db35285 28 | 2ac3b55a10d5395ded9e8e54c345553b 29 | 24d93564f48ae17904aa82f937db8c21 30 | b9109877820d90dbc5efcdda02e6d450 31 | 59f8514f6db132207ba9e5828f73d706 32 | 08ebdc5f0ec8588af38ab1684318d99c 33 | 073020eb350fc73f123bfac8ec485ecc 34 | 31ea929baed3887e762b0b7f9196ce7e 35 | f18a7b77b1fd1065db9aeaf3f6143a5e 36 | 026bd1c7eae9f14da9480a4b88ba2fb6 37 | 95af1b55c359f28ff3a9159d55e9528a 38 | f9af64dc0cf1e7edd4a8feef75018b81 39 | 648abb9000309b9807cc8b212c11254f 40 | NYT_ENG_20130910.0002 41 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_020: -------------------------------------------------------------------------------- 1 | 2d8d3572658fdb8754fdc84d2b15f302 2 | 33bdb079026f1fcbe47c64b8c6968d0e 3 | 5d7b429073c60d53acba21bb6e7e6caa 4 | 2f5ee4e363c30678dc3b55caf43bc63d 5 | NYT_ENG_20131121.0250 6 | d528b874a0a6bd6011279a3239360aa2 7 | NYT_ENG_20130506.0130 8 | 57fb3f87bbb8c3205163ea256f658891 9 | 2a10c5cc27e7504dc9df92396b9e28b8 10 | aa003ea934a97bac86cee52b7122f1f8 11 | 1e9dfabe5e068a4142e768c0c5c37b6b 12 | 661ece467567ffbb54b551dfc1c2c254 13 | 57b2773ab54bbc5c119a46fd9be2c4f0 14 | 40f1f697a457e39c30ad94b7cc712c96 15 | 376c304800b734b2a5a2c87b19eddc2a 16 | 9b3bc3c727dfaa49218b57254087ff5d 17 | ca2a6fbf721ca102c149ad6a90d5b00a 18 | b608865c83b6612bf9ccb4e4c6e66ee7 19 | 4829d3d91263ed9d8801e6d94c3569a5 20 | 543e319fb067ef8cba81c74bb13c5711 21 | 48c498c9762046efbece8d183ed996ca 22 | NYT_ENG_20130828.0147 23 | d4698e3ad06f896058ade2e8f3a09577 24 | 3059538a2542c71687871b3444f8d921 25 | fd80f8b1a5694813bbda3253139c6395 26 | 590baa25bb1cc16c31fd02395edf6835 27 | NYT_ENG_20131128.0177 28 
| 43611a2f256d101f910b852379c70959 29 | a68c8d0ef75bbbd2923bf7aa78b72d3e 30 | 3c9fb643a48360935c1044efca570514 31 | c397ecd66789b905c6b1c5ef21af03ec 32 | 4435a7cb258d37b4fafc3ef0e833582e 33 | 5dd42026c76290af6689691fbe2b8d1c 34 | 5254f96ac3a601e99b6357c4f7627991 35 | da156c00417e2020948c009d39341607 36 | 334de29f692ef2c5460b78fcad5c6c9e 37 | 565fa81d640f451b20955887a43b3a23 38 | 07b79a8764693a80861e5a3e5fd47fa5 39 | NYT_ENG_20130716.0036 40 | 12bbeaf10a36d36d82824a72352ac178 41 | c793b6b583e008f105af586fe433d4ac 42 | f9af64dc0cf1e7edd4a8feef75018b81 43 | cd04993849c889a56ea66c6670f002f4 44 | NYT_ENG_20131220.0283 45 | 4fca88a5c29716cbb7c0f9aa9b84007a 46 | d3b5c32563ebb009bc1b1f5bc1b9eb14 47 | 39ff7dcae4034417ba175de97d14b165 48 | 2bbf45266e4ec0ae72977c89ac8d55c1 49 | 3b34a76a3589417f5db02883b47280a6 50 | 44fd27d40ae65547c3b584c2ff360cd7 51 | 30fa916e5173b52d449300e2ea71b787 52 | cf88887857b155d8822f82cad3597744 53 | 8492134197b5bf8e9179e2fa245ae02f 54 | 18e8a277f2659f79291efa0e12e80cb3 55 | 3e9bbf75058a3f16585889bb9c64a903 56 | 9f23d711bf5016fec9d05081772b4f24 57 | 17af00d74fca31bceab4ad463bf1c384 58 | 02905b7ce3a6b8b0961c6c2310392ef9 59 | NYT_ENG_20131121.0040 60 | 5bfd613fd31f0c2bdfb5c41f21629144 61 | 82f0af70bf68f4e78e6ea60a339f830d 62 | 3b9c27eda65c635e109a547930942486 63 | 3f987a93959acff3609a251b5abbecd7 64 | 9f6e4c46ae753bf14edff7e2ac767213 65 | f3e00fa1d34bca154aea0845c628f0e6 66 | 63dca285201d1fcda72a54f4302b2c3e 67 | c1f185252a2837aa464e36f263d1ebe9 68 | 59f8514f6db132207ba9e5828f73d706 69 | 5bb3c2b1094912a6df7e862bb2981481 70 | 648fc5834f73b4196b4ceb3daad954f9 71 | 10953ba63f691cb49f47f852b359a6e3 72 | 459bc8b09f4dd2e1fec7c77d26193b01 73 | 17f22c2b1e5642b41a9aeedb03261d1a 74 | 44b011cd504c9ed71beb851324db886a 75 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf 76 | 33ed1c9fdee1000e2340ac7f92c77752 77 | b6b443777e5ca92aa5152f5593960fd9 78 | 3dc7812b2b39ed067cc7c8ab1218e128 79 | a724033bff06e750d27cd7e3bf8263ac 80 | 3b9b81a3a446c24009c7642da54dbd28 81 | 
-------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_030: -------------------------------------------------------------------------------- 1 | 2701285c791f423cd2f8fd827df9c2c9 2 | 3e9bbf75058a3f16585889bb9c64a903 3 | 3f78c311ad97d4bbc6b4914deb4ab1ec 4 | 324274e50f2d07757e2d88ff58a0c33b 5 | 63878a2b6d34b576361d2a2778f321a6 6 | 2ca0238925d38f345acbf826854ea448 7 | 7c5b86ed55f4e5b8667423ef88f49fb5 8 | NYT_ENG_20130619.0092 9 | 648fc5834f73b4196b4ceb3daad954f9 10 | cb824da90723fed309217c6e28b1c7cd 11 | 4d996a22855cc2ec9f54990a23d51c56 12 | 563b1e8fcb1de7a4c0e01da9100d6e09 13 | 1b0f90c029f75d326ea39c0371901ef4 14 | 087f58983ef5e94e54024bc9f0f009ae 15 | 65814a1b2cccd0fd9be5ee3d5068038d 16 | 086e26ec92d1cc02f3900e9ac46d6962 17 | 389c70a4859f7528cc6e8b84c10766d7 18 | 856bc3bee118c826c394ed09548db9b2 19 | 31ea929baed3887e762b0b7f9196ce7e 20 | NYT_ENG_20130422.0048 21 | 408dff173c599256711f23238e280c15 22 | 4edd239ce7d1f7274154cd05081f8995 23 | acdf07c9477b21e1d29c51dc692e085b 24 | 86a94ca907de6688cca64610730fa11b 25 | NYT_ENG_20130703.0214 26 | 2ac34d012c8d909d4a29aa3f6be1f23d 27 | 4a3d067b19686b281e0beb437573a28c 28 | a83302f9002b6707fc7a91a7d7d29e6e 29 | f0aabfc899d1c17b8e99039bb4f80d64 30 | 4ae1669fc17f6b863ff35fa14a960270 31 | 5dfd5bfee062cd5896b619a2b1309766 32 | a68c8d0ef75bbbd2923bf7aa78b72d3e 33 | 2b96d1172d37f60aea5ce64a0b410248 34 | 3b9c27eda65c635e109a547930942486 35 | 459f9a2b3eddd436f0232395f129dfd0 36 | NYT_ENG_20130816.0151 37 | cfd86b06365dab636d13523c7ed93ad6 38 | 3d8f19221d257f81e3376b9e0731d4db 39 | 4743a10c1d5f1ad35c31646049acb9db 40 | cf88887857b155d8822f82cad3597744 41 | NYT_ENG_20130822.0136 42 | 10953ba63f691cb49f47f852b359a6e3 43 | ed6c37ed1996fc89f5fe813731c71b9d 44 | 584ccaef38f5936e973f0561966bbf06 45 | 3c9fb643a48360935c1044efca570514 46 | 29f64df7feb04dfb16f4667ce199c9f0 47 | 2ba8bbf004fe30c0a01f6fcd25f01dcc 48 | 5c29f9e575b94c61db8ed52bdfa53843 49 | 
d409fd37c208c5a7a5b2c64b4130b0ec 50 | 4683e6affe801713ed4cc9d596b57fac 51 | a9318b72c7a2ff32d459af958c7defe1 52 | 043b35fbf220a2d1bbe7d0612ad87635 53 | 502c46cc149d30f9ad0c25194636dcb6 54 | 27eb0b9d14d45ede66fe86534e36a2ce 55 | ffc5cc6892ff203f43b2dc8d83bcd725 56 | 2d7d6761aad911a63a235a571fa7862f 57 | e98123aa18eb4ce95d2d4eccace51169 58 | 459bc8b09f4dd2e1fec7c77d26193b01 59 | 3f115570c2fcc85263ba97e0134fb039 60 | 459b795a150e7866d6e4ef75e1b92b4a 61 | f3e00fa1d34bca154aea0845c628f0e6 62 | 47de592453663260c44944346d669611 63 | 84828469f40b28161c559e3d01526039 64 | 2a46fcf4ff6ce3896f249848e48b3b4c 65 | 6521f6bd1eb405232a5e852423722bac 66 | 3a9a0c07af53fce42e1a55c21826c54d 67 | 17f22c2b1e5642b41a9aeedb03261d1a 68 | da156c00417e2020948c009d39341607 69 | 334de29f692ef2c5460b78fcad5c6c9e 70 | 3878ab866ca434318076c4e7eac49c0d 71 | 542d2b2755c23b22e9747d8a3b020bf2 72 | f801d26c9b4d7577df089a196e242a04 73 | a48d00241e327e54ca914b950e97c7d4 74 | 1d2911e09a6746b942c3e7b3cbdcb0ce 75 | 644706e2d97c9a9a1f9874510180f136 76 | 6491f0650d9628b84dee6f539df5a53d 77 | ae656f6d658efca126f9721087608e95 78 | 373a3b4bb2a9e67a12c50ad54a1be657 79 | 290e2643c2f91c108b206c5edb7a1c0f 80 | 561a0178f4b846b9bbcf39f7e63afe4e 81 | d5825f99faec1ae48589b98560a98d61 82 | 5685a6069312d52a897fe69973269338 83 | APW_ENG_20101231.0037 84 | 2f5ee4e363c30678dc3b55caf43bc63d 85 | 57b2773ab54bbc5c119a46fd9be2c4f0 86 | 670b5425fcd1700e2c27af5f09244cb1 87 | deb3e0ea36b437c34b52d95aa6a9631f 88 | NYT_ENG_20131029.0091 89 | 2c8bcca93da4097da338a8754e4f03b0 90 | fab32c473df923a6a9242054c8d23bf3 91 | 23987125927d321ec6f0c30c8f453cb3 92 | 57026b7bcb8f855de3e26d572db35285 93 | 37b56b6dd846ad0dd6e8cd00ba2efaf4 94 | c8930568f1175e8bb0bff9b932a5c2d4 95 | 66fba4f92d2f9d8c3bee5dfad3af9828 96 | NYT_ENG_20131003.0269 97 | 36d45aff571e3fbe036f309c18d31668 98 | 736fa00bfb16f3298883be5e962fe01b 99 | 0fab386f8b6527439481f526c92341c7 100 | 37d781089c669131c5118415cf470422 101 | 0fe5904ced20c20537fe29c1db11cd28 102 | 
NYT_ENG_20131029.0042 103 | 0f565d3822dca80336582ffac4adaf78 104 | 774caed283a1e55ef9490864771029c3 105 | 2a54459212636289034af844f8634e37 106 | a724033bff06e750d27cd7e3bf8263ac 107 | 593cb5020613a4695859130542f7fc94 108 | 0ba982819aaf9f5b94a7cebd48ac6018 109 | 04134f2be20afbb868d7a8292f49e277 110 | 3ac3c99241c2243a9e233b091eddfe15 111 | cade0d91e2e82e4db58efe64d7462c33 112 | 15ba31cca04cc5300361f46319247c40 113 | af18d29036ab0a9f8cf2742a5a1b4804 114 | 026bd1c7eae9f14da9480a4b88ba2fb6 115 | 9fc05e3fab69893da830adfa6513510d 116 | 59a5d2e146c13f7519130193fc773610 117 | 44b011cd504c9ed71beb851324db886a 118 | 04debcc4da342dc971bdef4210fe468a 119 | 0eb03fc279066b84ed49d44b2405469a 120 | c793b6b583e008f105af586fe433d4ac 121 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_050: -------------------------------------------------------------------------------- 1 | 0cde024ee993679967f7ac397000ad52 2 | fd80f8b1a5694813bbda3253139c6395 3 | 30eadb19db9f0db62cba7be66862920d 4 | 2d7d6761aad911a63a235a571fa7862f 5 | 97655df62dd4a176b65cf8a2c2a6e82d 6 | 2251a78817e67a2adaf0722fd05c7ac0 7 | NYT_ENG_20130508.0098 8 | 324274e50f2d07757e2d88ff58a0c33b 9 | 542d2b2755c23b22e9747d8a3b020bf2 10 | 1557734399e8da2b84a2dd9ddb4eba49 11 | c06e8bbdf69f73a69cd3d5dbb4d06a21 12 | 33ed1c9fdee1000e2340ac7f92c77752 13 | 79a3cc37998a99808583eba765aedca1 14 | 2a54459212636289034af844f8634e37 15 | f3e00fa1d34bca154aea0845c628f0e6 16 | 3ae6760a860a33cb90af23596fac475c 17 | 6667fb9e43ac7edde844453cba97baf0 18 | ffc5cc6892ff203f43b2dc8d83bcd725 19 | 018fb4e59ac5474167ffc5940d7e55e7 20 | aa54ac32868c5de9b05b65a8ee7a4329 21 | 590baa25bb1cc16c31fd02395edf6835 22 | a42f7cf822523c76c225602537aefc7a 23 | 4fbb1eec7dfd5c2fefb94a2d873ddfa5 24 | a724033bff06e750d27cd7e3bf8263ac 25 | 5f3a6a4c39c15d7382c2cafe64ae898d 26 | 0c49bb860962aa0d5b8e3fc277592da0 27 | 52355a4167e6ac3a80d19c94ad6259a7 28 | aa32f4f9534045b9f33a9599d0c1b580 29 | 
584ccaef38f5936e973f0561966bbf06 30 | 130a86739522ab7c56232e798d04cbf9 31 | NYT_ENG_20130813.0006 32 | 361e1c2ca3a1e21c618e0e8fab959e30 33 | 3a0d64b5cb2bc7319e803e344dc695b5 34 | a48d00241e327e54ca914b950e97c7d4 35 | 61d6f81f680f83a1a3281fde24d9c3ac 36 | 3446f8cbcf53eaca5692913ced012b11 37 | da156c00417e2020948c009d39341607 38 | 652f1fbc927a6c358447947d0d77f95f 39 | d3b5c32563ebb009bc1b1f5bc1b9eb14 40 | 565fa81d640f451b20955887a43b3a23 41 | 362f9d9707c4da0c8068bc7034aae4b4 42 | 1a0f101744b34677ce1e1da1b1b91beb 43 | 07c9c8ca974b6e9333c38720b0b06896 44 | 0f565d3822dca80336582ffac4adaf78 45 | bf1047c7c17ae3daab59c3bee423e12f 46 | 1473ea2ded50c05b29b4f55f1b83ada3 47 | b9109877820d90dbc5efcdda02e6d450 48 | 15ba31cca04cc5300361f46319247c40 49 | 17f22c2b1e5642b41a9aeedb03261d1a 50 | 3878ab866ca434318076c4e7eac49c0d 51 | NYT_ENG_20131115.0084 52 | 2c2e8b3286bd34e30a4cb57cb7e26ce5 53 | 70b2f9277a1c78bd13cef68ba6485bd9 54 | f703536e3212f51cbf26ce47aa7b5eff 55 | 5b7cab1d1cfc0c05686399d8bcbcfe5b 56 | 3e9bbf75058a3f16585889bb9c64a903 57 | 186ef6837e001cd9b97a132c86705545 58 | 2f5ee4e363c30678dc3b55caf43bc63d 59 | 1f60eb9697e240af089b134b69c2042d 60 | 7c5b86ed55f4e5b8667423ef88f49fb5 61 | NYT_ENG_20130816.0151 62 | 2bbf45266e4ec0ae72977c89ac8d55c1 63 | NYT_ENG_20131118.0019 64 | fa371b1fbb4d20143e638a7dac6e4f6b 65 | 08ebdc5f0ec8588af38ab1684318d99c 66 | c397ecd66789b905c6b1c5ef21af03ec 67 | 04952b874a2a34d602faaa74712d435e 68 | 9c500ea2248358171d77d419e67f5760 69 | 3f987a93959acff3609a251b5abbecd7 70 | NYT_ENG_20131025.0190 71 | 35587c6d8aa67724ba23231dd16f7b44 72 | 7e520221ddb1602a0f2aa10560a50a66 73 | ae9a0d394c5e3d3d812c7ffc07c2f836 74 | AFP_ENG_20100414.0615 75 | 774caed283a1e55ef9490864771029c3 76 | 026bd1c7eae9f14da9480a4b88ba2fb6 77 | 5753617c893938f625b349cf6bd2b388 78 | 01f69c4c2206e7c3fa3706ccd5b8b350 79 | 57fb3f87bbb8c3205163ea256f658891 80 | af79ea77b8fb92424dbc02d88d8c14e8 81 | 04debcc4da342dc971bdef4210fe468a 82 | 7c0e0e53980aeb2868cbe4e1c1cb79db 83 | 
661ece467567ffbb54b551dfc1c2c254 84 | 3b9c27eda65c635e109a547930942486 85 | 0a421343005f3241376fa01e1cb3c6fb 86 | 4edd239ce7d1f7274154cd05081f8995 87 | 38cd9b530a5be18dbad52400da435934 88 | 2ba8bbf004fe30c0a01f6fcd25f01dcc 89 | 4764f1400fa336d1fb972719b10b939a 90 | 5c29f9e575b94c61db8ed52bdfa53843 91 | 5fa0f2a7f323a781640b126978ca8a42 92 | 0fab386f8b6527439481f526c92341c7 93 | 502c46cc149d30f9ad0c25194636dcb6 94 | 48c498c9762046efbece8d183ed996ca 95 | 59a5d2e146c13f7519130193fc773610 96 | 5d4273298e649a13c4dce27c89f414ac 97 | NYT_ENG_20130525.0040 98 | 5c59566e9132c060423cad5b2d1bac1e 99 | 65814a1b2cccd0fd9be5ee3d5068038d 100 | 4d7e1af80bc46167ef3d81cf642bf94b 101 | 4deb48e2b0ab194ce37c1bd31c73586a 102 | 0c100ebc18cc55f80cdae6343f72db69 103 | 087f58983ef5e94e54024bc9f0f009ae 104 | 0cfdfe102b7a4cb34e1a181c1d36d23d 105 | a83302f9002b6707fc7a91a7d7d29e6e 106 | 43611a2f256d101f910b852379c70959 107 | 5d7b429073c60d53acba21bb6e7e6caa 108 | 21dbe23f56aaef87fd0980234895b321 109 | 178e7de35eccad0df800f0c7539cf614 110 | 370e7ee173951eeff13998a416b8b3d0 111 | 4572d22caf3e1924f894002b724f958b 112 | ecb7c8154bf58b48ae00b252ff283c29 113 | 561a0178f4b846b9bbcf39f7e63afe4e 114 | 83d7cb6d5b663f34dcf83879a8729fb4 115 | 33bdb079026f1fcbe47c64b8c6968d0e 116 | e972c0257d72aefc52cfdf7e7f5a1623 117 | 07b79a8764693a80861e5a3e5fd47fa5 118 | NYT_ENG_20131029.0042 119 | edc4216d65afa47fe7bc6004ac172e92 120 | 3138f7fb2f8575ed762eb0bc11023d59 121 | 6837dcaff76ad3235d46708dd89e7306 122 | 57b2773ab54bbc5c119a46fd9be2c4f0 123 | 9fc05e3fab69893da830adfa6513510d 124 | 52e569e00b6428b94205d3dd5c457c54 125 | 736fa00bfb16f3298883be5e962fe01b 126 | 43341a312ffd84a4ad3c3ab0df8bcd7c 127 | 648abb9000309b9807cc8b212c11254f 128 | 2701285c791f423cd2f8fd827df9c2c9 129 | 5254f96ac3a601e99b6357c4f7627991 130 | 861cdd1a5c6c41610021b25c3795e293 131 | XIN_ENG_20101125.0137 132 | 086e26ec92d1cc02f3900e9ac46d6962 133 | f801d26c9b4d7577df089a196e242a04 134 | NYT_ENG_20130731.0133 135 | 
abbdf0048737e9e639403f8fe8cd7dd2 136 | 9f6e4c46ae753bf14edff7e2ac767213 137 | 36b12cef6f7a805e3e74a4f430129028 138 | cade0d91e2e82e4db58efe64d7462c33 139 | 3059538a2542c71687871b3444f8d921 140 | 11c906f2f798abb05f143b206edf77a5 141 | NYT_ENG_20130828.0147 142 | a05c08e340a73270592f62361a19274d 143 | 47de592453663260c44944346d669611 144 | 33c71a5cec78e7d766d75c9a73b327b8 145 | f0612c786635ed96ee3df84821a17685 146 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf 147 | 40f1f697a457e39c30ad94b7cc712c96 148 | b608865c83b6612bf9ccb4e4c6e66ee7 149 | 609d5112c0386dc4e5f2e90b93cb7a5f 150 | 95af1b55c359f28ff3a9159d55e9528a 151 | 8f575db98ccc3af0a904b650898368dd 152 | 3b9b81a3a446c24009c7642da54dbd28 153 | 2ee2377e5d4ae6f5922ea2af11f9d4e1 154 | 3b34a76a3589417f5db02883b47280a6 155 | 4743a10c1d5f1ad35c31646049acb9db 156 | NYT_ENG_20131225.0200 157 | 648fc5834f73b4196b4ceb3daad954f9 158 | 99ab1cad51361e94c2fe3f997c45705a 159 | 39280a4d31d81837e17469e18a854116 160 | 44fd27d40ae65547c3b584c2ff360cd7 161 | 18a89cdd00dadc593a88c924111575f1 162 | f0aabfc899d1c17b8e99039bb4f80d64 163 | 120fe19a9bc68fd85fc4963c166e9345 164 | NYT_ENG_20131220.0283 165 | 51d64c51a2363954454ee9e921b590ce 166 | 5e3fbf49f8301654bb4954c0f1e386a9 167 | 584b6272bb8c9cc134621ff5ace8c98d 168 | AFP_ENG_20100601.0724 169 | 52a77871923a7f86bb1a52812bc7f2e1 170 | f9af64dc0cf1e7edd4a8feef75018b81 171 | 1980ed7ea6a283f8dd19da5a4e9952d6 172 | 4c2488e10c34e5412d3b67e794c9bc84 173 | 17f98f0c6cda0227e732e6761f396d1f 174 | 15c96bac6c08ef94fe249fde914b53d7 175 | 02905b7ce3a6b8b0961c6c2310392ef9 176 | 23987125927d321ec6f0c30c8f453cb3 177 | 0fe5904ced20c20537fe29c1db11cd28 178 | 26175bdbe49b712d7412c273c111e813 179 | 3d8f19221d257f81e3376b9e0731d4db 180 | 56c895a1c8dead5698a49321a674f3f4 181 | 5cd7d603e1cf8d2c134d039dc90112f0 182 | 3065902101e4282b89ed4ac8f64d4a84 183 | 9f23d711bf5016fec9d05081772b4f24 184 | d409fd37c208c5a7a5b2c64b4130b0ec 185 | 7734fb9363c2adf91c6ede6c7bb7df90 186 | cf88887857b155d8822f82cad3597744 187 | 
2c8bcca93da4097da338a8754e4f03b0 188 | 2ca0238925d38f345acbf826854ea448 189 | 0536891daea71ab51ee1123137b67146 190 | 78333509dffd4a7df90b029a5d851dfe 191 | 1b268b27094ba9c5feb11192dad940ab 192 | NYT_ENG_20130603.0111 193 | 5c7ea2b51202d80ee37eba8a182afad3 194 | 0e0abbf0da91d9e34750441c08d5d262 195 | 17af00d74fca31bceab4ad463bf1c384 196 | 63dca285201d1fcda72a54f4302b2c3e 197 | 644706e2d97c9a9a1f9874510180f136 198 | 4175e3da216dcc8710a26359e4ecaaad 199 | 1b0f90c029f75d326ea39c0371901ef4 200 | 5bb3c2b1094912a6df7e862bb2981481 201 | -------------------------------------------------------------------------------- /resource/low_resource_split/ere/doc_list_075: -------------------------------------------------------------------------------- 1 | 5c59566e9132c060423cad5b2d1bac1e 2 | 362f9d9707c4da0c8068bc7034aae4b4 3 | b608865c83b6612bf9ccb4e4c6e66ee7 4 | 324274e50f2d07757e2d88ff58a0c33b 5 | d0b9b1747f4a6247294cde9ac0165c60 6 | 4f7eedf44076ea050d7db3715f9333fa 7 | NYT_ENG_20130914.0094 8 | NYT_ENG_20131121.0040 9 | 4bab621aef9d14b5d20ac23cb8142112 10 | 590baa25bb1cc16c31fd02395edf6835 11 | 25f868780ac18430a6f10ab4de22ffb8 12 | b6b443777e5ca92aa5152f5593960fd9 13 | 2701285c791f423cd2f8fd827df9c2c9 14 | 186ef6837e001cd9b97a132c86705545 15 | ecb7c8154bf58b48ae00b252ff283c29 16 | 24d93564f48ae17904aa82f937db8c21 17 | 3059538a2542c71687871b3444f8d921 18 | 4c2488e10c34e5412d3b67e794c9bc84 19 | af18d29036ab0a9f8cf2742a5a1b4804 20 | 342431e61e80263f606c46bb5e399cc7 21 | 1f60eb9697e240af089b134b69c2042d 22 | 47de592453663260c44944346d669611 23 | 04debcc4da342dc971bdef4210fe468a 24 | NYT_ENG_20130910.0002 25 | 52a77871923a7f86bb1a52812bc7f2e1 26 | 07c9c8ca974b6e9333c38720b0b06896 27 | 36b12cef6f7a805e3e74a4f430129028 28 | 5bb3c2b1094912a6df7e862bb2981481 29 | 5e3fbf49f8301654bb4954c0f1e386a9 30 | cb824da90723fed309217c6e28b1c7cd 31 | 21dbe23f56aaef87fd0980234895b321 32 | 11329f1cdb44019afc8f48b6fdc5376d 33 | cca700aed62fd497e64e507752409b41 34 | 3dc7812b2b39ed067cc7c8ab1218e128 35 | 
NYT_ENG_20130508.0098 36 | NYT_ENG_20131025.0190 37 | edc4216d65afa47fe7bc6004ac172e92 38 | 3878ab866ca434318076c4e7eac49c0d 39 | 08ebdc5f0ec8588af38ab1684318d99c 40 | 459f9a2b3eddd436f0232395f129dfd0 41 | 204f8f6bdb24c5198175bf1ed483247b 42 | 0c100ebc18cc55f80cdae6343f72db69 43 | 1bf9912633f942d6d1d4e87df33cee40 44 | 648abb9000309b9807cc8b212c11254f 45 | af79ea77b8fb92424dbc02d88d8c14e8 46 | 0e0abbf0da91d9e34750441c08d5d262 47 | 2d8d3572658fdb8754fdc84d2b15f302 48 | 4d996a22855cc2ec9f54990a23d51c56 49 | 7c5b86ed55f4e5b8667423ef88f49fb5 50 | 18e8a277f2659f79291efa0e12e80cb3 51 | 79a3cc37998a99808583eba765aedca1 52 | 0fab386f8b6527439481f526c92341c7 53 | 376c304800b734b2a5a2c87b19eddc2a 54 | 073020eb350fc73f123bfac8ec485ecc 55 | 648fc5834f73b4196b4ceb3daad954f9 56 | APW_ENG_20101231.0037 57 | f703536e3212f51cbf26ce47aa7b5eff 58 | NYT_ENG_20130816.0151 59 | 5cd7d603e1cf8d2c134d039dc90112f0 60 | 120fe19a9bc68fd85fc4963c166e9345 61 | 2c2e8b3286bd34e30a4cb57cb7e26ce5 62 | bdca67a0bacec61b5e691d5ca51ba724 63 | NYT_ENG_20130525.0040 64 | 4798bc0e166fe93893bdf2d922f06258 65 | 1badbb95e5e70ef90e49cdf5a46b6d9b 66 | 04952b874a2a34d602faaa74712d435e 67 | 824610c87232d345dcc130521f20f72a 68 | 459bc8b09f4dd2e1fec7c77d26193b01 69 | c1f185252a2837aa464e36f263d1ebe9 70 | a724033bff06e750d27cd7e3bf8263ac 71 | 30fa916e5173b52d449300e2ea71b787 72 | 3ac3c99241c2243a9e233b091eddfe15 73 | 99ab1cad51361e94c2fe3f997c45705a 74 | 37b56b6dd846ad0dd6e8cd00ba2efaf4 75 | f18a7b77b1fd1065db9aeaf3f6143a5e 76 | 178e7de35eccad0df800f0c7539cf614 77 | NYT_ENG_20131029.0042 78 | c793b6b583e008f105af586fe433d4ac 79 | 1a0f101744b34677ce1e1da1b1b91beb 80 | 2d7d6761aad911a63a235a571fa7862f 81 | NYT_ENG_20130422.0048 82 | 30cced37fcceb1800341d18d4f97b670 83 | 565fa81d640f451b20955887a43b3a23 84 | 12bbeaf10a36d36d82824a72352ac178 85 | 31ea929baed3887e762b0b7f9196ce7e 86 | 8073c89ca4fdbe3b1eba0352bfe15d78 87 | 07b79a8764693a80861e5a3e5fd47fa5 88 | 70b2f9277a1c78bd13cef68ba6485bd9 89 | 
abbdf0048737e9e639403f8fe8cd7dd2 90 | 026bd1c7eae9f14da9480a4b88ba2fb6 91 | 370e7ee173951eeff13998a416b8b3d0 92 | 3b34a76a3589417f5db02883b47280a6 93 | 3f987a93959acff3609a251b5abbecd7 94 | 08b0dfe15192c063055ed7db8d24c625 95 | 368df106b2eaa0b4091e099f360a07d6 96 | 84828469f40b28161c559e3d01526039 97 | 4435a7cb258d37b4fafc3ef0e833582e 98 | 9c500ea2248358171d77d419e67f5760 99 | 26542fb5b83cdb4b98a3fe31e0226b39 100 | NYT_ENG_20130504.0098 101 | c8930568f1175e8bb0bff9b932a5c2d4 102 | 0eb03fc279066b84ed49d44b2405469a 103 | 2ee2377e5d4ae6f5922ea2af11f9d4e1 104 | 3b9c27eda65c635e109a547930942486 105 | 36d45aff571e3fbe036f309c18d31668 106 | 67db76e5116c4c809107948d4b0a5ecc 107 | NYT_ENG_20131029.0091 108 | 4042cd8643253f65df3a4e8de320a1c9 109 | 43341a312ffd84a4ad3c3ab0df8bcd7c 110 | 408dff173c599256711f23238e280c15 111 | 4fca88a5c29716cbb7c0f9aa9b84007a 112 | 1980ed7ea6a283f8dd19da5a4e9952d6 113 | d409fd37c208c5a7a5b2c64b4130b0ec 114 | 15c96bac6c08ef94fe249fde914b53d7 115 | NYT_ENG_20131210.0203 116 | 584b6272bb8c9cc134621ff5ace8c98d 117 | 01f69c4c2206e7c3fa3706ccd5b8b350 118 | 44b011cd504c9ed71beb851324db886a 119 | 593cb5020613a4695859130542f7fc94 120 | f0612c786635ed96ee3df84821a17685 121 | c06e8bbdf69f73a69cd3d5dbb4d06a21 122 | 2ac34d012c8d909d4a29aa3f6be1f23d 123 | a42f7cf822523c76c225602537aefc7a 124 | 1b268b27094ba9c5feb11192dad940ab 125 | 2bebb50073ceefd0c9ccfdf3e07b3258 126 | a48d00241e327e54ca914b950e97c7d4 127 | 4743a10c1d5f1ad35c31646049acb9db 128 | 78333509dffd4a7df90b029a5d851dfe 129 | 57fb3f87bbb8c3205163ea256f658891 130 | a13d4f9511d799fc25b73e4d5cf28d13 131 | 4edd239ce7d1f7274154cd05081f8995 132 | f9af64dc0cf1e7edd4a8feef75018b81 133 | 30eadb19db9f0db62cba7be66862920d 134 | 33c71a5cec78e7d766d75c9a73b327b8 135 | 2a54459212636289034af844f8634e37 136 | 130a86739522ab7c56232e798d04cbf9 137 | 91147deeeec220cc445a8d546585cdb7 138 | 542d2b2755c23b22e9747d8a3b020bf2 139 | 97655df62dd4a176b65cf8a2c2a6e82d 140 | 6837dcaff76ad3235d46708dd89e7306 141 | 
38cd9b530a5be18dbad52400da435934 142 | 2ca0238925d38f345acbf826854ea448 143 | b49eee97fd373efbb4cb41926e60e385 144 | 584ccaef38f5936e973f0561966bbf06 145 | 736fa00bfb16f3298883be5e962fe01b 146 | e972c0257d72aefc52cfdf7e7f5a1623 147 | 1473ea2ded50c05b29b4f55f1b83ada3 148 | 5dd42026c76290af6689691fbe2b8d1c 149 | 6491f0650d9628b84dee6f539df5a53d 150 | 0cde024ee993679967f7ac397000ad52 151 | 3a9a0c07af53fce42e1a55c21826c54d 152 | 17f22c2b1e5642b41a9aeedb03261d1a 153 | 459b795a150e7866d6e4ef75e1b92b4a 154 | 61d6f81f680f83a1a3281fde24d9c3ac 155 | 10953ba63f691cb49f47f852b359a6e3 156 | a05c08e340a73270592f62361a19274d 157 | 255bae1c133d1d77ef727c063e435a78 158 | 2c8bcca93da4097da338a8754e4f03b0 159 | 11c906f2f798abb05f143b206edf77a5 160 | 9777919d54ccbb7810bd1c73df91fa4a 161 | 018fb4e59ac5474167ffc5940d7e55e7 162 | 6667fb9e43ac7edde844453cba97baf0 163 | 1b0f90c029f75d326ea39c0371901ef4 164 | 48c498c9762046efbece8d183ed996ca 165 | 22696c601df1a7359e9b629c689700ad 166 | 18a89cdd00dadc593a88c924111575f1 167 | 9b3bc3c727dfaa49218b57254087ff5d 168 | 3d8f19221d257f81e3376b9e0731d4db 169 | 5c7ea2b51202d80ee37eba8a182afad3 170 | e37cfedb8a3a32769a12262eaef9ee0d 171 | 561a0178f4b846b9bbcf39f7e63afe4e 172 | 2cf358ab89c732d6b35b65e619d2bc86 173 | 44a65adb7f74e6c99d05eb2721fd0baf 174 | 2d2a4ddb1c8f4a669541704f9fb78472 175 | 3eb834d9a5d9c9fcad258087b5c2794a 176 | 3ae6760a860a33cb90af23596fac475c 177 | 0536891daea71ab51ee1123137b67146 178 | 652f1fbc927a6c358447947d0d77f95f 179 | 081fede2fca345dce82bf6b2355d4ae5 180 | 2a10c5cc27e7504dc9df92396b9e28b8 181 | NYT_ENG_20130813.0006 182 | 2ac3b55a10d5395ded9e8e54c345553b 183 | 0c49bb860962aa0d5b8e3fc277592da0 184 | af36543ebce546c7c678fbf9767bfdbb 185 | 15ba31cca04cc5300361f46319247c40 186 | 3c9fb643a48360935c1044efca570514 187 | 361e1c2ca3a1e21c618e0e8fab959e30 188 | 5d0b5755e212a88afbbb8b29c34c4f13 189 | 1d16a571f14fb1032bc19e9314a46deb 190 | d528b874a0a6bd6011279a3239360aa2 191 | 3e6c7121211de578d7fd831eae801438 192 | 
3322caacf140c92366a639ee004560ce 193 | 9f6e4c46ae753bf14edff7e2ac767213 194 | 4a3d067b19686b281e0beb437573a28c 195 | 670b5425fcd1700e2c27af5f09244cb1 196 | 3e9bbf75058a3f16585889bb9c64a903 197 | NYT_ENG_20130731.0133 198 | 47c26ba3563092e41c5a42252931baf1 199 | 26175bdbe49b712d7412c273c111e813 200 | 40f1f697a457e39c30ad94b7cc712c96 201 | aa003ea934a97bac86cee52b7122f1f8 202 | NYT_ENG_20131003.0269 203 | 33bdb079026f1fcbe47c64b8c6968d0e 204 | 4175e3da216dcc8710a26359e4ecaaad 205 | ae9a0d394c5e3d3d812c7ffc07c2f836 206 | NYT_ENG_20131115.0084 207 | 9f23d711bf5016fec9d05081772b4f24 208 | 477135a713d07aafe00d5e86648ea408 209 | 04134f2be20afbb868d7a8292f49e277 210 | 7734fb9363c2adf91c6ede6c7bb7df90 211 | e98123aa18eb4ce95d2d4eccace51169 212 | 44fd27d40ae65547c3b584c2ff360cd7 213 | NYT_ENG_20130703.0214 214 | 2f5ee4e363c30678dc3b55caf43bc63d 215 | 909239794c799f2d2e79c023ae090c35 216 | 3a0d64b5cb2bc7319e803e344dc695b5 217 | deb3e0ea36b437c34b52d95aa6a9631f 218 | 774caed283a1e55ef9490864771029c3 219 | 334de29f692ef2c5460b78fcad5c6c9e 220 | 416cfc6a5717682cd35d381c5be07734 221 | 44087d95184e9d94f3948f47e9b602af 222 | fd80f8b1a5694813bbda3253139c6395 223 | 51d64c51a2363954454ee9e921b590ce 224 | 4b2d9d5984b731dbdd3db398b5fb5e46 225 | NYT_ENG_20130603.0111 226 | 56c895a1c8dead5698a49321a674f3f4 227 | 1e9dfabe5e068a4142e768c0c5c37b6b 228 | ed6c37ed1996fc89f5fe813731c71b9d 229 | da156c00417e2020948c009d39341607 230 | d7369ce92ed0b6327412c705dbbab654 231 | 5fa0f2a7f323a781640b126978ca8a42 232 | 105249d0d0575a1a5939b16139f6229d 233 | 39280a4d31d81837e17469e18a854116 234 | 2bbf45266e4ec0ae72977c89ac8d55c1 235 | NYT_ENG_20130828.0147 236 | 563b1e8fcb1de7a4c0e01da9100d6e09 237 | NYT_ENG_20130613.0153 238 | AFP_ENG_20100414.0615 239 | 4ae1669fc17f6b863ff35fa14a960270 240 | 95af1b55c359f28ff3a9159d55e9528a 241 | 52e569e00b6428b94205d3dd5c457c54 242 | 5bfd613fd31f0c2bdfb5c41f21629144 243 | NYT_ENG_20131121.0250 244 | 59f8514f6db132207ba9e5828f73d706 245 | 
0f316bb245762eedec6682acbecf2822 246 | NYT_ENG_20131225.0200 247 | 5fa7fbe87758a02a1e4591f88175ccf3 248 | 644706e2d97c9a9a1f9874510180f136 249 | f913574a9c0637dbcf66def4a2c1dc84 250 | 4fbb1eec7dfd5c2fefb94a2d873ddfa5 251 | d6bc66d7c8423368aaa8d789b5bdf5db 252 | 63dca285201d1fcda72a54f4302b2c3e 253 | 3065902101e4282b89ed4ac8f64d4a84 254 | 043b35fbf220a2d1bbe7d0612ad87635 255 | 4d7e1af80bc46167ef3d81cf642bf94b 256 | 543e319fb067ef8cba81c74bb13c5711 257 | 026e0a2c96e90bd8bf9aecde62d7530d 258 | ca2a6fbf721ca102c149ad6a90d5b00a 259 | a9318b72c7a2ff32d459af958c7defe1 260 | ae656f6d658efca126f9721087608e95 261 | bf1047c7c17ae3daab59c3bee423e12f 262 | 57026b7bcb8f855de3e26d572db35285 263 | aa32f4f9534045b9f33a9599d0c1b580 264 | 63878a2b6d34b576361d2a2778f321a6 265 | a68c8d0ef75bbbd2923bf7aa78b72d3e 266 | 5753617c893938f625b349cf6bd2b388 267 | 5c29f9e575b94c61db8ed52bdfa53843 268 | 5dfd5bfee062cd5896b619a2b1309766 269 | 5685a6069312d52a897fe69973269338 270 | fd103b2c981e724f64d70a22c392ee93 271 | 2a46fcf4ff6ce3896f249848e48b3b4c 272 | cade0d91e2e82e4db58efe64d7462c33 273 | 7c0e0e53980aeb2868cbe4e1c1cb79db 274 | 0fbcb8f76124b9654076889ce04a045b 275 | acdf07c9477b21e1d29c51dc692e085b 276 | 1d2911e09a6746b942c3e7b3cbdcb0ce 277 | 1d6c0e3df079663f6bceca0b44c98a40 278 | 9fc05e3fab69893da830adfa6513510d 279 | 087f58983ef5e94e54024bc9f0f009ae 280 | 0f947223d04c10118b523cfeec5d231e 281 | 3f78c311ad97d4bbc6b4914deb4ab1ec 282 | 609d5112c0386dc4e5f2e90b93cb7a5f 283 | 0fe5904ced20c20537fe29c1db11cd28 284 | 86a94ca907de6688cca64610730fa11b 285 | 82f0af70bf68f4e78e6ea60a339f830d 286 | 3f0e2f2fb9b773bc178522a6535a9651 287 | d5825f99faec1ae48589b98560a98d61 288 | AFP_ENG_20100601.0724 289 | dbed9b6ed7d2eaf75fef0aa5a245a663 290 | 0929d82f7059353f9593b9558983efba 291 | 4deb48e2b0ab194ce37c1bd31c73586a 292 | 1a79f9d5c3f784a494196a9bbb586f3b 293 | aa54ac32868c5de9b05b65a8ee7a4329 294 | 086e26ec92d1cc02f3900e9ac46d6962 295 | ea4d6baa1d6174c45fce1e6bbb58e1b4 296 | NYT_ENG_20130501.0255 297 | 
NYT_ENG_20130428.0140 298 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf 299 | 4df3dfff1ee1683ac6e1c2ea24ce2589 300 | 66fba4f92d2f9d8c3bee5dfad3af9828 301 | -------------------------------------------------------------------------------- /resource/splits/ACE05-EP/dev.doc.txt: -------------------------------------------------------------------------------- 1 | AFP_ENG_20030305.0918 2 | AFP_ENG_20030311.0491 3 | AFP_ENG_20030314.0238 4 | AFP_ENG_20030319.0879 5 | AFP_ENG_20030320.0722 6 | AFP_ENG_20030327.0022 7 | AFP_ENG_20030327.0224 8 | AGGRESSIVEVOICEDAILY_20041116.1347 9 | CNNHL_ENG_20030304_142751.10 10 | CNNHL_ENG_20030424_123502.25 11 | CNNHL_ENG_20030513_220910.32 12 | CNN_CF_20030303.1900.02 13 | CNN_ENG_20030304_173120.16 14 | CNN_ENG_20030328_150609.10 15 | CNN_ENG_20030424_070008.15 16 | CNN_ENG_20030512_170454.13 17 | CNN_ENG_20030620_085840.7 18 | CNN_IP_20030329.1600.00-2 19 | CNN_IP_20030402.1600.00-1 20 | CNN_IP_20030405.1600.01-1 21 | CNN_IP_20030409.1600.02 22 | FLOPPINGACES_20041117.2002.024 23 | FLOPPINGACES_20050203.1953.038 24 | FLOPPINGACES_20050217.1237.014 25 | TTRACY_20050223.1049 26 | marcellapr_20050228.2219 27 | rec.games.chess.politics_20041217.2111 28 | soc.org.nonprofit_20050218.1902 29 | -------------------------------------------------------------------------------- /resource/splits/ACE05-EP/test.doc.txt: -------------------------------------------------------------------------------- 1 | AFP_ENG_20030401.0476 2 | AFP_ENG_20030413.0098 3 | AFP_ENG_20030415.0734 4 | AFP_ENG_20030417.0004 5 | AFP_ENG_20030417.0307 6 | AFP_ENG_20030417.0764 7 | AFP_ENG_20030418.0556 8 | AFP_ENG_20030425.0408 9 | AFP_ENG_20030427.0118 10 | AFP_ENG_20030428.0720 11 | AFP_ENG_20030429.0007 12 | AFP_ENG_20030430.0075 13 | AFP_ENG_20030502.0614 14 | AFP_ENG_20030504.0248 15 | AFP_ENG_20030508.0118 16 | AFP_ENG_20030508.0357 17 | AFP_ENG_20030509.0345 18 | AFP_ENG_20030514.0706 19 | AFP_ENG_20030519.0049 20 | AFP_ENG_20030519.0372 21 | AFP_ENG_20030522.0878 22 | 
AFP_ENG_20030527.0616 23 | AFP_ENG_20030528.0561 24 | AFP_ENG_20030530.0132 25 | AFP_ENG_20030601.0262 26 | AFP_ENG_20030607.0030 27 | AFP_ENG_20030616.0715 28 | AFP_ENG_20030617.0846 29 | AFP_ENG_20030625.0057 30 | AFP_ENG_20030630.0271 31 | APW_ENG_20030304.0555 32 | APW_ENG_20030306.0191 33 | APW_ENG_20030308.0314 34 | APW_ENG_20030310.0719 35 | APW_ENG_20030311.0775 36 | APW_ENG_20030318.0689 37 | APW_ENG_20030319.0545 38 | APW_ENG_20030322.0119 39 | APW_ENG_20030324.0768 40 | APW_ENG_20030325.0786 41 | -------------------------------------------------------------------------------- /resource/splits/ACE05-EP/train.doc.txt: -------------------------------------------------------------------------------- 1 | AFP_ENG_20030323.0020 2 | AFP_ENG_20030330.0211 3 | AGGRESSIVEVOICEDAILY_20041101.1144 4 | AGGRESSIVEVOICEDAILY_20041101.1806 5 | AGGRESSIVEVOICEDAILY_20041201.2313 6 | AGGRESSIVEVOICEDAILY_20041203.1959 7 | AGGRESSIVEVOICEDAILY_20041208.2133 8 | AGGRESSIVEVOICEDAILY_20041215.2302 9 | AGGRESSIVEVOICEDAILY_20041218.0146 10 | AGGRESSIVEVOICEDAILY_20041218.1004 11 | AGGRESSIVEVOICEDAILY_20041223.1449 12 | AGGRESSIVEVOICEDAILY_20041226.1712 13 | AGGRESSIVEVOICEDAILY_20050105.1344 14 | AGGRESSIVEVOICEDAILY_20050106.1310 15 | AGGRESSIVEVOICEDAILY_20050107.2012 16 | AGGRESSIVEVOICEDAILY_20050109.1627 17 | AGGRESSIVEVOICEDAILY_20050113.1400 18 | AGGRESSIVEVOICEDAILY_20050114.1922 19 | AGGRESSIVEVOICEDAILY_20050116.2149 20 | AGGRESSIVEVOICEDAILY_20050124.1354 21 | AGGRESSIVEVOICEDAILY_20050125.0136 22 | AGGRESSIVEVOICEDAILY_20050203.1356 23 | AGGRESSIVEVOICEDAILY_20050205.1954 24 | AGGRESSIVEVOICEDAILY_20050208.1142 25 | AGGRESSIVEVOICEDAILY_20050213.2123 26 | AGGRESSIVEVOICEDAILY_20050224.1207 27 | AGGRESSIVEVOICEDAILY_20050224.2252 28 | APW_ENG_20030326.0190 29 | APW_ENG_20030327.0376 30 | APW_ENG_20030331.0410 31 | APW_ENG_20030403.0862 32 | APW_ENG_20030404.0439 33 | APW_ENG_20030406.0191 34 | APW_ENG_20030407.0030 35 | APW_ENG_20030408.0090 36 | 
APW_ENG_20030409.0013 37 | APW_ENG_20030410.0906 38 | APW_ENG_20030411.0304 39 | APW_ENG_20030412.0531 40 | APW_ENG_20030414.0392 41 | APW_ENG_20030415.0742 42 | APW_ENG_20030416.0581 43 | APW_ENG_20030417.0555 44 | APW_ENG_20030418.0084 45 | APW_ENG_20030419.0358 46 | APW_ENG_20030422.0469 47 | APW_ENG_20030422.0485 48 | APW_ENG_20030423.0079 49 | APW_ENG_20030424.0532 50 | APW_ENG_20030424.0698 51 | APW_ENG_20030502.0470 52 | APW_ENG_20030502.0686 53 | APW_ENG_20030508.0772 54 | APW_ENG_20030510.0228 55 | APW_ENG_20030513.0139 56 | APW_ENG_20030519.0367 57 | APW_ENG_20030519.0548 58 | APW_ENG_20030520.0081 59 | APW_ENG_20030520.0757 60 | APW_ENG_20030527.0232 61 | APW_ENG_20030602.0037 62 | APW_ENG_20030603.0303 63 | APW_ENG_20030610.0010 64 | APW_ENG_20030610.0554 65 | APW_ENG_20030619.0383 66 | Austin-Grad-Community_20050212.2454 67 | BACONSREBELLION_20050123.1639 68 | BACONSREBELLION_20050125.1108 69 | BACONSREBELLION_20050127.1017 70 | BACONSREBELLION_20050204.1326 71 | BACONSREBELLION_20050205.1919 72 | BACONSREBELLION_20050206.1345 73 | BACONSREBELLION_20050209.0721 74 | BACONSREBELLION_20050210.0728 75 | BACONSREBELLION_20050214.0944 76 | BACONSREBELLION_20050216.1536 77 | BACONSREBELLION_20050216.1618 78 | BACONSREBELLION_20050216.1632 79 | BACONSREBELLION_20050217.0744 80 | BACONSREBELLION_20050218.0848 81 | BACONSREBELLION_20050218.1214 82 | BACONSREBELLION_20050222.0817 83 | BACONSREBELLION_20050222.1348 84 | BACONSREBELLION_20050226.1317 85 | BACONSREBELLION_20050227.1238 86 | CNNHL_ENG_20030312_150218.13 87 | CNNHL_ENG_20030331_193419.9 88 | CNNHL_ENG_20030402_133449.22 89 | CNNHL_ENG_20030402_193443.5 90 | CNNHL_ENG_20030403_133453.21 91 | CNNHL_ENG_20030403_193455.30 92 | CNNHL_ENG_20030407_193547.5 93 | CNNHL_ENG_20030410_193626.13 94 | CNNHL_ENG_20030411_230640.38 95 | CNNHL_ENG_20030415_193729.5 96 | CNNHL_ENG_20030416_133739.13 97 | CNNHL_ENG_20030416_133739.9 98 | CNNHL_ENG_20030416_193742.26 99 | CNNHL_ENG_20030416_193742.7 100 | 
CNNHL_ENG_20030416_230741.33 101 | CNNHL_ENG_20030425_183518.12 102 | CNNHL_ENG_20030428_123600.14 103 | CNNHL_ENG_20030429_220618.15 104 | CNNHL_ENG_20030430_220712.37 105 | CNNHL_ENG_20030505_220734.25 106 | CNNHL_ENG_20030513_183907.5 107 | CNNHL_ENG_20030513_220910.11 108 | CNNHL_ENG_20030519_124020.23 109 | CNNHL_ENG_20030523_221118.14 110 | CNNHL_ENG_20030526_221156.39 111 | CNNHL_ENG_20030603_230307.3 112 | CNNHL_ENG_20030604_230238.5 113 | CNNHL_ENG_20030609_133335.37 114 | CNNHL_ENG_20030610_133347.6 115 | CNNHL_ENG_20030610_230438.14 116 | CNNHL_ENG_20030611_133445.24 117 | CNNHL_ENG_20030616_230155.28 118 | CNNHL_ENG_20030616_230155.7 119 | CNNHL_ENG_20030618_230303.36 120 | CNNHL_ENG_20030618_230303.6 121 | CNNHL_ENG_20030624_133331.33 122 | CNNHL_ENG_20030624_230338.34 123 | CNNHL_ENG_20030625_193346.7 124 | CNNHL_ENG_20030625_230351.4 125 | CNN_CF_20030303.1900.00 126 | CNN_CF_20030303.1900.05 127 | CNN_CF_20030303.1900.06-1 128 | CNN_CF_20030303.1900.06-2 129 | CNN_CF_20030304.1900.01 130 | CNN_CF_20030304.1900.02 131 | CNN_CF_20030304.1900.04 132 | CNN_CF_20030304.1900.06-2 133 | CNN_CF_20030305.1900.00-1 134 | CNN_CF_20030305.1900.00-2 135 | CNN_CF_20030305.1900.00-3 136 | CNN_CF_20030305.1900.02 137 | CNN_CF_20030305.1900.06-1 138 | CNN_CF_20030305.1900.06-2 139 | CNN_ENG_20030305_170125.1 140 | CNN_ENG_20030306_070606.18 141 | CNN_ENG_20030306_083604.6 142 | CNN_ENG_20030312_083725.3 143 | CNN_ENG_20030312_223733.14 144 | CNN_ENG_20030313_083739.0 145 | CNN_ENG_20030318_140851.8 146 | CNN_ENG_20030320_153434.7 147 | CNN_ENG_20030325_150531.10 148 | CNN_ENG_20030325_220534.6 149 | CNN_ENG_20030327_163556.20 150 | CNN_ENG_20030329_170349.7 151 | CNN_ENG_20030331_123648.4 152 | CNN_ENG_20030331_193655.14 153 | CNN_ENG_20030401_073033.14 154 | CNN_ENG_20030401_233449.5 155 | CNN_ENG_20030402_190500.11 156 | CNN_ENG_20030403_060032.0 157 | CNN_ENG_20030403_080032.9 158 | CNN_ENG_20030403_090032.1 159 | CNN_ENG_20030403_180511.16 160 | 
CNN_ENG_20030403_183513.1 161 | CNN_ENG_20030404_073033.4 162 | CNN_ENG_20030404_163526.10 163 | CNN_ENG_20030407_080037.12 164 | CNN_ENG_20030407_130604.10 165 | CNN_ENG_20030407_170605.7 166 | CNN_ENG_20030408_083034.11 167 | CNN_ENG_20030408_123613.0 168 | CNN_ENG_20030408_153616.9 169 | CNN_ENG_20030408_200618.14 170 | CNN_ENG_20030409_180633.8 171 | CNN_ENG_20030410_183644.8 172 | CNN_ENG_20030411_070039.21 173 | CNN_ENG_20030411_193701.3 174 | CNN_ENG_20030411_233701.11 175 | CNN_ENG_20030414_130735.7 176 | CNN_ENG_20030415_103039.0 177 | CNN_ENG_20030415_173752.0 178 | CNN_ENG_20030415_180754.5 179 | CNN_ENG_20030415_183752.14 180 | CNN_ENG_20030416_100042.7 181 | CNN_ENG_20030416_160804.4 182 | CNN_ENG_20030416_180808.15 183 | CNN_ENG_20030416_190806.4 184 | CNN_ENG_20030417_063039.0 185 | CNN_ENG_20030417_073039.2 186 | CNN_ENG_20030418_063040.1 187 | CNN_ENG_20030418_083040.11 188 | CNN_ENG_20030418_130831.5 189 | CNN_ENG_20030418_163834.14 190 | CNN_ENG_20030421_090007.11 191 | CNN_ENG_20030421_120508.13 192 | CNN_ENG_20030421_120508.17 193 | CNN_ENG_20030421_133510.6 194 | CNN_ENG_20030422_083005.10 195 | CNN_ENG_20030422_213527.4 196 | CNN_ENG_20030423_180539.2 197 | CNN_ENG_20030424_073006.4 198 | CNN_ENG_20030424_113549.11 199 | CNN_ENG_20030424_173553.8 200 | CNN_ENG_20030424_183556.7 201 | CNN_ENG_20030425_063006.5 202 | CNN_ENG_20030425_133605.6 203 | CNN_ENG_20030426_160621.0 204 | CNN_ENG_20030428_130651.4 205 | CNN_ENG_20030428_173654.13 206 | CNN_ENG_20030428_193655.2 207 | CNN_ENG_20030429_083016.5 208 | CNN_ENG_20030429_110706.7 209 | CNN_ENG_20030429_143706.14 210 | CNN_ENG_20030429_170710.4 211 | CNN_ENG_20030429_190711.14 212 | CNN_ENG_20030430_063016.14 213 | CNN_ENG_20030430_093016.0 214 | CNN_ENG_20030430_160723.6 215 | CNN_ENG_20030501_063017.15 216 | CNN_ENG_20030501_160459.0 217 | CNN_ENG_20030502_080020.7 218 | CNN_ENG_20030502_093018.6 219 | CNN_ENG_20030505_090022.1 220 | CNN_ENG_20030506_053020.14 221 | 
CNN_ENG_20030506_160524.18 222 | CNN_ENG_20030506_163523.22 223 | CNN_ENG_20030507_060023.1 224 | CNN_ENG_20030507_160538.15 225 | CNN_ENG_20030507_170539.0 226 | CNN_ENG_20030508_170552.18 227 | CNN_ENG_20030508_210555.5 228 | CNN_ENG_20030509_090025.5 229 | CNN_ENG_20030509_123601.13 230 | CNN_ENG_20030512_190454.7 231 | CNN_ENG_20030513_080020.2 232 | CNN_ENG_20030513_113501.6 233 | CNN_ENG_20030513_160506.16 234 | CNN_ENG_20030514_130518.5 235 | CNN_ENG_20030515_063019.6 236 | CNN_ENG_20030515_073019.7 237 | CNN_ENG_20030515_193533.6 238 | CNN_ENG_20030516_090022.7 239 | CNN_ENG_20030516_123543.8 240 | CNN_ENG_20030524_143511.4 241 | CNN_ENG_20030525_143522.8 242 | CNN_ENG_20030525_160525.13 243 | CNN_ENG_20030526_133535.4 244 | CNN_ENG_20030526_180540.6 245 | CNN_ENG_20030526_183538.3 246 | CNN_ENG_20030527_195948.3 247 | CNN_ENG_20030527_215946.12 248 | CNN_ENG_20030528_082823.9 249 | CNN_ENG_20030528_125956.8 250 | CNN_ENG_20030528_165958.16 251 | CNN_ENG_20030528_172957.18 252 | CNN_ENG_20030528_195959.20 253 | CNN_ENG_20030529_085826.10 254 | CNN_ENG_20030529_130011.6 255 | CNN_ENG_20030530_130025.12 256 | CNN_ENG_20030602_072826.1 257 | CNN_ENG_20030602_102826.13 258 | CNN_ENG_20030602_105829.2 259 | CNN_ENG_20030602_133012.9 260 | CNN_ENG_20030603_095830.17 261 | CNN_ENG_20030603_133025.7 262 | CNN_ENG_20030604_092828.7 263 | CNN_ENG_20030604_102828.6 264 | CNN_ENG_20030605_065831.18 265 | CNN_ENG_20030605_085831.13 266 | CNN_ENG_20030605_105831.11 267 | CNN_ENG_20030605_153000.9 268 | CNN_ENG_20030605_193002.8 269 | CNN_ENG_20030605_223004.4 270 | CNN_ENG_20030607_170312.6 271 | CNN_ENG_20030607_173310.4 272 | CNN_ENG_20030610_085833.10 273 | CNN_ENG_20030610_095857.4 274 | CNN_ENG_20030610_105832.1 275 | CNN_ENG_20030610_123040.9 276 | CNN_ENG_20030610_130042.17 277 | CNN_ENG_20030610_133041.17 278 | CNN_ENG_20030611_102832.3 279 | CNN_ENG_20030611_102832.4 280 | CNN_ENG_20030611_175950.5 281 | CNN_ENG_20030612_072835.2 282 | CNN_ENG_20030612_160005.13 
283 | CNN_ENG_20030612_173004.10 284 | CNN_ENG_20030612_173004.2 285 | CNN_ENG_20030614_173123.4 286 | CNN_ENG_20030616_130059.25 287 | CNN_ENG_20030617_065838.21 288 | CNN_ENG_20030617_105836.4 289 | CNN_ENG_20030617_112838.4 290 | CNN_ENG_20030617_173115.14 291 | CNN_ENG_20030617_173115.22 292 | CNN_ENG_20030617_193116.10 293 | CNN_ENG_20030618_065839.11 294 | CNN_ENG_20030618_150128.5 295 | CNN_ENG_20030618_150128.6 296 | CNN_ENG_20030618_193127.17 297 | CNN_ENG_20030619_115954.10 298 | CNN_ENG_20030619_115954.4 299 | CNN_ENG_20030619_125955.10 300 | CNN_ENG_20030620_095840.4 301 | CNN_ENG_20030620_170011.14 302 | CNN_ENG_20030621_115841.16 303 | CNN_ENG_20030621_160254.25 304 | CNN_ENG_20030622_173306.9 305 | CNN_ENG_20030624_065843.24 306 | CNN_ENG_20030624_082841.12 307 | CNN_ENG_20030624_140104.22 308 | CNN_ENG_20030624_153103.16 309 | CNN_ENG_20030624_153103.17 310 | CNN_ENG_20030625_210122.0 311 | CNN_ENG_20030625_220123.3 312 | CNN_ENG_20030626_193133.8 313 | CNN_ENG_20030626_203133.11 314 | CNN_ENG_20030627_065846.3 315 | CNN_ENG_20030627_130145.6 316 | CNN_ENG_20030630_075848.7 317 | CNN_ENG_20030630_085848.18 318 | CNN_IP_20030328.1600.07 319 | CNN_IP_20030329.1600.00-3 320 | CNN_IP_20030329.1600.00-4 321 | CNN_IP_20030329.1600.00-5 322 | CNN_IP_20030329.1600.00-6 323 | CNN_IP_20030329.1600.01-1 324 | CNN_IP_20030329.1600.01-3 325 | CNN_IP_20030329.1600.02 326 | CNN_IP_20030330.1600.05-2 327 | CNN_IP_20030330.1600.06 328 | CNN_IP_20030402.1600.00-2 329 | CNN_IP_20030402.1600.00-3 330 | CNN_IP_20030402.1600.00-4 331 | CNN_IP_20030402.1600.02-1 332 | CNN_IP_20030402.1600.02-2 333 | CNN_IP_20030403.1600.00-1 334 | CNN_IP_20030403.1600.00-2 335 | CNN_IP_20030403.1600.00-3 336 | CNN_IP_20030403.1600.00-4 337 | CNN_IP_20030404.1600.00-1 338 | CNN_IP_20030404.1600.00-2 339 | CNN_IP_20030405.1600.00-2 340 | CNN_IP_20030405.1600.00-3 341 | CNN_IP_20030405.1600.01-2 342 | CNN_IP_20030405.1600.01-3 343 | CNN_IP_20030405.1600.02 344 | CNN_IP_20030406.1600.03 345 | 
CNN_IP_20030407.1600.05 346 | CNN_IP_20030408.1600.03 347 | CNN_IP_20030408.1600.04 348 | CNN_IP_20030409.1600.04 349 | CNN_IP_20030410.1600.03-1 350 | CNN_IP_20030410.1600.03-2 351 | CNN_IP_20030412.1600.03 352 | CNN_IP_20030412.1600.05 353 | CNN_IP_20030414.1600.04 354 | CNN_IP_20030417.1600.06 355 | CNN_IP_20030422.1600.05 356 | CNN_LE_20030504.1200.01 357 | CNN_LE_20030504.1200.02-1 358 | CNN_LE_20030504.1200.02-2 359 | FLOPPINGACES_20041113.1528.042 360 | FLOPPINGACES_20041114.1240.039 361 | FLOPPINGACES_20041115.1613.032 362 | FLOPPINGACES_20041116.0833.027 363 | FLOPPINGACES_20041228.0927.010 364 | FLOPPINGACES_20041230.1844.003 365 | FLOPPINGACES_20050101.2244.048 366 | GETTINGPOLITICAL_20050105.0127.001 367 | HEALINGIRAQ_20041108.1942.05 368 | Integritas-Group-Community-Forum_20050110.0557 369 | MARKBACKER_20041103.1300 370 | MARKBACKER_20041108.1507 371 | MARKBACKER_20041112.0707 372 | MARKBACKER_20041117.0723 373 | MARKBACKER_20041117.1107 374 | MARKBACKER_20041119.1002 375 | MARKBACKER_20041128.1641 376 | MARKBACKER_20041202.0711 377 | MARKBACKER_20041206.0733 378 | MARKBACKER_20041216.0656 379 | MARKBACKER_20041217.1639 380 | MARKBACKER_20041220.0919 381 | MARKBACKER_20050103.0829 382 | MARKBACKER_20050105.1526 383 | MARKBACKER_20050105.1632 384 | MARKBACKER_20050217.0647 385 | MARKETVIEW_20041209.1401 386 | MARKETVIEW_20041211.1845 387 | MARKETVIEW_20041212.1447 388 | MARKETVIEW_20041213.0722 389 | MARKETVIEW_20041215.2128 390 | MARKETVIEW_20041217.0801 391 | MARKETVIEW_20041219.1509 392 | MARKETVIEW_20041220.1537 393 | MARKETVIEW_20050105.1901 394 | MARKETVIEW_20050120.1641 395 | MARKETVIEW_20050126.0711 396 | MARKETVIEW_20050127.0716 397 | MARKETVIEW_20050201.0748 398 | MARKETVIEW_20050204.1322 399 | MARKETVIEW_20050204.1337 400 | MARKETVIEW_20050204.1736 401 | MARKETVIEW_20050205.1358 402 | MARKETVIEW_20050206.1951 403 | MARKETVIEW_20050206.2009 404 | MARKETVIEW_20050207.0746 405 | MARKETVIEW_20050208.2033 406 | MARKETVIEW_20050208.2059 407 | 
MARKETVIEW_20050209.1923 408 | MARKETVIEW_20050210.2138 409 | MARKETVIEW_20050212.1607 410 | MARKETVIEW_20050212.1717 411 | MARKETVIEW_20050214.2115 412 | MARKETVIEW_20050215.1858 413 | MARKETVIEW_20050216.2120 414 | MARKETVIEW_20050217.2115 415 | MARKETVIEW_20050222.0729 416 | MARKETVIEW_20050222.1919 417 | MARKETVIEW_20050225.0541 418 | MARKETVIEW_20050226.1307 419 | MARKETVIEW_20050226.1444 420 | MARKETVIEW_20050228.2211 421 | NYT_ENG_20030403.0008 422 | NYT_ENG_20030602.0074 423 | NYT_ENG_20030630.0079 424 | OIADVANTAGE_20041224.1007 425 | OIADVANTAGE_20050103.0944 426 | OIADVANTAGE_20050105.0922 427 | OIADVANTAGE_20050108.1323 428 | OIADVANTAGE_20050109.1947 429 | OIADVANTAGE_20050110.1009 430 | OIADVANTAGE_20050203.1000 431 | OIADVANTAGE_20050203.2102 432 | OIADVANTAGE_20050204.1155 433 | XIN_ENG_20030314.0208 434 | XIN_ENG_20030317.0177 435 | XIN_ENG_20030324.0191 436 | XIN_ENG_20030327.0202 437 | XIN_ENG_20030408.0341 438 | XIN_ENG_20030415.0379 439 | XIN_ENG_20030423.0011 440 | XIN_ENG_20030425.0184 441 | XIN_ENG_20030509.0137 442 | XIN_ENG_20030513.0002 443 | XIN_ENG_20030523.0202 444 | XIN_ENG_20030609.0118 445 | XIN_ENG_20030610.0299 446 | XIN_ENG_20030616.0274 447 | XIN_ENG_20030624.0085 448 | alt.atheism_20041104.2428 449 | alt.books.tom-clancy_20050130.1848 450 | alt.collecting.autographs_20050224.2438 451 | alt.corel_20041228.0503 452 | alt.gossip.celebrities_20041118.2331 453 | alt.gossip.celebrities_20050218.0826 454 | alt.obituaries_20041121.1339 455 | alt.politics.economics_20041206.1835 456 | alt.politics_20050124.0640 457 | alt.religion.mormon_20050103.0854 458 | alt.support.divorce_20050113.2451 459 | alt.sys.pc-clone.dell_20050226.2350 460 | alt.vacation.las-vegas_20050109.0133 461 | aus.cars_20041206.0903 462 | fsh_29097 463 | fsh_29105 464 | fsh_29121 465 | fsh_29138 466 | fsh_29139 467 | fsh_29141 468 | fsh_29171 469 | fsh_29187 470 | fsh_29191 471 | fsh_29192 472 | fsh_29195 473 | fsh_29226 474 | fsh_29272 475 | fsh_29302 476 | fsh_29303 
477 | fsh_29326 478 | fsh_29336 479 | fsh_29344 480 | fsh_29348 481 | fsh_29350 482 | fsh_29361 483 | fsh_29388 484 | fsh_29395 485 | fsh_29505 486 | fsh_29520 487 | fsh_29521 488 | fsh_29526 489 | fsh_29581_1 490 | fsh_29586 491 | fsh_29592 492 | fsh_29601 493 | fsh_29622 494 | fsh_29628 495 | fsh_29630 496 | fsh_29770 497 | fsh_29774 498 | fsh_29782_2 499 | fsh_29783 500 | fsh_29786 501 | marcellapr_20050211.2013 502 | misc.invest.marketplace_20050208.2406 503 | misc.kids.pregnancy_20050120.0404 504 | misc.legal.moderated_20041202.1648 505 | misc.legal.moderated_20050129.2225 506 | misc.survivalism_20050210.0232 507 | misc.taxes_20050218.1250 508 | rec.arts.mystery_20050219.1126 509 | rec.arts.sf.written.robert-jordan_20050208.1350 510 | rec.boats_20050130.1006 511 | rec.music.makers.guitar.acoustic_20041228.1628 512 | rec.music.phish_20041215.1554 513 | rec.music.phish_20050217.1804 514 | rec.parks.theme_20050217.2019 515 | rec.sport.disc_20050209.2202 516 | rec.travel.cruises_20050216.1636 517 | rec.travel.cruises_20050222.0313 518 | rec.travel.europe_20050101.1800 519 | rec.travel.usa-canada_20050128.0121 520 | seattle.politics_20050122.2412 521 | soc.culture.china_20050203.0639 522 | soc.culture.hmong_20050210.1130 523 | soc.culture.indian_20041104.2348 524 | soc.culture.iraq_20050211.0445 525 | soc.culture.jewish_20050130.2105 526 | soc.history.war.world-war-ii_20050127.2403 527 | soc.history.what-if_20050129.1404 528 | talk.politics.misc_20050216.1337 529 | uk.gay-lesbian-bi_20050127.0311 530 | -------------------------------------------------------------------------------- /resource/splits/ACE05-EP/train_more.doc.txt: -------------------------------------------------------------------------------- 1 | CNN_CF_20030303.1900.00 2 | CNN_CF_20030303.1900.05 3 | CNN_CF_20030303.1900.06-1 4 | CNN_CF_20030303.1900.06-2 5 | CNN_CF_20030304.1900.02 6 | CNN_CF_20030304.1900.04 7 | CNN_CF_20030304.1900.06-2 8 | CNN_CF_20030305.1900.00-1 9 | CNN_CF_20030305.1900.00-2 
10 | CNN_CF_20030305.1900.00-3 11 | CNN_CF_20030305.1900.02 12 | CNN_CF_20030305.1900.06-1 13 | CNN_CF_20030305.1900.06-2 14 | CNN_IP_20030328.1600.07 15 | CNN_IP_20030329.1600.00-3 16 | CNN_IP_20030329.1600.00-4 17 | CNN_IP_20030329.1600.00-5 18 | CNN_IP_20030329.1600.00-6 19 | CNN_IP_20030329.1600.01-1 20 | CNN_IP_20030329.1600.01-3 21 | CNN_IP_20030329.1600.02 22 | CNN_IP_20030330.1600.05-2 23 | CNN_IP_20030330.1600.06 24 | CNN_IP_20030402.1600.00-2 25 | CNN_IP_20030402.1600.00-3 26 | CNN_IP_20030402.1600.00-4 27 | CNN_IP_20030402.1600.02-1 28 | CNN_IP_20030402.1600.02-2 29 | CNN_IP_20030403.1600.00-1 30 | CNN_IP_20030403.1600.00-2 31 | CNN_IP_20030403.1600.00-3 32 | CNN_IP_20030403.1600.00-4 33 | CNN_IP_20030404.1600.00-1 34 | CNN_IP_20030404.1600.00-2 35 | CNN_IP_20030405.1600.00-2 36 | CNN_IP_20030405.1600.00-3 37 | CNN_IP_20030405.1600.01-2 38 | CNN_IP_20030405.1600.01-3 39 | CNN_IP_20030405.1600.02 40 | CNN_IP_20030406.1600.03 41 | CNN_IP_20030407.1600.05 42 | CNN_IP_20030408.1600.03 43 | CNN_IP_20030408.1600.04 44 | CNN_IP_20030409.1600.04 45 | CNN_IP_20030410.1600.03-1 46 | CNN_IP_20030410.1600.03-2 47 | CNN_IP_20030412.1600.03 48 | CNN_IP_20030412.1600.05 49 | CNN_IP_20030414.1600.04 50 | CNN_IP_20030417.1600.06 51 | CNN_IP_20030422.1600.05 52 | CNN_LE_20030504.1200.01 53 | CNN_LE_20030504.1200.02-1 54 | CNN_LE_20030504.1200.02-2 55 | CNNHL_ENG_20030312_150218.13 56 | CNNHL_ENG_20030331_193419.9 57 | CNNHL_ENG_20030402_133449.22 58 | CNNHL_ENG_20030402_193443.5 59 | CNNHL_ENG_20030403_133453.21 60 | CNNHL_ENG_20030403_193455.30 61 | CNNHL_ENG_20030407_193547.5 62 | CNNHL_ENG_20030411_230640.38 63 | CNNHL_ENG_20030415_193729.5 64 | CNNHL_ENG_20030416_133739.13 65 | CNNHL_ENG_20030416_133739.9 66 | CNNHL_ENG_20030416_193742.26 67 | CNNHL_ENG_20030416_193742.7 68 | CNNHL_ENG_20030416_230741.33 69 | CNNHL_ENG_20030425_183518.12 70 | CNNHL_ENG_20030428_123600.14 71 | CNNHL_ENG_20030429_220618.15 72 | CNNHL_ENG_20030430_220712.37 73 | 
CNNHL_ENG_20030505_220734.25 74 | CNNHL_ENG_20030513_183907.5 75 | CNNHL_ENG_20030513_220910.11 76 | CNNHL_ENG_20030519_124020.23 77 | CNNHL_ENG_20030523_221118.14 78 | CNNHL_ENG_20030526_221156.39 79 | CNNHL_ENG_20030603_230307.3 80 | CNNHL_ENG_20030604_230238.5 81 | CNNHL_ENG_20030609_133335.37 82 | CNNHL_ENG_20030610_133347.6 83 | CNNHL_ENG_20030610_230438.14 84 | CNNHL_ENG_20030611_133445.24 85 | CNNHL_ENG_20030616_230155.28 86 | CNNHL_ENG_20030616_230155.7 87 | CNNHL_ENG_20030618_230303.36 88 | CNNHL_ENG_20030618_230303.6 89 | CNNHL_ENG_20030624_133331.33 90 | CNNHL_ENG_20030624_230338.34 91 | CNNHL_ENG_20030625_193346.7 92 | CNNHL_ENG_20030625_230351.4 93 | CNN_ENG_20030305_170125.1 94 | CNN_ENG_20030306_070606.18 95 | CNN_ENG_20030306_083604.6 96 | CNN_ENG_20030312_083725.3 97 | CNN_ENG_20030312_223733.14 98 | CNN_ENG_20030313_083739.0 99 | CNN_ENG_20030318_140851.8 100 | CNN_ENG_20030320_153434.7 101 | CNN_ENG_20030325_150531.10 102 | CNN_ENG_20030325_220534.6 103 | CNN_ENG_20030327_163556.20 104 | CNN_ENG_20030329_170349.7 105 | CNN_ENG_20030331_123648.4 106 | CNN_ENG_20030331_193655.14 107 | CNN_ENG_20030401_073033.14 108 | CNN_ENG_20030401_233449.5 109 | CNN_ENG_20030402_190500.11 110 | CNN_ENG_20030403_060032.0 111 | CNN_ENG_20030403_080032.9 112 | CNN_ENG_20030403_090032.1 113 | CNN_ENG_20030403_180511.16 114 | CNN_ENG_20030403_183513.1 115 | CNN_ENG_20030404_073033.4 116 | CNN_ENG_20030404_163526.10 117 | CNN_ENG_20030407_080037.12 118 | CNN_ENG_20030407_130604.10 119 | CNN_ENG_20030407_170605.7 120 | CNN_ENG_20030408_083034.11 121 | CNN_ENG_20030408_123613.0 122 | CNN_ENG_20030408_153616.9 123 | CNN_ENG_20030408_200618.14 124 | CNN_ENG_20030409_180633.8 125 | CNN_ENG_20030410_183644.8 126 | CNN_ENG_20030411_193701.3 127 | CNN_ENG_20030411_233701.11 128 | CNN_ENG_20030414_130735.7 129 | CNN_ENG_20030415_103039.0 130 | CNN_ENG_20030415_173752.0 131 | CNN_ENG_20030415_180754.5 132 | CNN_ENG_20030415_183752.14 133 | CNN_ENG_20030416_100042.7 134 | 
CNN_ENG_20030416_160804.4 135 | CNN_ENG_20030416_180808.15 136 | CNN_ENG_20030416_190806.4 137 | CNN_ENG_20030417_063039.0 138 | CNN_ENG_20030417_073039.2 139 | CNN_ENG_20030418_063040.1 140 | CNN_ENG_20030418_083040.11 141 | CNN_ENG_20030418_130831.5 142 | CNN_ENG_20030418_163834.14 143 | CNN_ENG_20030421_090007.11 144 | CNN_ENG_20030421_120508.13 145 | CNN_ENG_20030421_120508.17 146 | CNN_ENG_20030421_133510.6 147 | CNN_ENG_20030422_083005.10 148 | CNN_ENG_20030422_213527.4 149 | CNN_ENG_20030423_180539.2 150 | CNN_ENG_20030424_073006.4 151 | CNN_ENG_20030424_113549.11 152 | CNN_ENG_20030424_173553.8 153 | CNN_ENG_20030424_183556.7 154 | CNN_ENG_20030425_063006.5 155 | CNN_ENG_20030425_133605.6 156 | CNN_ENG_20030426_160621.0 157 | CNN_ENG_20030428_130651.4 158 | CNN_ENG_20030428_173654.13 159 | CNN_ENG_20030428_193655.2 160 | CNN_ENG_20030429_083016.5 161 | CNN_ENG_20030429_110706.7 162 | CNN_ENG_20030429_143706.14 163 | CNN_ENG_20030429_170710.4 164 | CNN_ENG_20030429_190711.14 165 | CNN_ENG_20030430_063016.14 166 | CNN_ENG_20030430_093016.0 167 | CNN_ENG_20030430_160723.6 168 | CNN_ENG_20030501_063017.15 169 | CNN_ENG_20030501_160459.0 170 | CNN_ENG_20030502_080020.7 171 | CNN_ENG_20030502_093018.6 172 | CNN_ENG_20030505_090022.1 173 | CNN_ENG_20030506_053020.14 174 | CNN_ENG_20030506_160524.18 175 | CNN_ENG_20030506_163523.22 176 | CNN_ENG_20030507_060023.1 177 | CNN_ENG_20030507_160538.15 178 | CNN_ENG_20030507_170539.0 179 | CNN_ENG_20030508_170552.18 180 | CNN_ENG_20030508_210555.5 181 | CNN_ENG_20030509_090025.5 182 | CNN_ENG_20030509_123601.13 183 | CNN_ENG_20030512_190454.7 184 | CNN_ENG_20030513_080020.2 185 | CNN_ENG_20030513_113501.6 186 | CNN_ENG_20030513_160506.16 187 | CNN_ENG_20030514_130518.5 188 | CNN_ENG_20030515_063019.6 189 | CNN_ENG_20030515_073019.7 190 | CNN_ENG_20030515_193533.6 191 | CNN_ENG_20030516_090022.7 192 | CNN_ENG_20030516_123543.8 193 | CNN_ENG_20030524_143511.4 194 | CNN_ENG_20030525_143522.8 195 | CNN_ENG_20030525_160525.13 
196 | CNN_ENG_20030526_133535.4 197 | CNN_ENG_20030526_180540.6 198 | CNN_ENG_20030526_183538.3 199 | CNN_ENG_20030527_195948.3 200 | CNN_ENG_20030527_215946.12 201 | CNN_ENG_20030528_082823.9 202 | CNN_ENG_20030528_125956.8 203 | CNN_ENG_20030528_165958.16 204 | CNN_ENG_20030528_172957.18 205 | CNN_ENG_20030528_195959.20 206 | CNN_ENG_20030529_085826.10 207 | CNN_ENG_20030529_130011.6 208 | CNN_ENG_20030530_130025.12 209 | CNN_ENG_20030602_072826.1 210 | CNN_ENG_20030602_102826.13 211 | CNN_ENG_20030602_105829.2 212 | CNN_ENG_20030602_133012.9 213 | CNN_ENG_20030603_095830.17 214 | CNN_ENG_20030603_133025.7 215 | CNN_ENG_20030604_092828.7 216 | CNN_ENG_20030604_102828.6 217 | CNN_ENG_20030605_065831.18 218 | CNN_ENG_20030605_085831.13 219 | CNN_ENG_20030605_105831.11 220 | CNN_ENG_20030605_193002.8 221 | CNN_ENG_20030605_223004.4 222 | CNN_ENG_20030607_170312.6 223 | CNN_ENG_20030607_173310.4 224 | CNN_ENG_20030610_085833.10 225 | CNN_ENG_20030610_095857.4 226 | CNN_ENG_20030610_105832.1 227 | CNN_ENG_20030610_123040.9 228 | CNN_ENG_20030610_130042.17 229 | CNN_ENG_20030610_133041.17 230 | CNN_ENG_20030611_102832.3 231 | CNN_ENG_20030611_102832.4 232 | CNN_ENG_20030611_175950.5 233 | CNN_ENG_20030612_072835.2 234 | CNN_ENG_20030612_160005.13 235 | CNN_ENG_20030612_173004.10 236 | CNN_ENG_20030612_173004.2 237 | CNN_ENG_20030614_173123.4 238 | CNN_ENG_20030616_130059.25 239 | CNN_ENG_20030617_065838.21 240 | CNN_ENG_20030617_105836.4 241 | CNN_ENG_20030617_112838.4 242 | CNN_ENG_20030617_173115.14 243 | CNN_ENG_20030617_173115.22 244 | CNN_ENG_20030617_193116.10 245 | CNN_ENG_20030618_065839.11 246 | CNN_ENG_20030618_150128.5 247 | CNN_ENG_20030618_150128.6 248 | CNN_ENG_20030618_193127.17 249 | CNN_ENG_20030619_115954.10 250 | CNN_ENG_20030619_115954.4 251 | CNN_ENG_20030619_125955.10 252 | CNN_ENG_20030620_095840.4 253 | CNN_ENG_20030620_170011.14 254 | CNN_ENG_20030621_115841.16 255 | CNN_ENG_20030621_160254.25 256 | CNN_ENG_20030622_173306.9 257 | 
CNN_ENG_20030624_065843.24 258 | CNN_ENG_20030624_082841.12 259 | CNN_ENG_20030624_140104.22 260 | CNN_ENG_20030624_153103.16 261 | CNN_ENG_20030624_153103.17 262 | CNN_ENG_20030625_210122.0 263 | CNN_ENG_20030625_220123.3 264 | CNN_ENG_20030626_193133.8 265 | CNN_ENG_20030627_065846.3 266 | CNN_ENG_20030627_130145.6 267 | CNN_ENG_20030630_075848.7 268 | CNN_ENG_20030630_085848.18 269 | fsh_29097 270 | fsh_29105 271 | fsh_29121 272 | fsh_29138 273 | fsh_29139 274 | fsh_29141 275 | fsh_29171 276 | fsh_29187 277 | fsh_29191 278 | fsh_29192 279 | fsh_29195 280 | fsh_29226 281 | fsh_29272 282 | fsh_29302 283 | fsh_29303 284 | fsh_29326 285 | fsh_29336 286 | fsh_29344 287 | fsh_29348 288 | fsh_29350 289 | fsh_29361 290 | fsh_29388 291 | fsh_29395 292 | fsh_29505 293 | fsh_29520 294 | fsh_29521 295 | fsh_29526 296 | fsh_29581_1 297 | fsh_29586 298 | fsh_29592 299 | fsh_29601 300 | fsh_29622 301 | fsh_29628 302 | fsh_29630 303 | fsh_29770 304 | fsh_29774 305 | fsh_29782_2 306 | fsh_29783 307 | fsh_29786 308 | APW_ENG_20030326.0190 309 | APW_ENG_20030327.0376 310 | APW_ENG_20030331.0410 311 | APW_ENG_20030403.0862 312 | APW_ENG_20030404.0439 313 | APW_ENG_20030406.0191 314 | APW_ENG_20030407.0030 315 | APW_ENG_20030408.0090 316 | APW_ENG_20030409.0013 317 | APW_ENG_20030410.0906 318 | APW_ENG_20030411.0304 319 | APW_ENG_20030412.0531 320 | APW_ENG_20030414.0392 321 | APW_ENG_20030415.0742 322 | APW_ENG_20030416.0581 323 | APW_ENG_20030417.0555 324 | APW_ENG_20030418.0084 325 | APW_ENG_20030419.0358 326 | APW_ENG_20030422.0469 327 | APW_ENG_20030422.0485 328 | APW_ENG_20030423.0079 329 | APW_ENG_20030424.0532 330 | APW_ENG_20030424.0698 331 | APW_ENG_20030502.0470 332 | APW_ENG_20030502.0686 333 | APW_ENG_20030508.0772 334 | APW_ENG_20030510.0228 335 | APW_ENG_20030513.0139 336 | APW_ENG_20030519.0367 337 | APW_ENG_20030519.0548 338 | APW_ENG_20030520.0081 339 | APW_ENG_20030520.0757 340 | APW_ENG_20030527.0232 341 | APW_ENG_20030602.0037 342 | APW_ENG_20030603.0303 343 | 
APW_ENG_20030610.0010 344 | APW_ENG_20030610.0554 345 | APW_ENG_20030619.0383 346 | NYT_ENG_20030403.0008 347 | NYT_ENG_20030602.0074 348 | NYT_ENG_20030630.0079 349 | XIN_ENG_20030314.0208 350 | XIN_ENG_20030317.0177 351 | XIN_ENG_20030324.0191 352 | XIN_ENG_20030327.0202 353 | XIN_ENG_20030408.0341 354 | XIN_ENG_20030415.0379 355 | XIN_ENG_20030423.0011 356 | XIN_ENG_20030425.0184 357 | XIN_ENG_20030509.0137 358 | XIN_ENG_20030513.0002 359 | XIN_ENG_20030523.0202 360 | XIN_ENG_20030609.0118 361 | XIN_ENG_20030610.0299 362 | XIN_ENG_20030616.0274 363 | XIN_ENG_20030624.0085 364 | Austin-Grad-Community_20050212.2454 365 | Integritas-Group-Community-Forum_20050110.0557 366 | alt.atheism_20041104.2428 367 | alt.books.tom-clancy_20050130.1848 368 | alt.collecting.autographs_20050224.2438 369 | alt.corel_20041228.0503 370 | alt.gossip.celebrities_20041118.2331 371 | alt.gossip.celebrities_20050218.0826 372 | alt.obituaries_20041121.1339 373 | alt.politics.economics_20041206.1835 374 | alt.politics_20050124.0640 375 | alt.religion.mormon_20050103.0854 376 | alt.support.divorce_20050113.2451 377 | alt.sys.pc-clone.dell_20050226.2350 378 | alt.vacation.las-vegas_20050109.0133 379 | aus.cars_20041206.0903 380 | misc.invest.marketplace_20050208.2406 381 | misc.kids.pregnancy_20050120.0404 382 | misc.legal.moderated_20041202.1648 383 | misc.legal.moderated_20050129.2225 384 | misc.survivalism_20050210.0232 385 | misc.taxes_20050218.1250 386 | rec.arts.mystery_20050219.1126 387 | rec.arts.sf.written.robert-jordan_20050208.1350 388 | rec.boats_20050130.1006 389 | rec.music.makers.guitar.acoustic_20041228.1628 390 | rec.music.phish_20041215.1554 391 | rec.music.phish_20050217.1804 392 | rec.parks.theme_20050217.2019 393 | rec.sport.disc_20050209.2202 394 | rec.travel.cruises_20050216.1636 395 | rec.travel.cruises_20050222.0313 396 | rec.travel.europe_20050101.1800 397 | rec.travel.usa-canada_20050128.0121 398 | seattle.politics_20050122.2412 399 | 
soc.culture.china_20050203.0639 400 | soc.culture.hmong_20050210.1130 401 | soc.culture.indian_20041104.2348 402 | soc.culture.iraq_20050211.0445 403 | soc.culture.jewish_20050130.2105 404 | soc.history.war.world-war-ii_20050127.2403 405 | soc.history.what-if_20050129.1404 406 | talk.politics.misc_20050216.1337 407 | uk.gay-lesbian-bi_20050127.0311 408 | AGGRESSIVEVOICEDAILY_20041101.1144 409 | AGGRESSIVEVOICEDAILY_20041101.1806 410 | AGGRESSIVEVOICEDAILY_20041201.2313 411 | AGGRESSIVEVOICEDAILY_20041203.1959 412 | AGGRESSIVEVOICEDAILY_20041208.2133 413 | AGGRESSIVEVOICEDAILY_20041215.2302 414 | AGGRESSIVEVOICEDAILY_20041218.0146 415 | AGGRESSIVEVOICEDAILY_20041218.1004 416 | AGGRESSIVEVOICEDAILY_20041223.1449 417 | AGGRESSIVEVOICEDAILY_20041226.1712 418 | AGGRESSIVEVOICEDAILY_20050105.1344 419 | AGGRESSIVEVOICEDAILY_20050106.1310 420 | AGGRESSIVEVOICEDAILY_20050107.2012 421 | AGGRESSIVEVOICEDAILY_20050109.1627 422 | AGGRESSIVEVOICEDAILY_20050113.1400 423 | AGGRESSIVEVOICEDAILY_20050114.1922 424 | AGGRESSIVEVOICEDAILY_20050116.2149 425 | AGGRESSIVEVOICEDAILY_20050124.1354 426 | AGGRESSIVEVOICEDAILY_20050125.0136 427 | AGGRESSIVEVOICEDAILY_20050203.1356 428 | AGGRESSIVEVOICEDAILY_20050205.1954 429 | AGGRESSIVEVOICEDAILY_20050208.1142 430 | AGGRESSIVEVOICEDAILY_20050213.2123 431 | AGGRESSIVEVOICEDAILY_20050224.1207 432 | AGGRESSIVEVOICEDAILY_20050224.2252 433 | BACONSREBELLION_20050123.1639 434 | BACONSREBELLION_20050125.1108 435 | BACONSREBELLION_20050127.1017 436 | BACONSREBELLION_20050204.1326 437 | BACONSREBELLION_20050205.1919 438 | BACONSREBELLION_20050206.1345 439 | BACONSREBELLION_20050209.0721 440 | BACONSREBELLION_20050210.0728 441 | BACONSREBELLION_20050214.0944 442 | BACONSREBELLION_20050216.1536 443 | BACONSREBELLION_20050216.1618 444 | BACONSREBELLION_20050216.1632 445 | BACONSREBELLION_20050217.0744 446 | BACONSREBELLION_20050218.0848 447 | BACONSREBELLION_20050218.1214 448 | BACONSREBELLION_20050222.1348 449 | BACONSREBELLION_20050227.1238 450 | 
FLOPPINGACES_20041113.1528.042 451 | FLOPPINGACES_20041114.1240.039 452 | FLOPPINGACES_20041115.1613.032 453 | FLOPPINGACES_20041116.0833.027 454 | FLOPPINGACES_20041228.0927.010 455 | FLOPPINGACES_20041230.1844.003 456 | FLOPPINGACES_20050101.2244.048 457 | GETTINGPOLITICAL_20050105.0127.001 458 | HEALINGIRAQ_20041108.1942.05 459 | MARKBACKER_20041103.1300 460 | MARKBACKER_20041108.1507 461 | MARKBACKER_20041112.0707 462 | MARKBACKER_20041117.0723 463 | MARKBACKER_20041117.1107 464 | MARKBACKER_20041119.1002 465 | MARKBACKER_20041128.1641 466 | MARKBACKER_20041202.0711 467 | MARKBACKER_20041206.0733 468 | MARKBACKER_20041216.0656 469 | MARKBACKER_20041217.1639 470 | MARKBACKER_20041220.0919 471 | MARKBACKER_20050103.0829 472 | MARKBACKER_20050105.1526 473 | MARKBACKER_20050105.1632 474 | MARKBACKER_20050217.0647 475 | MARKETVIEW_20041209.1401 476 | MARKETVIEW_20041211.1845 477 | MARKETVIEW_20041212.1447 478 | MARKETVIEW_20041213.0722 479 | MARKETVIEW_20041215.2128 480 | MARKETVIEW_20041217.0801 481 | MARKETVIEW_20041219.1509 482 | MARKETVIEW_20041220.1537 483 | MARKETVIEW_20050105.1901 484 | MARKETVIEW_20050120.1641 485 | MARKETVIEW_20050126.0711 486 | MARKETVIEW_20050127.0716 487 | MARKETVIEW_20050201.0748 488 | MARKETVIEW_20050204.1322 489 | MARKETVIEW_20050204.1337 490 | MARKETVIEW_20050204.1736 491 | MARKETVIEW_20050205.1358 492 | MARKETVIEW_20050206.1951 493 | MARKETVIEW_20050206.2009 494 | MARKETVIEW_20050207.0746 495 | MARKETVIEW_20050208.2033 496 | MARKETVIEW_20050208.2059 497 | MARKETVIEW_20050209.1923 498 | MARKETVIEW_20050210.2138 499 | MARKETVIEW_20050212.1607 500 | MARKETVIEW_20050212.1717 501 | MARKETVIEW_20050214.2115 502 | MARKETVIEW_20050215.1858 503 | MARKETVIEW_20050216.2120 504 | MARKETVIEW_20050217.2115 505 | MARKETVIEW_20050222.0729 506 | MARKETVIEW_20050222.1919 507 | MARKETVIEW_20050225.0541 508 | MARKETVIEW_20050226.1307 509 | MARKETVIEW_20050226.1444 510 | MARKETVIEW_20050228.2211 511 | OIADVANTAGE_20041224.1007 512 | 
OIADVANTAGE_20050103.0944 513 | OIADVANTAGE_20050105.0922 514 | OIADVANTAGE_20050108.1323 515 | OIADVANTAGE_20050109.1947 516 | OIADVANTAGE_20050110.1009 517 | OIADVANTAGE_20050203.1000 518 | OIADVANTAGE_20050203.2102 519 | OIADVANTAGE_20050204.1155 520 | CNN_CF_20030304.1900.01 521 | marcellapr_20050211.2013 522 | BACONSREBELLION_20050222.0817 523 | BACONSREBELLION_20050226.1317 524 | CNN_ENG_20030626_203133.11 525 | CNN_ENG_20030605_153000.9 526 | CNN_ENG_20030411_070039.21 527 | CNNHL_ENG_20030410_193626.13 528 | AFP_ENG_20030330.0211 529 | AFP_ENG_20030323.0020 530 | AFP_ENG_20030304.0250 531 | rec.games.chess.politics_20041216.1047 532 | -------------------------------------------------------------------------------- /resource/splits/ERE-EN/dev.doc.txt: -------------------------------------------------------------------------------- 1 | 101d0fc4a78dc1b84953ebd399b2fad5 2 | 0f03cc5a508d630c6c8c8c61396e31a9 3 | NYT_ENG_20130910.0191 4 | 14294db341956a71811c9dd015b04ed7 5 | 0659c87d9fd3d5efd258ee6de3ba1003 6 | 11a29a0d63a79b0f5d19ccae1838b125 7 | 3dff15d768dbfe27e4d6b81fb63aee95 8 | 4aea880c68f1708f68271a7913f2001f 9 | 2bdb9d86091c6f412ffa767bdc749be9 10 | 1a0f894682abf633cc94b06405b78a8e 11 | 7bac41e8aea34c7ef9462fcc1a572109 12 | 45b9b8f7d17ce5f352c16a339e96705f 13 | 75a85a5de2dd86d7b7662b83aa639d0a 14 | 06fa2a5cdc50c1d2a96bfe02adcc0b40 15 | 22ca1a5aa492b429d274169c54554a7c 16 | edb392c8323a4f5f27cc0e59df409c68 17 | NYT_ENG_20131022.0102 18 | 9e49d5babe9b22ac5ebe1afd3d440ff2 19 | 5bac42475431a87070720e94b27cfd99 20 | 48dafc1e3678fa7b13cb467ab3eed071 21 | 3ddbad6f438c88eec387131477ffe1b9 22 | 44169f6a3f5b04e8dbab2a26e572a136 23 | NYT_ENG_20131029.0228 24 | bec156fe4d6369a40f347477578d28b0 25 | 14fbeb82a73a7df37bcda0583c9bca7e 26 | 61d2b0dcc730f0b4e92ae0d1929b3caf 27 | 428e1e095b4e6e830b47e72f133faf87 28 | APW_ENG_20090611.0697 29 | bb1fba8ce6504faf37892e990d50fb68 30 | c0cae135f2727d4e61315f719cb27434 31 | 90f8a4e01d7a52940959427f10e45f8c 32 | 
-------------------------------------------------------------------------------- /resource/splits/ERE-EN/test.doc.txt: -------------------------------------------------------------------------------- 1 | NYT_ENG_20130716.0217 2 | 963549e727a8abe0e772e51580fca702 3 | 35621bc5e29e511198d6eabe34676975 4 | NYT_ENG_20130625.0192 5 | 17a2dc40635ec239e9e16d10b6dd45e8 6 | NYT_ENG_20130712.0047 7 | f81535eaaa2c20ef26d54d1d87a02186 8 | 7677d625b58ce649c8aeda2ff4a56389 9 | ae6d0c01a0bea085e48016ac29a3c535 10 | 4622b60202cf3944119daf2be53aa74f 11 | NYT_ENG_20130506.0045 12 | 56af144a4d1d2e662531bdfd00d3c725 13 | 0e6c9afe37a18411d275ee225a0f0f9b 14 | 34d49f3357eaf14c849e9cdfeb893273 15 | dd0b65f632f64369c530f9bbb4b024b4 16 | 0648a08469a3be9eb972f0d213562805 17 | aa33a695c3e28d1f3dd03f4e0b373f70 18 | 1f288dcbcb562b39031c6a9402ebf6d0 19 | e8ad0cb1356161f82fb56c9f88b41990 20 | e5e3faef4fb44311a0ec8aab24903c41 21 | c728ed6c29213079b5f66788047ec89e 22 | 6154640fdb94510274583591cad7b379 23 | 5bbe1c6185296d179b95810e48ee3834 24 | a268efbb260f633c3979688e3b07e7d0 25 | bb6cb93cbd13b91ca52bfc582af0eb45 26 | 19569b08f07d751d6ac4a07633653c50 27 | 3b4d58c0a53671c6ce03f0529bb6089d 28 | a72d82525600c5a2e1aa428264bf089c 29 | d81d2b468875c49a9f6453d78a8e1ddc 30 | a08e03759505523de8475e3bf906dd5d 31 | NYT_ENG_20130710.0155 32 | -------------------------------------------------------------------------------- /resource/splits/ERE-EN/train.doc.txt: -------------------------------------------------------------------------------- 1 | 459bc8b09f4dd2e1fec7c77d26193b01 2 | 43611a2f256d101f910b852379c70959 3 | 6521f6bd1eb405232a5e852423722bac 4 | 565fa81d640f451b20955887a43b3a23 5 | 5fa0f2a7f323a781640b126978ca8a42 6 | 1d2911e09a6746b942c3e7b3cbdcb0ce 7 | 08b0dfe15192c063055ed7db8d24c625 8 | 644706e2d97c9a9a1f9874510180f136 9 | fd103b2c981e724f64d70a22c392ee93 10 | e98123aa18eb4ce95d2d4eccace51169 11 | 5254f96ac3a601e99b6357c4f7627991 12 | 4743a10c1d5f1ad35c31646049acb9db 13 | 
c793b6b583e008f105af586fe433d4ac 14 | NYT_ENG_20130822.0136 15 | 2ac3b55a10d5395ded9e8e54c345553b 16 | 38cd9b530a5be18dbad52400da435934 17 | 59f8514f6db132207ba9e5828f73d706 18 | NYT_ENG_20130525.0040 19 | aa54ac32868c5de9b05b65a8ee7a4329 20 | NYT_ENG_20130509.0160 21 | 52e569e00b6428b94205d3dd5c457c54 22 | 7c5b86ed55f4e5b8667423ef88f49fb5 23 | 78333509dffd4a7df90b029a5d851dfe 24 | 1ae45904ad12b1540dc390e162b61235 25 | 27eb0b9d14d45ede66fe86534e36a2ce 26 | 2a54459212636289034af844f8634e37 27 | 7e520221ddb1602a0f2aa10560a50a66 28 | 24d93564f48ae17904aa82f937db8c21 29 | 35587c6d8aa67724ba23231dd16f7b44 30 | af79ea77b8fb92424dbc02d88d8c14e8 31 | NYT_ENG_20130422.0048 32 | 361e1c2ca3a1e21c618e0e8fab959e30 33 | 9777919d54ccbb7810bd1c73df91fa4a 34 | 18e8a277f2659f79291efa0e12e80cb3 35 | 5e3fbf49f8301654bb4954c0f1e386a9 36 | 44b011cd504c9ed71beb851324db886a 37 | 5fa7fbe87758a02a1e4591f88175ccf3 38 | 0eb03fc279066b84ed49d44b2405469a 39 | 2f5ee4e363c30678dc3b55caf43bc63d 40 | 57026b7bcb8f855de3e26d572db35285 41 | 3446f8cbcf53eaca5692913ced012b11 42 | 2c2e8b3286bd34e30a4cb57cb7e26ce5 43 | 4df3dfff1ee1683ac6e1c2ea24ce2589 44 | 105249d0d0575a1a5939b16139f6229d 45 | 01f69c4c2206e7c3fa3706ccd5b8b350 46 | 648abb9000309b9807cc8b212c11254f 47 | deb3e0ea36b437c34b52d95aa6a9631f 48 | 1badbb95e5e70ef90e49cdf5a46b6d9b 49 | 7734fb9363c2adf91c6ede6c7bb7df90 50 | 1d6c0e3df079663f6bceca0b44c98a40 51 | 464e03afec9c80f8c1ce4acfe2d002ae 52 | 04debcc4da342dc971bdef4210fe468a 53 | 63dca285201d1fcda72a54f4302b2c3e 54 | 2cf358ab89c732d6b35b65e619d2bc86 55 | 07c9c8ca974b6e9333c38720b0b06896 56 | a68c8d0ef75bbbd2923bf7aa78b72d3e 57 | NYT_ENG_20130813.0006 58 | 3ac3c99241c2243a9e233b091eddfe15 59 | a13d4f9511d799fc25b73e4d5cf28d13 60 | 9e4a09ec419e110a3a12f184e66aea72 61 | 255bae1c133d1d77ef727c063e435a78 62 | fd80f8b1a5694813bbda3253139c6395 63 | 4bab621aef9d14b5d20ac23cb8142112 64 | 96bf72399b104346f3e79022e0c08e5a 65 | 3b34a76a3589417f5db02883b47280a6 66 | 26175bdbe49b712d7412c273c111e813 67 | 
3eb834d9a5d9c9fcad258087b5c2794a 68 | d3b5c32563ebb009bc1b1f5bc1b9eb14 69 | NYT_ENG_20130703.0214 70 | a48d00241e327e54ca914b950e97c7d4 71 | 3c9fb643a48360935c1044efca570514 72 | 130a86739522ab7c56232e798d04cbf9 73 | af18d29036ab0a9f8cf2742a5a1b4804 74 | 652f1fbc927a6c358447947d0d77f95f 75 | 66fba4f92d2f9d8c3bee5dfad3af9828 76 | 37b56b6dd846ad0dd6e8cd00ba2efaf4 77 | 4d7e1af80bc46167ef3d81cf642bf94b 78 | NYT_ENG_20130613.0153 79 | 2ca0238925d38f345acbf826854ea448 80 | 2a10c5cc27e7504dc9df92396b9e28b8 81 | 0c100ebc18cc55f80cdae6343f72db69 82 | dbed9b6ed7d2eaf75fef0aa5a245a663 83 | 5d4273298e649a13c4dce27c89f414ac 84 | cb156ad2a5458fabc9e093b6b5e0f97f 85 | 0929d82f7059353f9593b9558983efba 86 | af36543ebce546c7c678fbf9767bfdbb 87 | 120fe19a9bc68fd85fc4963c166e9345 88 | 774caed283a1e55ef9490864771029c3 89 | 561a0178f4b846b9bbcf39f7e63afe4e 90 | 3f987a93959acff3609a251b5abbecd7 91 | 0f947223d04c10118b523cfeec5d231e 92 | 0fe5904ced20c20537fe29c1db11cd28 93 | 9c500ea2248358171d77d419e67f5760 94 | 043b35fbf220a2d1bbe7d0612ad87635 95 | da156c00417e2020948c009d39341607 96 | NYT_ENG_20130501.0255 97 | NYT_ENG_20131118.0019 98 | 26542fb5b83cdb4b98a3fe31e0226b39 99 | 95af1b55c359f28ff3a9159d55e9528a 100 | 2ac34d012c8d909d4a29aa3f6be1f23d 101 | 1d16a571f14fb1032bc19e9314a46deb 102 | 9f23d711bf5016fec9d05081772b4f24 103 | NYT_ENG_20130914.0094 104 | 5d7b429073c60d53acba21bb6e7e6caa 105 | 11c906f2f798abb05f143b206edf77a5 106 | 334de29f692ef2c5460b78fcad5c6c9e 107 | ffc5cc6892ff203f43b2dc8d83bcd725 108 | abbdf0048737e9e639403f8fe8cd7dd2 109 | 23987125927d321ec6f0c30c8f453cb3 110 | 31ea929baed3887e762b0b7f9196ce7e 111 | ae656f6d658efca126f9721087608e95 112 | ecb7c8154bf58b48ae00b252ff283c29 113 | 67db76e5116c4c809107948d4b0a5ecc 114 | 861cdd1a5c6c41610021b25c3795e293 115 | 63878a2b6d34b576361d2a2778f321a6 116 | ed6c37ed1996fc89f5fe813731c71b9d 117 | NYT_ENG_20131115.0084 118 | 8492134197b5bf8e9179e2fa245ae02f 119 | d6bc66d7c8423368aaa8d789b5bdf5db 120 | 
4f7eedf44076ea050d7db3715f9333fa 121 | f0612c786635ed96ee3df84821a17685 122 | 459b795a150e7866d6e4ef75e1b92b4a 123 | 1473ea2ded50c05b29b4f55f1b83ada3 124 | cf88887857b155d8822f82cad3597744 125 | 1980ed7ea6a283f8dd19da5a4e9952d6 126 | 17f98f0c6cda0227e732e6761f396d1f 127 | 477135a713d07aafe00d5e86648ea408 128 | 33bdb079026f1fcbe47c64b8c6968d0e 129 | 290e2643c2f91c108b206c5edb7a1c0f 130 | 29f64df7feb04dfb16f4667ce199c9f0 131 | NYT_ENG_20130716.0036 132 | ca2a6fbf721ca102c149ad6a90d5b00a 133 | 5b7cab1d1cfc0c05686399d8bcbcfe5b 134 | 4fca88a5c29716cbb7c0f9aa9b84007a 135 | 40f1f697a457e39c30ad94b7cc712c96 136 | 8073c89ca4fdbe3b1eba0352bfe15d78 137 | 4deb48e2b0ab194ce37c1bd31c73586a 138 | bf1047c7c17ae3daab59c3bee423e12f 139 | c397ecd66789b905c6b1c5ef21af03ec 140 | NYT_ENG_20130504.0098 141 | NYT_ENG_20131121.0040 142 | 178e7de35eccad0df800f0c7539cf614 143 | f913574a9c0637dbcf66def4a2c1dc84 144 | 3e9bbf75058a3f16585889bb9c64a903 145 | a83302f9002b6707fc7a91a7d7d29e6e 146 | d7369ce92ed0b6327412c705dbbab654 147 | 70b2f9277a1c78bd13cef68ba6485bd9 148 | 5d0b5755e212a88afbbb8b29c34c4f13 149 | c1f185252a2837aa464e36f263d1ebe9 150 | 6291811a3fe70d3ec8fc26b91060e2f5 151 | aa32f4f9534045b9f33a9599d0c1b580 152 | 2bebb50073ceefd0c9ccfdf3e07b3258 153 | 30cced37fcceb1800341d18d4f97b670 154 | 3b9c27eda65c635e109a547930942486 155 | 3322caacf140c92366a639ee004560ce 156 | d5825f99faec1ae48589b98560a98d61 157 | 408dff173c599256711f23238e280c15 158 | 47de592453663260c44944346d669611 159 | 86a94ca907de6688cca64610730fa11b 160 | 18a89cdd00dadc593a88c924111575f1 161 | 4edd239ce7d1f7274154cd05081f8995 162 | 7c0e0e53980aeb2868cbe4e1c1cb79db 163 | 33c71a5cec78e7d766d75c9a73b327b8 164 | 9b3bc3c727dfaa49218b57254087ff5d 165 | NYT_ENG_20131210.0203 166 | 2d2a4ddb1c8f4a669541704f9fb78472 167 | 5dfd5bfee062cd5896b619a2b1309766 168 | 2701285c791f423cd2f8fd827df9c2c9 169 | 04952b874a2a34d602faaa74712d435e 170 | 2ee2377e5d4ae6f5922ea2af11f9d4e1 171 | 79a3cc37998a99808583eba765aedca1 172 | 
NYT_ENG_20131025.0190 173 | 5bfd613fd31f0c2bdfb5c41f21629144 174 | 61d6f81f680f83a1a3281fde24d9c3ac 175 | 79c976f694784ced2b0c8752eb767901 176 | 0a421343005f3241376fa01e1cb3c6fb 177 | 5753617c893938f625b349cf6bd2b388 178 | NYT_ENG_20130428.0140 179 | 34f729e5ac124e9898b2744a6598d50e 180 | a724033bff06e750d27cd7e3bf8263ac 181 | 1b0f90c029f75d326ea39c0371901ef4 182 | 51d64c51a2363954454ee9e921b590ce 183 | 1656bbad43fee4569b5c5f14110c1342 184 | 5f3a6a4c39c15d7382c2cafe64ae898d 185 | 6667fb9e43ac7edde844453cba97baf0 186 | 52a77871923a7f86bb1a52812bc7f2e1 187 | 44a65adb7f74e6c99d05eb2721fd0baf 188 | NYT_ENG_20131029.0042 189 | 5bb3c2b1094912a6df7e862bb2981481 190 | 47c26ba3563092e41c5a42252931baf1 191 | 41404718f9c1e94cf58aad1fc90c70a7 192 | APW_ENG_20101231.0037 193 | a05c08e340a73270592f62361a19274d 194 | 99ab1cad51361e94c2fe3f997c45705a 195 | NYT_ENG_20131029.0091 196 | 5c7ea2b51202d80ee37eba8a182afad3 197 | NYT_ENG_20131128.0177 198 | 0e0abbf0da91d9e34750441c08d5d262 199 | 15ba31cca04cc5300361f46319247c40 200 | 459f9a2b3eddd436f0232395f129dfd0 201 | NYT_ENG_20130508.0098 202 | 04134f2be20afbb868d7a8292f49e277 203 | 0cde024ee993679967f7ac397000ad52 204 | 593cb5020613a4695859130542f7fc94 205 | NYT_ENG_20131122.0237 206 | 9f6e4c46ae753bf14edff7e2ac767213 207 | cd04993849c889a56ea66c6670f002f4 208 | 4042cd8643253f65df3a4e8de320a1c9 209 | 3f0e2f2fb9b773bc178522a6535a9651 210 | 4798bc0e166fe93893bdf2d922f06258 211 | a9318b72c7a2ff32d459af958c7defe1 212 | 3ae6760a860a33cb90af23596fac475c 213 | 11329f1cdb44019afc8f48b6fdc5376d 214 | 39ff7dcae4034417ba175de97d14b165 215 | 43341a312ffd84a4ad3c3ab0df8bcd7c 216 | 21dbe23f56aaef87fd0980234895b321 217 | 02905b7ce3a6b8b0961c6c2310392ef9 218 | f6ad2150f6c32fcb1488438f6b4275ce 219 | f9af64dc0cf1e7edd4a8feef75018b81 220 | aa003ea934a97bac86cee52b7122f1f8 221 | 766386bc5cb9eb40419a80d082472d50 222 | 4435a7cb258d37b4fafc3ef0e833582e 223 | 736fa00bfb16f3298883be5e962fe01b 224 | NYT_ENG_20130731.0133 225 | 
d409fd37c208c5a7a5b2c64b4130b0ec 226 | 5cd7d603e1cf8d2c134d039dc90112f0 227 | 1e9dfabe5e068a4142e768c0c5c37b6b 228 | 36b12cef6f7a805e3e74a4f430129028 229 | e2e2039f203f36b821d15e2cb6f588e0 230 | NYT_ENG_20130816.0151 231 | 856bc3bee118c826c394ed09548db9b2 232 | 909239794c799f2d2e79c023ae090c35 233 | 087f58983ef5e94e54024bc9f0f009ae 234 | b49eee97fd373efbb4cb41926e60e385 235 | f801d26c9b4d7577df089a196e242a04 236 | 2a46fcf4ff6ce3896f249848e48b3b4c 237 | NYT_ENG_20130619.0092 238 | 1a79f9d5c3f784a494196a9bbb586f3b 239 | fab32c473df923a6a9242054c8d23bf3 240 | 1a0f101744b34677ce1e1da1b1b91beb 241 | 4572d22caf3e1924f894002b724f958b 242 | 30eadb19db9f0db62cba7be66862920d 243 | 59a5d2e146c13f7519130193fc773610 244 | 3878ab866ca434318076c4e7eac49c0d 245 | 2d7d6761aad911a63a235a571fa7862f 246 | 4d996a22855cc2ec9f54990a23d51c56 247 | 324274e50f2d07757e2d88ff58a0c33b 248 | AFP_ENG_20100414.0615 249 | 563b1e8fcb1de7a4c0e01da9100d6e09 250 | 5dd42026c76290af6689691fbe2b8d1c 251 | edc4216d65afa47fe7bc6004ac172e92 252 | 2aaa319d1e1a0600837d013cb84290ea 253 | b9109877820d90dbc5efcdda02e6d450 254 | 0f316bb245762eedec6682acbecf2822 255 | 3dc7812b2b39ed067cc7c8ab1218e128 256 | 648fc5834f73b4196b4ceb3daad954f9 257 | 0fab386f8b6527439481f526c92341c7 258 | NYT_ENG_20131220.0283 259 | 36d45aff571e3fbe036f309c18d31668 260 | 3a0d64b5cb2bc7319e803e344dc695b5 261 | 39280a4d31d81837e17469e18a854116 262 | 670b5425fcd1700e2c27af5f09244cb1 263 | f3e00fa1d34bca154aea0845c628f0e6 264 | ae9a0d394c5e3d3d812c7ffc07c2f836 265 | f18a7b77b1fd1065db9aeaf3f6143a5e 266 | 0536891daea71ab51ee1123137b67146 267 | 6491f0650d9628b84dee6f539df5a53d 268 | 2ba8bbf004fe30c0a01f6fcd25f01dcc 269 | f0aabfc899d1c17b8e99039bb4f80d64 270 | 4ae1669fc17f6b863ff35fa14a960270 271 | 2bbf45266e4ec0ae72977c89ac8d55c1 272 | 0ba982819aaf9f5b94a7cebd48ac6018 273 | 010aaf594ae6ef20eb28e3ee26038375 274 | 0c49bb860962aa0d5b8e3fc277592da0 275 | e972c0257d72aefc52cfdf7e7f5a1623 276 | 82f0af70bf68f4e78e6ea60a339f830d 277 | 
acdf07c9477b21e1d29c51dc692e085b 278 | 186ef6837e001cd9b97a132c86705545 279 | 389c70a4859f7528cc6e8b84c10766d7 280 | 91147deeeec220cc445a8d546585cdb7 281 | 370e7ee173951eeff13998a416b8b3d0 282 | 9fc05e3fab69893da830adfa6513510d 283 | 3b9b81a3a446c24009c7642da54dbd28 284 | 1bf9912633f942d6d1d4e87df33cee40 285 | a42f7cf822523c76c225602537aefc7a 286 | 4fbb1eec7dfd5c2fefb94a2d873ddfa5 287 | ea4d6baa1d6174c45fce1e6bbb58e1b4 288 | 3059538a2542c71687871b3444f8d921 289 | NYT_ENG_20131121.0250 290 | 661ece467567ffbb54b551dfc1c2c254 291 | 204f8f6bdb24c5198175bf1ed483247b 292 | 1f60eb9697e240af089b134b69c2042d 293 | 44087d95184e9d94f3948f47e9b602af 294 | cade0d91e2e82e4db58efe64d7462c33 295 | 10953ba63f691cb49f47f852b359a6e3 296 | 15c96bac6c08ef94fe249fde914b53d7 297 | 5c59566e9132c060423cad5b2d1bac1e 298 | 57b2773ab54bbc5c119a46fd9be2c4f0 299 | 368df106b2eaa0b4091e099f360a07d6 300 | cb824da90723fed309217c6e28b1c7cd 301 | NYT_ENG_20130828.0147 302 | NYT_ENG_20131225.0200 303 | 3f115570c2fcc85263ba97e0134fb039 304 | 44fd27d40ae65547c3b584c2ff360cd7 305 | 07b79a8764693a80861e5a3e5fd47fa5 306 | 6f9d5ec51264868ada3c2c22c70fc57c 307 | NYT_ENG_20130709.0087 308 | 6837dcaff76ad3235d46708dd89e7306 309 | 2251a78817e67a2adaf0722fd05c7ac0 310 | 97655df62dd4a176b65cf8a2c2a6e82d 311 | fa371b1fbb4d20143e638a7dac6e4f6b 312 | c8930568f1175e8bb0bff9b932a5c2d4 313 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf 314 | 5685a6069312d52a897fe69973269338 315 | 4829d3d91263ed9d8801e6d94c3569a5 316 | 1a11228e8230c359e0f357cbd8240b01 317 | 342431e61e80263f606c46bb5e399cc7 318 | f703536e3212f51cbf26ce47aa7b5eff 319 | 086e26ec92d1cc02f3900e9ac46d6962 320 | 502c46cc149d30f9ad0c25194636dcb6 321 | 33ed1c9fdee1000e2340ac7f92c77752 322 | 609d5112c0386dc4e5f2e90b93cb7a5f 323 | 0fbcb8f76124b9654076889ce04a045b 324 | NYT_ENG_20130603.0111 325 | 824610c87232d345dcc130521f20f72a 326 | 57fb3f87bbb8c3205163ea256f658891 327 | 09098ae4e956a51b038876197814735e 328 | 073020eb350fc73f123bfac8ec485ecc 329 | 
48c498c9762046efbece8d183ed996ca 330 | 4b2d9d5984b731dbdd3db398b5fb5e46 331 | 2d8d3572658fdb8754fdc84d2b15f302 332 | 8f575db98ccc3af0a904b650898368dd 333 | e37cfedb8a3a32769a12262eaef9ee0d 334 | 542d2b2755c23b22e9747d8a3b020bf2 335 | 373a3b4bb2a9e67a12c50ad54a1be657 336 | b6b443777e5ca92aa5152f5593960fd9 337 | 3065902101e4282b89ed4ac8f64d4a84 338 | bdca67a0bacec61b5e691d5ca51ba724 339 | 6f13620752b8bd5acf2e1e94c49faef5 340 | XIN_ENG_20101125.0137 341 | NYT_ENG_20130910.0002 342 | 65814a1b2cccd0fd9be5ee3d5068038d 343 | 84828469f40b28161c559e3d01526039 344 | 584b6272bb8c9cc134621ff5ace8c98d 345 | 590baa25bb1cc16c31fd02395edf6835 346 | 39ebaa0bb958e3529b331f4c71025e62 347 | 17f22c2b1e5642b41a9aeedb03261d1a 348 | 081fede2fca345dce82bf6b2355d4ae5 349 | 3e6c7121211de578d7fd831eae801438 350 | 4175e3da216dcc8710a26359e4ecaaad 351 | d4698e3ad06f896058ade2e8f3a09577 352 | d528b874a0a6bd6011279a3239360aa2 353 | 3f78c311ad97d4bbc6b4914deb4ab1ec 354 | 08ebdc5f0ec8588af38ab1684318d99c 355 | NYT_ENG_20130625.0044 356 | 543e319fb067ef8cba81c74bb13c5711 357 | 2c8bcca93da4097da338a8754e4f03b0 358 | 52355a4167e6ac3a80d19c94ad6259a7 359 | 1557734399e8da2b84a2dd9ddb4eba49 360 | a223ebce2f7481c8feecaba0982b4fa7 361 | 4a3d067b19686b281e0beb437573a28c 362 | 1b268b27094ba9c5feb11192dad940ab 363 | 376c304800b734b2a5a2c87b19eddc2a 364 | 018fb4e59ac5474167ffc5940d7e55e7 365 | NYT_ENG_20131003.0269 366 | 3138f7fb2f8575ed762eb0bc11023d59 367 | AFP_ENG_20100601.0724 368 | 37d781089c669131c5118415cf470422 369 | 3a9a0c07af53fce42e1a55c21826c54d 370 | 0f565d3822dca80336582ffac4adaf78 371 | cfd86b06365dab636d13523c7ed93ad6 372 | c06e8bbdf69f73a69cd3d5dbb4d06a21 373 | d0b9b1747f4a6247294cde9ac0165c60 374 | 416cfc6a5717682cd35d381c5be07734 375 | 22696c601df1a7359e9b629c689700ad 376 | NYT_ENG_20130506.0130 377 | 4764f1400fa336d1fb972719b10b939a 378 | cca700aed62fd497e64e507752409b41 379 | 56c895a1c8dead5698a49321a674f3f4 380 | 17af00d74fca31bceab4ad463bf1c384 381 | 026e0a2c96e90bd8bf9aecde62d7530d 382 
| 4eb58398a5c2ef35b16d885c5573b3d4 383 | 5c29f9e575b94c61db8ed52bdfa53843 384 | 3d8f19221d257f81e3376b9e0731d4db 385 | 12bbeaf10a36d36d82824a72352ac178 386 | 362f9d9707c4da0c8068bc7034aae4b4 387 | 026bd1c7eae9f14da9480a4b88ba2fb6 388 | 4683e6affe801713ed4cc9d596b57fac 389 | 2b96d1172d37f60aea5ce64a0b410248 390 | b608865c83b6612bf9ccb4e4c6e66ee7 391 | 584ccaef38f5936e973f0561966bbf06 392 | 0cfdfe102b7a4cb34e1a181c1d36d23d 393 | 83d7cb6d5b663f34dcf83879a8729fb4 394 | 30fa916e5173b52d449300e2ea71b787 395 | 25f868780ac18430a6f10ab4de22ffb8 396 | 4c2488e10c34e5412d3b67e794c9bc84 397 |
--------------------------------------------------------------------------------
/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | # Evaluate a trained checkpoint with keyee/eval.py.
2 | #
3 | # Usage: bash scripts/eval.sh [DATASET] [OUTPUT_DIR]
4 | #   DATASET     ace05e (default), ace05ep or ere; selects
5 | #               config/config_keyee_<DATASET>.json
6 | #   OUTPUT_DIR  directory containing best_model.mdl; it is also where
7 | #               the --write_file target eval_result.json is placed
8 | # The default OUTPUT_DIR belongs to the ace05e run, so pass both
9 | # arguments together when evaluating another dataset.
10 | export OMP_NUM_THREADS=4
11 |
12 | DATASET="${1:-ace05e}"
13 | OUTPUT_DIR="${2:-output/ace05e_high_resources/multi-task-Keywords-full}"
14 |
15 | python keyee/eval.py \
16 |     -c "config/config_keyee_${DATASET}.json" \
17 |     -e "${OUTPUT_DIR}/best_model.mdl" \
18 |     --eval_batch_size 16 \
19 |     --write_file "${OUTPUT_DIR}/eval_result.json" \
20 |     --no_dev
--------------------------------------------------------------------------------
/scripts/process_ace05e.sh:
--------------------------------------------------------------------------------
1 | # Run preprocessing/process_ace05e.py on the train/dev/test files found in
2 | # $DYGIEFORMAT_PATH, then cut one training subset per document list in
3 | # resource/low_resource_split/ace05e.
4 | # All paths are relative to the current directory (presumably the
5 | # repository root -- confirm against the README).
6 | export DYGIEFORMAT_PATH="./processed_data/ace05e_dygieppformat"
7 | export OUTPUT_PATH="./processed_data/ace05e_bart"
8 |
9 | # -p: create missing parent directories and do not fail on a re-run when
10 | # the directory already exists.
11 | mkdir -p "$OUTPUT_PATH"
12 |
13 | for PART in train dev test; do
14 |     python preprocessing/process_ace05e.py \
15 |         -i "$DYGIEFORMAT_PATH/$PART.json" \
16 |         -o "$OUTPUT_PATH/$PART.w1.oneie.json" \
17 |         -b facebook/bart-large -w 1
18 | done
19 |
20 | export BASE_PATH="./processed_data/"
21 | export SPLIT_PATH="./resource/low_resource_split/ace05e"
22 |
23 | # The suffixes match the doc_list_* files shipped in $SPLIT_PATH.
24 | for SUBSET in 001 002 003 005 010 020 030 050 075; do
25 |     python preprocessing/split_dataset.py \
26 |         -i "$BASE_PATH/ace05e_bart/train.w1.oneie.json" \
27 |         -s "$SPLIT_PATH/doc_list_$SUBSET" \
28 |         -o "$BASE_PATH/ace05e_bart/train.$SUBSET.w1.oneie.json"
29 | done
30 |
--------------------------------------------------------------------------------
/scripts/process_ace05ep.sh:
--------------------------------------------------------------------------------
1 | # Run preprocessing/process_ace05ep.py on the raw ACE 2005 data in
2 | # $ACE_PATH using the document splits in resource/splits/ACE05-EP, then
3 | # cut one training subset per document list in
4 | # resource/low_resource_split/ace05ep.
5 | # All paths are relative to the current directory (presumably the
6 | # repository root -- confirm against the README).
7 | export ACE_PATH="../../datasets/ace_2005_td_v7/data/"
8 | export OUTPUT_PATH="./processed_data/ace05ep_bart"
9 |
10 | # -p: ./processed_data may not exist yet, and a re-run must not fail
11 | # because the directory is already there.
12 | mkdir -p "$OUTPUT_PATH"
13 |
14 | python preprocessing/process_ace05ep.py \
15 |     -i "$ACE_PATH" \
16 |     -o "$OUTPUT_PATH" \
17 |     -s resource/splits/ACE05-EP \
18 |     -b facebook/bart-large -w 1 -l english
19 |
20 | export BASE_PATH="./processed_data/"
21 | export SPLIT_PATH="./resource/low_resource_split/ace05ep"
22 |
23 | # The suffixes match the doc_list_* files shipped in $SPLIT_PATH.
24 | for SUBSET in 001 002 003 005 010 020 030 050 075; do
25 |     python preprocessing/split_dataset.py \
26 |         -i "$BASE_PATH/ace05ep_bart/train.w1.oneie.json" \
27 |         -s "$SPLIT_PATH/doc_list_$SUBSET" \
28 |         -o "$BASE_PATH/ace05ep_bart/train.$SUBSET.w1.oneie.json"
29 | done
30 |
--------------------------------------------------------------------------------
/scripts/process_ere.sh:
--------------------------------------------------------------------------------
1 | # Run preprocessing/process_ere.py on the raw ERE data in $ERE_PATH using
2 | # the document splits in resource/splits/ERE-EN, then cut one training
3 | # subset per document list in resource/low_resource_split/ere.
4 | # All paths are relative to the current directory (presumably the
5 | # repository root -- confirm against the README).
6 | export ERE_PATH="../../datasets/ERE/"
7 | export OUTPUT_PATH="./processed_data/ere_bart"
8 |
9 | # -p: ./processed_data may not exist yet, and a re-run must not fail
10 | # because the directory is already there.
11 | mkdir -p "$OUTPUT_PATH"
12 |
13 | python preprocessing/process_ere.py \
14 |     -i "$ERE_PATH" \
15 |     -o "$OUTPUT_PATH" \
16 |     -s resource/splits/ERE-EN \
17 |     -b facebook/bart-large -w 1
18 |
19 | export BASE_PATH="./processed_data/"
20 | export SPLIT_PATH="./resource/low_resource_split/ere"
21 |
22 | # The suffixes match the doc_list_* files shipped in $SPLIT_PATH.
23 | for SUBSET in 001 002 003 005 010 020 030 050 075; do
24 |     python preprocessing/split_dataset.py \
25 |         -i "$BASE_PATH/ere_bart/train.w1.oneie.json" \
26 |         -s "$SPLIT_PATH/doc_list_$SUBSET" \
27 |         -o "$BASE_PATH/ere_bart/train.$SUBSET.w1.oneie.json"
28 | done
29 |
--------------------------------------------------------------------------------
/scripts/train.sh:
--------------------------------------------------------------------------------
1 | # Run keyee/generate_data.py and then keyee/train.py with the same config.
2 | #
3 | # Usage: bash scripts/train.sh [DATASET]
4 | #   DATASET  ace05e (default), ace05ep or ere; selects
5 | #            config/config_keyee_<DATASET>.json
6 | export OMP_NUM_THREADS=4
7 |
8 | DATASET="${1:-ace05e}"
9 | CONFIG="config/config_keyee_${DATASET}.json"
10 |
11 | # Start training only when data generation succeeded, so a failed
12 | # generation step is not hidden behind a training run.
13 | python keyee/generate_data.py -c "$CONFIG" &&
14 |     python keyee/train.py -c "$CONFIG"
--------------------------------------------------------------------------------