├── .gitignore ├── LICENSE ├── README.md ├── data_list.csv ├── main.py ├── output └── sample.json └── parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | venv/ 3 | .idea/ 4 | 5 | data/ace_2005_td_v7 6 | 7 | output/dev.json 8 | output/train.json 9 | output/dev.json 10 | output/debug.json 11 | 12 | test.json 13 | 14 | stanford-corenlp-full-2018-10-05.zip 15 | stanford-corenlp-full-2018-10-05/ 16 | 17 | test/ 18 | analysis/ 19 | logdir/ 20 | baseline/ 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 swyoon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ACE2005 preprocessing 2 | 3 | This is simple code for preprocessing the ACE 2005 corpus for the Event Extraction task. 4 | 5 | Using the existing methods was complicated for me, so I made this project. 6 | 7 | ## Prerequisites 8 | 9 | 1. Prepare the **ACE 2005 dataset**. 10 | 11 | (Download: https://catalog.ldc.upenn.edu/LDC2006T06. Note that the ACE 2005 dataset is not free.) 12 | 13 | 2. Install the packages. 14 | ``` 15 | pip install stanfordcorenlp beautifulsoup4 nltk tqdm 16 | ``` 17 | 18 | 3. Download the Stanford CoreNLP model. 19 | ```bash 20 | wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 21 | unzip stanford-corenlp-full-2018-10-05.zip 22 | ``` 23 | 24 | ## Usage 25 | 26 | Run: 27 | 28 | ```bash 29 | sudo python main.py --data=./data/ace_2005_td_v7/data/English --nlp=./stanford-corenlp-full-2018-10-05 30 | ``` 31 | 32 | - Then you can find the parsed data in the `output` directory. 33 | 34 | - If it is not executed with `sudo`, an error can occur when using `stanford-corenlp`. 35 | 36 | - It takes about 30 minutes to complete the preprocessing. 37 | 38 | ## Output 39 | 40 | ### Format 41 | 42 | I follow the JSON format described in the 43 | [EMNLP2018-JMEE](https://github.com/lx865712528/EMNLP2018-JMEE) 44 | repository, as in the sample below. In addition, I add an entity head for 45 | each entity, because many NLP tasks exploit the head of an entity rather 46 | than its full mention.
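For illustration only (not part of this repository): the minimal sketch below shows how the added head field can be consumed. It loads the bundled `output/sample.json` (the same format shown in the sample further down) and prints each entity's full mention span next to its head span; both are token-index ranges into the sentence's `words` list.

```python
import json

# Minimal sketch: compare each entity's full mention with its head.
# "start"/"end" and "head.start"/"head.end" are token indices into "words".
with open('output/sample.json') as f:
    data = json.load(f)

for sent in data:
    words = sent['words']
    for entity in sent['golden-entity-mentions']:
        mention = ' '.join(words[entity['start']:entity['end']])
        head = ' '.join(words[entity['head']['start']:entity['head']['end']])
        print(entity['entity-type'], '| mention:', mention, '| head:', head)
```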
47 | 48 | If you want to know event types and arguments in detail, read [this document (ACE 2005 event guidelines)](https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/english-events-guidelines-v5.4.3.pdf). 49 | 50 | 51 | **`sample.json`** 52 | ```json 53 | [ 54 | { 55 | "sentence": "Earlier documents in the case have included embarrassing details about perks Welch received as part of his retirement package from GE at a time when corporate scandals were sparking outrage.", 56 | "golden-entity-mentions": [ 57 | { 58 | "text": "Welch", 59 | "entity-type": "PER:Individual", 60 | "head": { 61 | "text": "Welch", 62 | "start": 11, 63 | "end": 12 64 | }, 65 | "entity_id": "APW_ENG_20030325.0786-E24-38", 66 | "start": 11, 67 | "end": 12 68 | }, 69 | { 70 | "text": "his", 71 | "entity-type": "PER:Individual", 72 | "head": { 73 | "text": "his", 74 | "start": 16, 75 | "end": 17 76 | }, 77 | "entity_id": "APW_ENG_20030325.0786-E24-39", 78 | "start": 16, 79 | "end": 17 80 | }, 81 | { 82 | "text": "GE", 83 | "entity-type": "ORG:Commercial", 84 | "head": { 85 | "text": "GE", 86 | "start": 20, 87 | "end": 21 88 | }, 89 | "entity_id": "APW_ENG_20030325.0786-E26-40", 90 | "start": 20, 91 | "end": 21 92 | } 93 | ], 94 | "golden-event-mentions": [ 95 | { 96 | "trigger": { 97 | "text": "retirement", 98 | "start": 17, 99 | "end": 18 100 | }, 101 | "arguments": [ 102 | { 103 | "role": "Person", 104 | "entity-type": "PER:Individual", 105 | "text": "Welch", 106 | "start": 11, 107 | "end": 12 108 | }, 109 | { 110 | "role": "Entity", 111 | "entity-type": "ORG:Commercial", 112 | "text": "GE", 113 | "start": 20, 114 | "end": 21 115 | } 116 | ], 117 | "event_type": "Personnel:End-Position" 118 | } 119 | ], 120 | "stanford-colcc": [ 121 | "ROOT/dep=6/gov=-1", 122 | "amod/dep=0/gov=1", 123 | "nsubj/dep=1/gov=6", 124 | "case/dep=2/gov=4", 125 | "det/dep=3/gov=4", 126 | "nmod:in/dep=4/gov=1", 127 | "aux/dep=5/gov=6", 128 | "amod/dep=7/gov=8", 129 | "dobj/dep=8/gov=6", 130 | "case/dep=9/gov=10", 131 | "nmod:about/dep=10/gov=6", 132 | "nsubj/dep=11/gov=12", 133 | "acl:relcl/dep=12/gov=10", 134 | "case/dep=13/gov=14", 135 | "nmod:as/dep=14/gov=12", 136 | "case/dep=15/gov=18", 137 | "nmod:poss/dep=16/gov=18", 138 | "compound/dep=17/gov=18", 139 | "nmod:of/dep=18/gov=14", 140 | "case/dep=19/gov=20", 141 | "nmod:from/dep=20/gov=12", 142 | "case/dep=21/gov=23", 143 | "det/dep=22/gov=23", 144 | "nmod:at/dep=23/gov=12", 145 | "advmod/dep=24/gov=28", 146 | "amod/dep=25/gov=26", 147 | "nsubj/dep=26/gov=28", 148 | "aux/dep=27/gov=28", 149 | "acl:relcl/dep=28/gov=23", 150 | "dobj/dep=29/gov=28", 151 | "punct/dep=30/gov=6" 152 | ], 153 | "words": [ 154 | "Earlier", 155 | "documents", 156 | "in", 157 | "the", 158 | "case", 159 | "have", 160 | "included", 161 | "embarrassing", 162 | "details", 163 | "about", 164 | "perks", 165 | "Welch", 166 | "received", 167 | "as", 168 | "part", 169 | "of", 170 | "his", 171 | "retirement", 172 | "package", 173 | "from", 174 | "GE", 175 | "at", 176 | "a", 177 | "time", 178 | "when", 179 | "corporate", 180 | "scandals", 181 | "were", 182 | "sparking", 183 | "outrage", 184 | "." 185 | ], 186 | "pos-tags": [ 187 | "JJR", 188 | "NNS", 189 | "IN", 190 | "DT", 191 | "NN", 192 | "VBP", 193 | "VBN", 194 | "JJ", 195 | "NNS", 196 | "IN", 197 | "NNS", 198 | "NNP", 199 | "VBD", 200 | "IN", 201 | "NN", 202 | "IN", 203 | "PRP$", 204 | "NN", 205 | "NN", 206 | "IN", 207 | "NNP", 208 | "IN", 209 | "DT", 210 | "NN", 211 | "WRB", 212 | "JJ", 213 | "NNS", 214 | "VBD", 215 | "VBG", 216 | "NN", 217 | "." 
218 | ], 219 | "lemma": [ 220 | "earlier", 221 | "document", 222 | "in", 223 | "the", 224 | "case", 225 | "have", 226 | "include", 227 | "embarrassing", 228 | "detail", 229 | "about", 230 | "perk", 231 | "Welch", 232 | "receive", 233 | "as", 234 | "part", 235 | "of", 236 | "he", 237 | "retirement", 238 | "package", 239 | "from", 240 | "GE", 241 | "at", 242 | "a", 243 | "time", 244 | "when", 245 | "corporate", 246 | "scandal", 247 | "be", 248 | "spark", 249 | "outrage", 250 | "." 251 | ], 252 | "parse": "(ROOT\n (S\n (NP\n (NP (JJR Earlier) (NNS documents))\n (PP (IN in)\n (NP (DT the) (NN case))))\n (VP (VBP have)\n (VP (VBN included)\n (NP (JJ embarrassing) (NNS details))\n (PP (IN about)\n (NP\n (NP (NNS perks))\n (SBAR\n (S\n (NP (NNP Welch))\n (VP (VBD received)\n (PP (IN as)\n (NP\n (NP (NN part))\n (PP (IN of)\n (NP (PRP$ his) (NN retirement) (NN package)))))\n (PP (IN from)\n (NP (NNP GE)))\n (PP (IN at)\n (NP\n (NP (DT a) (NN time))\n (SBAR\n (WHADVP (WRB when))\n (S\n (NP (JJ corporate) (NNS scandals))\n (VP (VBD were)\n (VP (VBG sparking)\n (NP (NN outrage)))))))))))))))\n (. .)))" 253 | } 254 | ] 255 | ``` 256 | 257 | 258 | ### Data Split 259 | 260 | The parsed data is divided into test/dev/train splits as follows. 261 | ``` 262 | ├── output 263 | │ └── test.json 264 | │ └── dev.json 265 | │ └── train.json 266 | │... 267 | ``` 268 | 269 | This project uses the same data partitioning as previous work ([Yang and Mitchell, 2016](https://www.cs.cmu.edu/~bishan/papers/joint_event_naacl16.pdf); [Nguyen et al., 2016](https://www.aclweb.org/anthology/N16-1034)). The split is specified in `data_list.csv`. 270 | 271 | The table below shows the amount of data parsed by this project. The counts differ slightly from the parsing results reported in the two papers above; the difference seems to arise because there is no agreed-upon rule for splitting sentences within the .sgm files.
272 | 273 | | | Documents | Sentences |Triggers | Arguments | Entity Mentions | 274 | |------- |--------------|--------------|------------|-----------|----------------- | 275 | | Test | 40 | 713 | 422 | 892 | 4226 | 276 | | Dev | 30 | 875 | 492 | 933 | 4050 | 277 | | Train | 529 | 14724 | 4312 | 7811 | 53045 | 278 | -------------------------------------------------------------------------------- /data_list.csv: -------------------------------------------------------------------------------- 1 | type,path test,nw/timex2norm/AFP_ENG_20030401.0476 test,nw/timex2norm/AFP_ENG_20030413.0098 test,nw/timex2norm/AFP_ENG_20030415.0734 test,nw/timex2norm/AFP_ENG_20030417.0004 test,nw/timex2norm/AFP_ENG_20030417.0307 test,nw/timex2norm/AFP_ENG_20030417.0764 test,nw/timex2norm/AFP_ENG_20030418.0556 test,nw/timex2norm/AFP_ENG_20030425.0408 test,nw/timex2norm/AFP_ENG_20030427.0118 test,nw/timex2norm/AFP_ENG_20030428.0720 test,nw/timex2norm/AFP_ENG_20030429.0007 test,nw/timex2norm/AFP_ENG_20030430.0075 test,nw/timex2norm/AFP_ENG_20030502.0614 test,nw/timex2norm/AFP_ENG_20030504.0248 test,nw/timex2norm/AFP_ENG_20030508.0118 test,nw/timex2norm/AFP_ENG_20030508.0357 test,nw/timex2norm/AFP_ENG_20030509.0345 test,nw/timex2norm/AFP_ENG_20030514.0706 test,nw/timex2norm/AFP_ENG_20030519.0049 test,nw/timex2norm/AFP_ENG_20030519.0372 test,nw/timex2norm/AFP_ENG_20030522.0878 test,nw/timex2norm/AFP_ENG_20030527.0616 test,nw/timex2norm/AFP_ENG_20030528.0561 test,nw/timex2norm/AFP_ENG_20030530.0132 test,nw/timex2norm/AFP_ENG_20030601.0262 test,nw/timex2norm/AFP_ENG_20030607.0030 test,nw/timex2norm/AFP_ENG_20030616.0715 test,nw/timex2norm/AFP_ENG_20030617.0846 test,nw/timex2norm/AFP_ENG_20030625.0057 test,nw/timex2norm/AFP_ENG_20030630.0271 test,nw/timex2norm/APW_ENG_20030304.0555 test,nw/timex2norm/APW_ENG_20030306.0191 test,nw/timex2norm/APW_ENG_20030308.0314 test,nw/timex2norm/APW_ENG_20030310.0719 test,nw/timex2norm/APW_ENG_20030311.0775 test,nw/timex2norm/APW_ENG_20030318.0689 test,nw/timex2norm/APW_ENG_20030319.0545 test,nw/timex2norm/APW_ENG_20030322.0119 test,nw/timex2norm/APW_ENG_20030324.0768 test,nw/timex2norm/APW_ENG_20030325.0786 dev,bc/timex2norm/CNN_CF_20030303.1900.02 dev,bc/timex2norm/CNN_IP_20030329.1600.00-2 dev,bc/timex2norm/CNN_IP_20030402.1600.00-1 dev,bc/timex2norm/CNN_IP_20030405.1600.01-1 dev,bc/timex2norm/CNN_IP_20030409.1600.02 dev,un/timex2norm/marcellapr_20050228.2219 dev,un/timex2norm/rec.games.chess.politics_20041216.1047 dev,un/timex2norm/rec.games.chess.politics_20041217.2111 dev,un/timex2norm/soc.org.nonprofit_20050218.1902 dev,wl/timex2norm/FLOPPINGACES_20050217.1237.014 dev,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041116.1347 dev,wl/timex2norm/FLOPPINGACES_20041117.2002.024 dev,wl/timex2norm/FLOPPINGACES_20050203.1953.038 dev,wl/timex2norm/TTRACY_20050223.1049 dev,bn/timex2norm/CNNHL_ENG_20030304_142751.10 dev,bn/timex2norm/CNNHL_ENG_20030424_123502.25 dev,bn/timex2norm/CNNHL_ENG_20030513_220910.32 dev,bn/timex2norm/CNN_ENG_20030304_173120.16 dev,bn/timex2norm/CNN_ENG_20030328_150609.10 dev,bn/timex2norm/CNN_ENG_20030424_070008.15 dev,bn/timex2norm/CNN_ENG_20030512_170454.13 dev,bn/timex2norm/CNN_ENG_20030620_085840.7 dev,nw/timex2norm/AFP_ENG_20030304.0250 dev,nw/timex2norm/AFP_ENG_20030305.0918 dev,nw/timex2norm/AFP_ENG_20030311.0491 dev,nw/timex2norm/AFP_ENG_20030314.0238 dev,nw/timex2norm/AFP_ENG_20030319.0879 dev,nw/timex2norm/AFP_ENG_20030320.0722 dev,nw/timex2norm/AFP_ENG_20030327.0022 dev,nw/timex2norm/AFP_ENG_20030327.0224 train,bc/timex2norm/CNN_CF_20030303.1900.00 
train,bc/timex2norm/CNN_CF_20030303.1900.05 train,bc/timex2norm/CNN_CF_20030303.1900.06-1 train,bc/timex2norm/CNN_CF_20030303.1900.06-2 train,bc/timex2norm/CNN_CF_20030304.1900.02 train,bc/timex2norm/CNN_CF_20030304.1900.04 train,bc/timex2norm/CNN_CF_20030304.1900.06-2 train,bc/timex2norm/CNN_CF_20030305.1900.00-1 train,bc/timex2norm/CNN_CF_20030305.1900.00-2 train,bc/timex2norm/CNN_CF_20030305.1900.00-3 train,bc/timex2norm/CNN_CF_20030305.1900.02 train,bc/timex2norm/CNN_CF_20030305.1900.06-1 train,bc/timex2norm/CNN_CF_20030305.1900.06-2 train,bc/timex2norm/CNN_IP_20030328.1600.07 train,bc/timex2norm/CNN_IP_20030329.1600.00-3 train,bc/timex2norm/CNN_IP_20030329.1600.00-4 train,bc/timex2norm/CNN_IP_20030329.1600.00-5 train,bc/timex2norm/CNN_IP_20030329.1600.00-6 train,bc/timex2norm/CNN_IP_20030329.1600.01-1 train,bc/timex2norm/CNN_IP_20030329.1600.01-3 train,bc/timex2norm/CNN_IP_20030329.1600.02 train,bc/timex2norm/CNN_IP_20030330.1600.05-2 train,bc/timex2norm/CNN_IP_20030330.1600.06 train,bc/timex2norm/CNN_IP_20030402.1600.00-2 train,bc/timex2norm/CNN_IP_20030402.1600.00-3 train,bc/timex2norm/CNN_IP_20030402.1600.00-4 train,bc/timex2norm/CNN_IP_20030402.1600.02-1 train,bc/timex2norm/CNN_IP_20030402.1600.02-2 train,bc/timex2norm/CNN_IP_20030403.1600.00-1 train,bc/timex2norm/CNN_IP_20030403.1600.00-2 train,bc/timex2norm/CNN_IP_20030403.1600.00-3 train,bc/timex2norm/CNN_IP_20030403.1600.00-4 train,bc/timex2norm/CNN_IP_20030404.1600.00-1 train,bc/timex2norm/CNN_IP_20030404.1600.00-2 train,bc/timex2norm/CNN_IP_20030405.1600.00-2 train,bc/timex2norm/CNN_IP_20030405.1600.00-3 train,bc/timex2norm/CNN_IP_20030405.1600.01-2 train,bc/timex2norm/CNN_IP_20030405.1600.01-3 train,bc/timex2norm/CNN_IP_20030405.1600.02 train,bc/timex2norm/CNN_IP_20030406.1600.03 train,bc/timex2norm/CNN_IP_20030407.1600.05 train,bc/timex2norm/CNN_IP_20030408.1600.03 train,bc/timex2norm/CNN_IP_20030408.1600.04 train,bc/timex2norm/CNN_IP_20030409.1600.04 train,bc/timex2norm/CNN_IP_20030410.1600.03-1 train,bc/timex2norm/CNN_IP_20030410.1600.03-2 train,bc/timex2norm/CNN_IP_20030412.1600.03 train,bc/timex2norm/CNN_IP_20030412.1600.05 train,bc/timex2norm/CNN_IP_20030414.1600.04 train,bc/timex2norm/CNN_IP_20030417.1600.06 train,bc/timex2norm/CNN_IP_20030422.1600.05 train,bc/timex2norm/CNN_LE_20030504.1200.01 train,bc/timex2norm/CNN_LE_20030504.1200.02-1 train,bc/timex2norm/CNN_LE_20030504.1200.02-2 train,bn/timex2norm/CNNHL_ENG_20030312_150218.13 train,bn/timex2norm/CNNHL_ENG_20030331_193419.9 train,bn/timex2norm/CNNHL_ENG_20030402_133449.22 train,bn/timex2norm/CNNHL_ENG_20030402_193443.5 train,bn/timex2norm/CNNHL_ENG_20030403_133453.21 train,bn/timex2norm/CNNHL_ENG_20030403_193455.30 train,bn/timex2norm/CNNHL_ENG_20030407_193547.5 train,bn/timex2norm/CNNHL_ENG_20030411_230640.38 train,bn/timex2norm/CNNHL_ENG_20030415_193729.5 train,bn/timex2norm/CNNHL_ENG_20030416_133739.13 train,bn/timex2norm/CNNHL_ENG_20030416_133739.9 train,bn/timex2norm/CNNHL_ENG_20030416_193742.26 train,bn/timex2norm/CNNHL_ENG_20030416_193742.7 train,bn/timex2norm/CNNHL_ENG_20030416_230741.33 train,bn/timex2norm/CNNHL_ENG_20030425_183518.12 train,bn/timex2norm/CNNHL_ENG_20030428_123600.14 train,bn/timex2norm/CNNHL_ENG_20030429_220618.15 train,bn/timex2norm/CNNHL_ENG_20030430_220712.37 train,bn/timex2norm/CNNHL_ENG_20030505_220734.25 train,bn/timex2norm/CNNHL_ENG_20030513_183907.5 train,bn/timex2norm/CNNHL_ENG_20030513_220910.11 train,bn/timex2norm/CNNHL_ENG_20030519_124020.23 train,bn/timex2norm/CNNHL_ENG_20030523_221118.14 
train,bn/timex2norm/CNNHL_ENG_20030526_221156.39 train,bn/timex2norm/CNNHL_ENG_20030603_230307.3 train,bn/timex2norm/CNNHL_ENG_20030604_230238.5 train,bn/timex2norm/CNNHL_ENG_20030609_133335.37 train,bn/timex2norm/CNNHL_ENG_20030610_133347.6 train,bn/timex2norm/CNNHL_ENG_20030610_230438.14 train,bn/timex2norm/CNNHL_ENG_20030611_133445.24 train,bn/timex2norm/CNNHL_ENG_20030616_230155.28 train,bn/timex2norm/CNNHL_ENG_20030616_230155.7 train,bn/timex2norm/CNNHL_ENG_20030618_230303.36 train,bn/timex2norm/CNNHL_ENG_20030618_230303.6 train,bn/timex2norm/CNNHL_ENG_20030624_133331.33 train,bn/timex2norm/CNNHL_ENG_20030624_230338.34 train,bn/timex2norm/CNNHL_ENG_20030625_193346.7 train,bn/timex2norm/CNNHL_ENG_20030625_230351.4 train,bn/timex2norm/CNN_ENG_20030305_170125.1 train,bn/timex2norm/CNN_ENG_20030306_070606.18 train,bn/timex2norm/CNN_ENG_20030306_083604.6 train,bn/timex2norm/CNN_ENG_20030312_083725.3 train,bn/timex2norm/CNN_ENG_20030312_223733.14 train,bn/timex2norm/CNN_ENG_20030313_083739.0 train,bn/timex2norm/CNN_ENG_20030318_140851.8 train,bn/timex2norm/CNN_ENG_20030320_153434.7 train,bn/timex2norm/CNN_ENG_20030325_150531.10 train,bn/timex2norm/CNN_ENG_20030325_220534.6 train,bn/timex2norm/CNN_ENG_20030327_163556.20 train,bn/timex2norm/CNN_ENG_20030329_170349.7 train,bn/timex2norm/CNN_ENG_20030331_123648.4 train,bn/timex2norm/CNN_ENG_20030331_193655.14 train,bn/timex2norm/CNN_ENG_20030401_073033.14 train,bn/timex2norm/CNN_ENG_20030401_233449.5 train,bn/timex2norm/CNN_ENG_20030402_190500.11 train,bn/timex2norm/CNN_ENG_20030403_060032.0 train,bn/timex2norm/CNN_ENG_20030403_080032.9 train,bn/timex2norm/CNN_ENG_20030403_090032.1 train,bn/timex2norm/CNN_ENG_20030403_180511.16 train,bn/timex2norm/CNN_ENG_20030403_183513.1 train,bn/timex2norm/CNN_ENG_20030404_073033.4 train,bn/timex2norm/CNN_ENG_20030404_163526.10 train,bn/timex2norm/CNN_ENG_20030407_080037.12 train,bn/timex2norm/CNN_ENG_20030407_130604.10 train,bn/timex2norm/CNN_ENG_20030407_170605.7 train,bn/timex2norm/CNN_ENG_20030408_083034.11 train,bn/timex2norm/CNN_ENG_20030408_123613.0 train,bn/timex2norm/CNN_ENG_20030408_153616.9 train,bn/timex2norm/CNN_ENG_20030408_200618.14 train,bn/timex2norm/CNN_ENG_20030409_180633.8 train,bn/timex2norm/CNN_ENG_20030410_183644.8 train,bn/timex2norm/CNN_ENG_20030411_193701.3 train,bn/timex2norm/CNN_ENG_20030411_233701.11 train,bn/timex2norm/CNN_ENG_20030414_130735.7 train,bn/timex2norm/CNN_ENG_20030415_103039.0 train,bn/timex2norm/CNN_ENG_20030415_173752.0 train,bn/timex2norm/CNN_ENG_20030415_180754.5 train,bn/timex2norm/CNN_ENG_20030415_183752.14 train,bn/timex2norm/CNN_ENG_20030416_100042.7 train,bn/timex2norm/CNN_ENG_20030416_160804.4 train,bn/timex2norm/CNN_ENG_20030416_180808.15 train,bn/timex2norm/CNN_ENG_20030416_190806.4 train,bn/timex2norm/CNN_ENG_20030417_063039.0 train,bn/timex2norm/CNN_ENG_20030417_073039.2 train,bn/timex2norm/CNN_ENG_20030418_063040.1 train,bn/timex2norm/CNN_ENG_20030418_083040.11 train,bn/timex2norm/CNN_ENG_20030418_130831.5 train,bn/timex2norm/CNN_ENG_20030418_163834.14 train,bn/timex2norm/CNN_ENG_20030421_090007.11 train,bn/timex2norm/CNN_ENG_20030421_120508.13 train,bn/timex2norm/CNN_ENG_20030421_120508.17 train,bn/timex2norm/CNN_ENG_20030421_133510.6 train,bn/timex2norm/CNN_ENG_20030422_083005.10 train,bn/timex2norm/CNN_ENG_20030422_213527.4 train,bn/timex2norm/CNN_ENG_20030423_180539.2 train,bn/timex2norm/CNN_ENG_20030424_073006.4 train,bn/timex2norm/CNN_ENG_20030424_113549.11 train,bn/timex2norm/CNN_ENG_20030424_173553.8 
train,bn/timex2norm/CNN_ENG_20030424_183556.7 train,bn/timex2norm/CNN_ENG_20030425_063006.5 train,bn/timex2norm/CNN_ENG_20030425_133605.6 train,bn/timex2norm/CNN_ENG_20030426_160621.0 train,bn/timex2norm/CNN_ENG_20030428_130651.4 train,bn/timex2norm/CNN_ENG_20030428_173654.13 train,bn/timex2norm/CNN_ENG_20030428_193655.2 train,bn/timex2norm/CNN_ENG_20030429_083016.5 train,bn/timex2norm/CNN_ENG_20030429_110706.7 train,bn/timex2norm/CNN_ENG_20030429_143706.14 train,bn/timex2norm/CNN_ENG_20030429_170710.4 train,bn/timex2norm/CNN_ENG_20030429_190711.14 train,bn/timex2norm/CNN_ENG_20030430_063016.14 train,bn/timex2norm/CNN_ENG_20030430_093016.0 train,bn/timex2norm/CNN_ENG_20030430_160723.6 train,bn/timex2norm/CNN_ENG_20030501_063017.15 train,bn/timex2norm/CNN_ENG_20030501_160459.0 train,bn/timex2norm/CNN_ENG_20030502_080020.7 train,bn/timex2norm/CNN_ENG_20030502_093018.6 train,bn/timex2norm/CNN_ENG_20030505_090022.1 train,bn/timex2norm/CNN_ENG_20030506_053020.14 train,bn/timex2norm/CNN_ENG_20030506_160524.18 train,bn/timex2norm/CNN_ENG_20030506_163523.22 train,bn/timex2norm/CNN_ENG_20030507_060023.1 train,bn/timex2norm/CNN_ENG_20030507_160538.15 train,bn/timex2norm/CNN_ENG_20030507_170539.0 train,bn/timex2norm/CNN_ENG_20030508_170552.18 train,bn/timex2norm/CNN_ENG_20030508_210555.5 train,bn/timex2norm/CNN_ENG_20030509_090025.5 train,bn/timex2norm/CNN_ENG_20030509_123601.13 train,bn/timex2norm/CNN_ENG_20030512_190454.7 train,bn/timex2norm/CNN_ENG_20030513_080020.2 train,bn/timex2norm/CNN_ENG_20030513_113501.6 train,bn/timex2norm/CNN_ENG_20030513_160506.16 train,bn/timex2norm/CNN_ENG_20030514_130518.5 train,bn/timex2norm/CNN_ENG_20030515_063019.6 train,bn/timex2norm/CNN_ENG_20030515_073019.7 train,bn/timex2norm/CNN_ENG_20030515_193533.6 train,bn/timex2norm/CNN_ENG_20030516_090022.7 train,bn/timex2norm/CNN_ENG_20030516_123543.8 train,bn/timex2norm/CNN_ENG_20030524_143511.4 train,bn/timex2norm/CNN_ENG_20030525_143522.8 train,bn/timex2norm/CNN_ENG_20030525_160525.13 train,bn/timex2norm/CNN_ENG_20030526_133535.4 train,bn/timex2norm/CNN_ENG_20030526_180540.6 train,bn/timex2norm/CNN_ENG_20030526_183538.3 train,bn/timex2norm/CNN_ENG_20030527_195948.3 train,bn/timex2norm/CNN_ENG_20030527_215946.12 train,bn/timex2norm/CNN_ENG_20030528_082823.9 train,bn/timex2norm/CNN_ENG_20030528_125956.8 train,bn/timex2norm/CNN_ENG_20030528_165958.16 train,bn/timex2norm/CNN_ENG_20030528_172957.18 train,bn/timex2norm/CNN_ENG_20030528_195959.20 train,bn/timex2norm/CNN_ENG_20030529_085826.10 train,bn/timex2norm/CNN_ENG_20030529_130011.6 train,bn/timex2norm/CNN_ENG_20030530_130025.12 train,bn/timex2norm/CNN_ENG_20030602_072826.1 train,bn/timex2norm/CNN_ENG_20030602_102826.13 train,bn/timex2norm/CNN_ENG_20030602_105829.2 train,bn/timex2norm/CNN_ENG_20030602_133012.9 train,bn/timex2norm/CNN_ENG_20030603_095830.17 train,bn/timex2norm/CNN_ENG_20030603_133025.7 train,bn/timex2norm/CNN_ENG_20030604_092828.7 train,bn/timex2norm/CNN_ENG_20030604_102828.6 train,bn/timex2norm/CNN_ENG_20030605_065831.18 train,bn/timex2norm/CNN_ENG_20030605_085831.13 train,bn/timex2norm/CNN_ENG_20030605_105831.11 train,bn/timex2norm/CNN_ENG_20030605_193002.8 train,bn/timex2norm/CNN_ENG_20030605_223004.4 train,bn/timex2norm/CNN_ENG_20030607_170312.6 train,bn/timex2norm/CNN_ENG_20030607_173310.4 train,bn/timex2norm/CNN_ENG_20030610_085833.10 train,bn/timex2norm/CNN_ENG_20030610_095857.4 train,bn/timex2norm/CNN_ENG_20030610_105832.1 train,bn/timex2norm/CNN_ENG_20030610_123040.9 train,bn/timex2norm/CNN_ENG_20030610_130042.17 
train,bn/timex2norm/CNN_ENG_20030610_133041.17 train,bn/timex2norm/CNN_ENG_20030611_102832.3 train,bn/timex2norm/CNN_ENG_20030611_102832.4 train,bn/timex2norm/CNN_ENG_20030611_175950.5 train,bn/timex2norm/CNN_ENG_20030612_072835.2 train,bn/timex2norm/CNN_ENG_20030612_160005.13 train,bn/timex2norm/CNN_ENG_20030612_173004.10 train,bn/timex2norm/CNN_ENG_20030612_173004.2 train,bn/timex2norm/CNN_ENG_20030614_173123.4 train,bn/timex2norm/CNN_ENG_20030616_130059.25 train,bn/timex2norm/CNN_ENG_20030617_065838.21 train,bn/timex2norm/CNN_ENG_20030617_105836.4 train,bn/timex2norm/CNN_ENG_20030617_112838.4 train,bn/timex2norm/CNN_ENG_20030617_173115.14 train,bn/timex2norm/CNN_ENG_20030617_173115.22 train,bn/timex2norm/CNN_ENG_20030617_193116.10 train,bn/timex2norm/CNN_ENG_20030618_065839.11 train,bn/timex2norm/CNN_ENG_20030618_150128.5 train,bn/timex2norm/CNN_ENG_20030618_150128.6 train,bn/timex2norm/CNN_ENG_20030618_193127.17 train,bn/timex2norm/CNN_ENG_20030619_115954.10 train,bn/timex2norm/CNN_ENG_20030619_115954.4 train,bn/timex2norm/CNN_ENG_20030619_125955.10 train,bn/timex2norm/CNN_ENG_20030620_095840.4 train,bn/timex2norm/CNN_ENG_20030620_170011.14 train,bn/timex2norm/CNN_ENG_20030621_115841.16 train,bn/timex2norm/CNN_ENG_20030621_160254.25 train,bn/timex2norm/CNN_ENG_20030622_173306.9 train,bn/timex2norm/CNN_ENG_20030624_065843.24 train,bn/timex2norm/CNN_ENG_20030624_082841.12 train,bn/timex2norm/CNN_ENG_20030624_140104.22 train,bn/timex2norm/CNN_ENG_20030624_153103.16 train,bn/timex2norm/CNN_ENG_20030624_153103.17 train,bn/timex2norm/CNN_ENG_20030625_210122.0 train,bn/timex2norm/CNN_ENG_20030625_220123.3 train,bn/timex2norm/CNN_ENG_20030626_193133.8 train,bn/timex2norm/CNN_ENG_20030627_065846.3 train,bn/timex2norm/CNN_ENG_20030627_130145.6 train,bn/timex2norm/CNN_ENG_20030630_075848.7 train,bn/timex2norm/CNN_ENG_20030630_085848.18 train,cts/timex2norm/fsh_29097 train,cts/timex2norm/fsh_29105 train,cts/timex2norm/fsh_29121 train,cts/timex2norm/fsh_29138 train,cts/timex2norm/fsh_29139 train,cts/timex2norm/fsh_29141 train,cts/timex2norm/fsh_29171 train,cts/timex2norm/fsh_29187 train,cts/timex2norm/fsh_29191 train,cts/timex2norm/fsh_29192 train,cts/timex2norm/fsh_29195 train,cts/timex2norm/fsh_29226 train,cts/timex2norm/fsh_29272 train,cts/timex2norm/fsh_29302 train,cts/timex2norm/fsh_29303 train,cts/timex2norm/fsh_29326 train,cts/timex2norm/fsh_29336 train,cts/timex2norm/fsh_29344 train,cts/timex2norm/fsh_29348 train,cts/timex2norm/fsh_29350 train,cts/timex2norm/fsh_29361 train,cts/timex2norm/fsh_29388 train,cts/timex2norm/fsh_29395 train,cts/timex2norm/fsh_29505 train,cts/timex2norm/fsh_29520 train,cts/timex2norm/fsh_29521 train,cts/timex2norm/fsh_29526 train,cts/timex2norm/fsh_29581_1 train,cts/timex2norm/fsh_29586 train,cts/timex2norm/fsh_29592 train,cts/timex2norm/fsh_29601 train,cts/timex2norm/fsh_29622 train,cts/timex2norm/fsh_29628 train,cts/timex2norm/fsh_29630 train,cts/timex2norm/fsh_29770 train,cts/timex2norm/fsh_29774 train,cts/timex2norm/fsh_29782_2 train,cts/timex2norm/fsh_29783 train,cts/timex2norm/fsh_29786 train,nw/timex2norm/APW_ENG_20030326.0190 train,nw/timex2norm/APW_ENG_20030327.0376 train,nw/timex2norm/APW_ENG_20030331.0410 train,nw/timex2norm/APW_ENG_20030403.0862 train,nw/timex2norm/APW_ENG_20030404.0439 train,nw/timex2norm/APW_ENG_20030406.0191 train,nw/timex2norm/APW_ENG_20030407.0030 train,nw/timex2norm/APW_ENG_20030408.0090 train,nw/timex2norm/APW_ENG_20030409.0013 train,nw/timex2norm/APW_ENG_20030410.0906 train,nw/timex2norm/APW_ENG_20030411.0304 
train,nw/timex2norm/APW_ENG_20030412.0531 train,nw/timex2norm/APW_ENG_20030414.0392 train,nw/timex2norm/APW_ENG_20030415.0742 train,nw/timex2norm/APW_ENG_20030416.0581 train,nw/timex2norm/APW_ENG_20030417.0555 train,nw/timex2norm/APW_ENG_20030418.0084 train,nw/timex2norm/APW_ENG_20030419.0358 train,nw/timex2norm/APW_ENG_20030422.0469 train,nw/timex2norm/APW_ENG_20030422.0485 train,nw/timex2norm/APW_ENG_20030423.0079 train,nw/timex2norm/APW_ENG_20030424.0532 train,nw/timex2norm/APW_ENG_20030424.0698 train,nw/timex2norm/APW_ENG_20030502.0470 train,nw/timex2norm/APW_ENG_20030502.0686 train,nw/timex2norm/APW_ENG_20030508.0772 train,nw/timex2norm/APW_ENG_20030510.0228 train,nw/timex2norm/APW_ENG_20030513.0139 train,nw/timex2norm/APW_ENG_20030519.0367 train,nw/timex2norm/APW_ENG_20030519.0548 train,nw/timex2norm/APW_ENG_20030520.0081 train,nw/timex2norm/APW_ENG_20030520.0757 train,nw/timex2norm/APW_ENG_20030527.0232 train,nw/timex2norm/APW_ENG_20030602.0037 train,nw/timex2norm/APW_ENG_20030603.0303 train,nw/timex2norm/APW_ENG_20030610.0010 train,nw/timex2norm/APW_ENG_20030610.0554 train,nw/timex2norm/APW_ENG_20030619.0383 train,nw/timex2norm/NYT_ENG_20030403.0008 train,nw/timex2norm/NYT_ENG_20030602.0074 train,nw/timex2norm/NYT_ENG_20030630.0079 train,nw/timex2norm/XIN_ENG_20030314.0208 train,nw/timex2norm/XIN_ENG_20030317.0177 train,nw/timex2norm/XIN_ENG_20030324.0191 train,nw/timex2norm/XIN_ENG_20030327.0202 train,nw/timex2norm/XIN_ENG_20030408.0341 train,nw/timex2norm/XIN_ENG_20030415.0379 train,nw/timex2norm/XIN_ENG_20030423.0011 train,nw/timex2norm/XIN_ENG_20030425.0184 train,nw/timex2norm/XIN_ENG_20030509.0137 train,nw/timex2norm/XIN_ENG_20030513.0002 train,nw/timex2norm/XIN_ENG_20030523.0202 train,nw/timex2norm/XIN_ENG_20030609.0118 train,nw/timex2norm/XIN_ENG_20030610.0299 train,nw/timex2norm/XIN_ENG_20030616.0274 train,nw/timex2norm/XIN_ENG_20030624.0085 train,un/timex2norm/Austin-Grad-Community_20050212.2454 train,un/timex2norm/Integritas-Group-Community-Forum_20050110.0557 train,un/timex2norm/alt.atheism_20041104.2428 train,un/timex2norm/alt.books.tom-clancy_20050130.1848 train,un/timex2norm/alt.collecting.autographs_20050224.2438 train,un/timex2norm/alt.corel_20041228.0503 train,un/timex2norm/alt.gossip.celebrities_20041118.2331 train,un/timex2norm/alt.gossip.celebrities_20050218.0826 train,un/timex2norm/alt.obituaries_20041121.1339 train,un/timex2norm/alt.politics.economics_20041206.1835 train,un/timex2norm/alt.politics_20050124.0640 train,un/timex2norm/alt.religion.mormon_20050103.0854 train,un/timex2norm/alt.support.divorce_20050113.2451 train,un/timex2norm/alt.sys.pc-clone.dell_20050226.2350 train,un/timex2norm/alt.vacation.las-vegas_20050109.0133 train,un/timex2norm/aus.cars_20041206.0903 train,un/timex2norm/misc.invest.marketplace_20050208.2406 train,un/timex2norm/misc.kids.pregnancy_20050120.0404 train,un/timex2norm/misc.legal.moderated_20041202.1648 train,un/timex2norm/misc.legal.moderated_20050129.2225 train,un/timex2norm/misc.survivalism_20050210.0232 train,un/timex2norm/misc.taxes_20050218.1250 train,un/timex2norm/rec.arts.mystery_20050219.1126 train,un/timex2norm/rec.arts.sf.written.robert-jordan_20050208.1350 train,un/timex2norm/rec.boats_20050130.1006 train,un/timex2norm/rec.music.makers.guitar.acoustic_20041228.1628 train,un/timex2norm/rec.music.phish_20041215.1554 train,un/timex2norm/rec.music.phish_20050217.1804 train,un/timex2norm/rec.parks.theme_20050217.2019 train,un/timex2norm/rec.sport.disc_20050209.2202 train,un/timex2norm/rec.travel.cruises_20050216.1636 
train,un/timex2norm/rec.travel.cruises_20050222.0313 train,un/timex2norm/rec.travel.europe_20050101.1800 train,un/timex2norm/rec.travel.usa-canada_20050128.0121 train,un/timex2norm/seattle.politics_20050122.2412 train,un/timex2norm/soc.culture.china_20050203.0639 train,un/timex2norm/soc.culture.hmong_20050210.1130 train,un/timex2norm/soc.culture.indian_20041104.2348 train,un/timex2norm/soc.culture.iraq_20050211.0445 train,un/timex2norm/soc.culture.jewish_20050130.2105 train,un/timex2norm/soc.history.war.world-war-ii_20050127.2403 train,un/timex2norm/soc.history.what-if_20050129.1404 train,un/timex2norm/talk.politics.misc_20050216.1337 train,un/timex2norm/uk.gay-lesbian-bi_20050127.0311 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1144 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1806 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041201.2313 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041203.1959 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041208.2133 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041215.2302 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041218.0146 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041218.1004 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041223.1449 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041226.1712 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050105.1344 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050106.1310 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050107.2012 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050109.1627 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050113.1400 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050114.1922 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050116.2149 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050124.1354 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050125.0136 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050203.1356 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050205.1954 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050208.1142 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050213.2123 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050224.1207 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050224.2252 train,wl/timex2norm/BACONSREBELLION_20050123.1639 train,wl/timex2norm/BACONSREBELLION_20050125.1108 train,wl/timex2norm/BACONSREBELLION_20050127.1017 train,wl/timex2norm/BACONSREBELLION_20050204.1326 train,wl/timex2norm/BACONSREBELLION_20050205.1919 train,wl/timex2norm/BACONSREBELLION_20050206.1345 train,wl/timex2norm/BACONSREBELLION_20050209.0721 train,wl/timex2norm/BACONSREBELLION_20050210.0728 train,wl/timex2norm/BACONSREBELLION_20050214.0944 train,wl/timex2norm/BACONSREBELLION_20050216.1536 train,wl/timex2norm/BACONSREBELLION_20050216.1618 train,wl/timex2norm/BACONSREBELLION_20050216.1632 train,wl/timex2norm/BACONSREBELLION_20050217.0744 train,wl/timex2norm/BACONSREBELLION_20050218.0848 train,wl/timex2norm/BACONSREBELLION_20050218.1214 train,wl/timex2norm/BACONSREBELLION_20050222.1348 train,wl/timex2norm/BACONSREBELLION_20050227.1238 train,wl/timex2norm/FLOPPINGACES_20041113.1528.042 train,wl/timex2norm/FLOPPINGACES_20041114.1240.039 train,wl/timex2norm/FLOPPINGACES_20041115.1613.032 train,wl/timex2norm/FLOPPINGACES_20041116.0833.027 train,wl/timex2norm/FLOPPINGACES_20041228.0927.010 train,wl/timex2norm/FLOPPINGACES_20041230.1844.003 train,wl/timex2norm/FLOPPINGACES_20050101.2244.048 train,wl/timex2norm/GETTINGPOLITICAL_20050105.0127.001 train,wl/timex2norm/HEALINGIRAQ_20041108.1942.05 train,wl/timex2norm/MARKBACKER_20041103.1300 train,wl/timex2norm/MARKBACKER_20041108.1507 train,wl/timex2norm/MARKBACKER_20041112.0707 
train,wl/timex2norm/MARKBACKER_20041117.0723 train,wl/timex2norm/MARKBACKER_20041117.1107 train,wl/timex2norm/MARKBACKER_20041119.1002 train,wl/timex2norm/MARKBACKER_20041128.1641 train,wl/timex2norm/MARKBACKER_20041202.0711 train,wl/timex2norm/MARKBACKER_20041206.0733 train,wl/timex2norm/MARKBACKER_20041216.0656 train,wl/timex2norm/MARKBACKER_20041217.1639 train,wl/timex2norm/MARKBACKER_20041220.0919 train,wl/timex2norm/MARKBACKER_20050103.0829 train,wl/timex2norm/MARKBACKER_20050105.1526 train,wl/timex2norm/MARKBACKER_20050105.1632 train,wl/timex2norm/MARKBACKER_20050217.0647 train,wl/timex2norm/MARKETVIEW_20041209.1401 train,wl/timex2norm/MARKETVIEW_20041211.1845 train,wl/timex2norm/MARKETVIEW_20041212.1447 train,wl/timex2norm/MARKETVIEW_20041213.0722 train,wl/timex2norm/MARKETVIEW_20041215.2128 train,wl/timex2norm/MARKETVIEW_20041217.0801 train,wl/timex2norm/MARKETVIEW_20041219.1509 train,wl/timex2norm/MARKETVIEW_20041220.1537 train,wl/timex2norm/MARKETVIEW_20050105.1901 train,wl/timex2norm/MARKETVIEW_20050120.1641 train,wl/timex2norm/MARKETVIEW_20050126.0711 train,wl/timex2norm/MARKETVIEW_20050127.0716 train,wl/timex2norm/MARKETVIEW_20050201.0748 train,wl/timex2norm/MARKETVIEW_20050204.1322 train,wl/timex2norm/MARKETVIEW_20050204.1337 train,wl/timex2norm/MARKETVIEW_20050204.1736 train,wl/timex2norm/MARKETVIEW_20050205.1358 train,wl/timex2norm/MARKETVIEW_20050206.1951 train,wl/timex2norm/MARKETVIEW_20050206.2009 train,wl/timex2norm/MARKETVIEW_20050207.0746 train,wl/timex2norm/MARKETVIEW_20050208.2033 train,wl/timex2norm/MARKETVIEW_20050208.2059 train,wl/timex2norm/MARKETVIEW_20050209.1923 train,wl/timex2norm/MARKETVIEW_20050210.2138 train,wl/timex2norm/MARKETVIEW_20050212.1607 train,wl/timex2norm/MARKETVIEW_20050212.1717 train,wl/timex2norm/MARKETVIEW_20050214.2115 train,wl/timex2norm/MARKETVIEW_20050215.1858 train,wl/timex2norm/MARKETVIEW_20050216.2120 train,wl/timex2norm/MARKETVIEW_20050217.2115 train,wl/timex2norm/MARKETVIEW_20050222.0729 train,wl/timex2norm/MARKETVIEW_20050222.1919 train,wl/timex2norm/MARKETVIEW_20050225.0541 train,wl/timex2norm/MARKETVIEW_20050226.1307 train,wl/timex2norm/MARKETVIEW_20050226.1444 train,wl/timex2norm/MARKETVIEW_20050228.2211 train,wl/timex2norm/OIADVANTAGE_20041224.1007 train,wl/timex2norm/OIADVANTAGE_20050103.0944 train,wl/timex2norm/OIADVANTAGE_20050105.0922 train,wl/timex2norm/OIADVANTAGE_20050108.1323 train,wl/timex2norm/OIADVANTAGE_20050109.1947 train,wl/timex2norm/OIADVANTAGE_20050110.1009 train,wl/timex2norm/OIADVANTAGE_20050203.1000 train,wl/timex2norm/OIADVANTAGE_20050203.2102 train,wl/timex2norm/OIADVANTAGE_20050204.1155 train,bc/timex2norm/CNN_CF_20030304.1900.01 train,un/timex2norm/marcellapr_20050211.2013 train,wl/timex2norm/BACONSREBELLION_20050222.0817 train,wl/timex2norm/BACONSREBELLION_20050226.1317 train,bn/timex2norm/CNN_ENG_20030626_203133.11 train,bn/timex2norm/CNN_ENG_20030605_153000.9 train,bn/timex2norm/CNN_ENG_20030411_070039.21 train,bn/timex2norm/CNNHL_ENG_20030410_193626.13 train,nw/timex2norm/AFP_ENG_20030330.0211 train,nw/timex2norm/AFP_ENG_20030323.0020 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import re 4 | from parser import Parser 5 | import json 6 | from stanfordcorenlp import StanfordCoreNLP 7 | import argparse 8 | from tqdm import tqdm 9 | import traceback 10 | 11 | 12 | def get_data_paths(ace2005_path): 13 | test_files, dev_files, train_files = [], 
[], [] 14 | with open('./data_list.csv', mode='r') as csv_file: 15 | rows = csv_file.readlines() 16 | for row in rows[1:]: 17 | items = row.replace('\n', '').split(',') 18 | data_type = items[0] 19 | name = items[1] 20 | 21 | path = os.path.join(ace2005_path, name) 22 | if data_type == 'test': 23 | test_files.append(path) 24 | elif data_type == 'dev': 25 | dev_files.append(path) 26 | elif data_type == 'train': 27 | train_files.append(path) 28 | return test_files, dev_files, train_files 29 | 30 | 31 | def find_token_index(tokens, start_pos, end_pos, phrase): 32 | start_idx, end_idx = -1, -1 33 | for idx, token in enumerate(tokens): 34 | if token['characterOffsetBegin'] <= start_pos: 35 | start_idx = idx 36 | 37 | assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens) 38 | chars = '' 39 | 40 | def remove_punc(s): 41 | s = re.sub(r'[^\w]', '', s) 42 | return s 43 | 44 | for i in range(0, len(tokens) - start_idx): 45 | chars += remove_punc(tokens[start_idx + i]['originalText']) 46 | if remove_punc(phrase) in chars: 47 | end_idx = start_idx + i + 1 48 | break 49 | 50 | assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars:{}".format(end_idx, end_pos, phrase, tokens, chars) 51 | return start_idx, end_idx 52 | 53 | 54 | def verify_result(data): 55 | def remove_punctuation(s): 56 | for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']: 57 | s = s.replace(c, '') 58 | s = re.sub(r'[^\w]', '', s) 59 | return s 60 | 61 | def check_diff(words, phrase): 62 | return remove_punctuation(phrase) not in remove_punctuation(words) 63 | 64 | for item in data: 65 | words = item['words'] 66 | for entity_mention in item['golden-entity-mentions']: 67 | if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')): 68 | print('============================') 69 | print('[Warning] entity has invalid start/end') 70 | print('Expected: ', entity_mention['text']) 71 | print('Actual:', words[entity_mention['start']:entity_mention['end']]) 72 | print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words)) 73 | 74 | for event_mention in item['golden-event-mentions']: 75 | trigger = event_mention['trigger'] 76 | if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')): 77 | print('============================') 78 | print('[Warning] trigger has invalid start/end') 79 | print('Expected: ', trigger['text']) 80 | print('Actual:', words[trigger['start']:trigger['end']]) 81 | print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words)) 82 | for argument in event_mention['arguments']: 83 | if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')): 84 | print('============================') 85 | print('[Warning] argument has invalid start/end') 86 | print('Expected: ', argument['text']) 87 | print('Actual:', words[argument['start']:argument['end']]) 88 | print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words)) 89 | 90 | print('Complete verification') 91 | 92 | 93 | def preprocessing(data_type, files): 94 | result = [] 95 | event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0 96 | 97 | print('=' * 20) 98 | print('[preprocessing] type: ', data_type) 99 | for file in tqdm(files): 100 | parser = Parser(path=file) 101 | 102 | entity_count += len(parser.entity_mentions) 103 | 
event_count += len(parser.event_mentions) 104 | sent_count += len(parser.sents_with_pos) 105 | 106 | for item in parser.get_data(): 107 | data = dict() 108 | data['sentence'] = item['sentence'] 109 | data['golden-entity-mentions'] = [] 110 | data['golden-event-mentions'] = [] 111 | 112 | try: 113 | nlp_res_raw = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'}) 114 | nlp_res = json.loads(nlp_res_raw) 115 | except Exception as e: 116 | print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.') 117 | print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1') 118 | continue 119 | 120 | tokens = nlp_res['sentences'][0]['tokens'] 121 | 122 | if len(nlp_res['sentences']) >= 2: 123 | # TODO: issue where the sentence segmentation of NTLK and StandfordCoreNLP do not match 124 | # This error occurred so little that it was temporarily ignored (< 20 sentences). 125 | continue 126 | 127 | data['stanford-colcc'] = [] 128 | for dep in nlp_res['sentences'][0]['enhancedPlusPlusDependencies']: 129 | data['stanford-colcc'].append('{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1)) 130 | 131 | data['words'] = list(map(lambda x: x['word'], tokens)) 132 | data['pos-tags'] = list(map(lambda x: x['pos'], tokens)) 133 | data['lemma'] = list(map(lambda x: x['lemma'], tokens)) 134 | data['parse'] = nlp_res['sentences'][0]['parse'] 135 | 136 | sent_start_pos = item['position'][0] 137 | 138 | for entity_mention in item['golden-entity-mentions']: 139 | position = entity_mention['position'] 140 | start_idx, end_idx = find_token_index( 141 | tokens=tokens, 142 | start_pos=position[0] - sent_start_pos, 143 | end_pos=position[1] - sent_start_pos + 1, 144 | phrase=entity_mention['text'], 145 | ) 146 | 147 | entity_mention['start'] = start_idx 148 | entity_mention['end'] = end_idx 149 | 150 | del entity_mention['position'] 151 | 152 | # head 153 | head_position = entity_mention["head"]["position"] 154 | 155 | head_start_idx, head_end_idx = find_token_index( 156 | tokens=tokens, 157 | start_pos=head_position[0] - sent_start_pos, 158 | end_pos=head_position[1] - sent_start_pos + 1, 159 | phrase=entity_mention["head"]["text"] 160 | ) 161 | 162 | entity_mention["head"]["start"] = head_start_idx 163 | entity_mention["head"]["end"] = head_end_idx 164 | del entity_mention["head"]["position"] 165 | 166 | data['golden-entity-mentions'].append(entity_mention) 167 | 168 | for event_mention in item['golden-event-mentions']: 169 | # same event mention can be shared 170 | event_mention = copy.deepcopy(event_mention) 171 | position = event_mention['trigger']['position'] 172 | start_idx, end_idx = find_token_index( 173 | tokens=tokens, 174 | start_pos=position[0] - sent_start_pos, 175 | end_pos=position[1] - sent_start_pos + 1, 176 | phrase=event_mention['trigger']['text'], 177 | ) 178 | 179 | event_mention['trigger']['start'] = start_idx 180 | event_mention['trigger']['end'] = end_idx 181 | del event_mention['trigger']['position'] 182 | del event_mention['position'] 183 | 184 | arguments = [] 185 | argument_count += len(event_mention['arguments']) 186 | for argument in event_mention['arguments']: 187 | position = argument['position'] 188 | start_idx, end_idx = find_token_index( 189 | tokens=tokens, 190 | start_pos=position[0] - sent_start_pos, 191 | end_pos=position[1] - sent_start_pos + 1, 192 | phrase=argument['text'], 193 | ) 194 | 195 | argument['start'] = 
start_idx 196 | argument['end'] = end_idx 197 | del argument['position'] 198 | 199 | arguments.append(argument) 200 | 201 | event_mention['arguments'] = arguments 202 | data['golden-event-mentions'].append(event_mention) 203 | 204 | result.append(data) 205 | 206 | print('======[Statistics]======') 207 | print('sent :', sent_count) 208 | print('event :', event_count) 209 | print('entity :', entity_count) 210 | print('argument:', argument_count) 211 | 212 | verify_result(result) 213 | with open('output/{}.json'.format(data_type), 'w') as f: 214 | json.dump(result, f, indent=2) 215 | 216 | 217 | if __name__ == '__main__': 218 | parser = argparse.ArgumentParser() 219 | parser.add_argument('--data', help="Path of ACE2005 English data", default='./data/ace_2005_td_v7/data/English') 220 | parser.add_argument('--nlp', help="Standford Core Nlp path", default='./stanford-corenlp-full-2018-10-05') 221 | args = parser.parse_args() 222 | test_files, dev_files, train_files = get_data_paths(args.data) 223 | 224 | with StanfordCoreNLP(args.nlp, memory='8g', timeout=60000) as nlp: 225 | # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'}) 226 | # print(res) 227 | preprocessing('dev', dev_files) 228 | preprocessing('test', test_files) 229 | preprocessing('train', train_files) 230 | -------------------------------------------------------------------------------- /output/sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "sentence": "Earlier documents in the case have included embarrassing details about perks Welch received as part of his retirement package from GE at a time when corporate scandals were sparking outrage.", 4 | "golden-entity-mentions": [ 5 | { 6 | "text": "Welch", 7 | "entity-type": "PER:Individual", 8 | "head": { 9 | "text": "Welch", 10 | "start": 11, 11 | "end": 12 12 | }, 13 | "entity_id": "APW_ENG_20030325.0786-E24-38", 14 | "start": 11, 15 | "end": 12 16 | }, 17 | { 18 | "text": "his", 19 | "entity-type": "PER:Individual", 20 | "head": { 21 | "text": "his", 22 | "start": 16, 23 | "end": 17 24 | }, 25 | "entity_id": "APW_ENG_20030325.0786-E24-39", 26 | "start": 16, 27 | "end": 17 28 | }, 29 | { 30 | "text": "GE", 31 | "entity-type": "ORG:Commercial", 32 | "head": { 33 | "text": "GE", 34 | "start": 20, 35 | "end": 21 36 | }, 37 | "entity_id": "APW_ENG_20030325.0786-E26-40", 38 | "start": 20, 39 | "end": 21 40 | } 41 | ], 42 | "golden-event-mentions": [ 43 | { 44 | "trigger": { 45 | "text": "retirement", 46 | "start": 17, 47 | "end": 18 48 | }, 49 | "arguments": [ 50 | { 51 | "role": "Person", 52 | "entity-type": "PER:Individual", 53 | "text": "Welch", 54 | "start": 11, 55 | "end": 12 56 | }, 57 | { 58 | "role": "Entity", 59 | "entity-type": "ORG:Commercial", 60 | "text": "GE", 61 | "start": 20, 62 | "end": 21 63 | } 64 | ], 65 | "event_type": "Personnel:End-Position" 66 | } 67 | ], 68 | "stanford-colcc": [ 69 | "ROOT/dep=6/gov=-1", 70 | "amod/dep=0/gov=1", 71 | "nsubj/dep=1/gov=6", 72 | "case/dep=2/gov=4", 73 | "det/dep=3/gov=4", 74 | "nmod:in/dep=4/gov=1", 75 | "aux/dep=5/gov=6", 76 | "amod/dep=7/gov=8", 77 | "dobj/dep=8/gov=6", 78 | "case/dep=9/gov=10", 79 | "nmod:about/dep=10/gov=6", 80 | "nsubj/dep=11/gov=12", 81 | "acl:relcl/dep=12/gov=10", 82 | "case/dep=13/gov=14", 83 | "nmod:as/dep=14/gov=12", 84 | "case/dep=15/gov=18", 85 | "nmod:poss/dep=16/gov=18", 86 | "compound/dep=17/gov=18", 87 | "nmod:of/dep=18/gov=14", 88 | 
"case/dep=19/gov=20", 89 | "nmod:from/dep=20/gov=12", 90 | "case/dep=21/gov=23", 91 | "det/dep=22/gov=23", 92 | "nmod:at/dep=23/gov=12", 93 | "advmod/dep=24/gov=28", 94 | "amod/dep=25/gov=26", 95 | "nsubj/dep=26/gov=28", 96 | "aux/dep=27/gov=28", 97 | "acl:relcl/dep=28/gov=23", 98 | "dobj/dep=29/gov=28", 99 | "punct/dep=30/gov=6" 100 | ], 101 | "words": [ 102 | "Earlier", 103 | "documents", 104 | "in", 105 | "the", 106 | "case", 107 | "have", 108 | "included", 109 | "embarrassing", 110 | "details", 111 | "about", 112 | "perks", 113 | "Welch", 114 | "received", 115 | "as", 116 | "part", 117 | "of", 118 | "his", 119 | "retirement", 120 | "package", 121 | "from", 122 | "GE", 123 | "at", 124 | "a", 125 | "time", 126 | "when", 127 | "corporate", 128 | "scandals", 129 | "were", 130 | "sparking", 131 | "outrage", 132 | "." 133 | ], 134 | "pos-tags": [ 135 | "JJR", 136 | "NNS", 137 | "IN", 138 | "DT", 139 | "NN", 140 | "VBP", 141 | "VBN", 142 | "JJ", 143 | "NNS", 144 | "IN", 145 | "NNS", 146 | "NNP", 147 | "VBD", 148 | "IN", 149 | "NN", 150 | "IN", 151 | "PRP$", 152 | "NN", 153 | "NN", 154 | "IN", 155 | "NNP", 156 | "IN", 157 | "DT", 158 | "NN", 159 | "WRB", 160 | "JJ", 161 | "NNS", 162 | "VBD", 163 | "VBG", 164 | "NN", 165 | "." 166 | ], 167 | "lemma": [ 168 | "earlier", 169 | "document", 170 | "in", 171 | "the", 172 | "case", 173 | "have", 174 | "include", 175 | "embarrassing", 176 | "detail", 177 | "about", 178 | "perk", 179 | "Welch", 180 | "receive", 181 | "as", 182 | "part", 183 | "of", 184 | "he", 185 | "retirement", 186 | "package", 187 | "from", 188 | "GE", 189 | "at", 190 | "a", 191 | "time", 192 | "when", 193 | "corporate", 194 | "scandal", 195 | "be", 196 | "spark", 197 | "outrage", 198 | "." 199 | ], 200 | "parse": "(ROOT\n (S\n (NP\n (NP (JJR Earlier) (NNS documents))\n (PP (IN in)\n (NP (DT the) (NN case))))\n (VP (VBP have)\n (VP (VBN included)\n (NP (JJ embarrassing) (NNS details))\n (PP (IN about)\n (NP\n (NP (NNS perks))\n (SBAR\n (S\n (NP (NNP Welch))\n (VP (VBD received)\n (PP (IN as)\n (NP\n (NP (NN part))\n (PP (IN of)\n (NP (PRP$ his) (NN retirement) (NN package)))))\n (PP (IN from)\n (NP (NNP GE)))\n (PP (IN at)\n (NP\n (NP (DT a) (NN time))\n (SBAR\n (WHADVP (WRB when))\n (S\n (NP (JJ corporate) (NNS scandals))\n (VP (VBD were)\n (VP (VBG sparking)\n (NP (NN outrage)))))))))))))))\n (. 
.)))" 201 | } 202 | ] 203 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | from xml.etree import ElementTree 2 | from bs4 import BeautifulSoup 3 | import nltk 4 | import json 5 | import re 6 | 7 | 8 | class Parser: 9 | def __init__(self, path): 10 | self.path = path 11 | self.entity_mentions = [] 12 | self.event_mentions = [] 13 | self.sentences = [] 14 | self.sgm_text = '' 15 | 16 | self.entity_mentions, self.event_mentions = self.parse_xml(path + '.apf.xml') 17 | self.sents_with_pos = self.parse_sgm(path + '.sgm') 18 | self.fix_wrong_position() 19 | 20 | @staticmethod 21 | def clean_text(text): 22 | return text.replace('\n', ' ') 23 | 24 | def get_data(self): 25 | data = [] 26 | for sent in self.sents_with_pos: 27 | item = dict() 28 | 29 | item['sentence'] = self.clean_text(sent['text']) 30 | item['position'] = sent['position'] 31 | text_position = sent['position'] 32 | 33 | for i, s in enumerate(item['sentence']): 34 | if s != ' ': 35 | item['position'][0] += i 36 | break 37 | 38 | item['sentence'] = item['sentence'].strip() 39 | 40 | entity_map = dict() 41 | item['golden-entity-mentions'] = [] 42 | item['golden-event-mentions'] = [] 43 | 44 | for entity_mention in self.entity_mentions: 45 | entity_position = entity_mention['position'] 46 | 47 | if text_position[0] <= entity_position[0] and entity_position[1] <= text_position[1]: 48 | 49 | item['golden-entity-mentions'].append({ 50 | 'text': self.clean_text(entity_mention['text']), 51 | 'position': entity_position, 52 | 'entity-type': entity_mention['entity-type'], 53 | 'head': { 54 | "text": self.clean_text(entity_mention['head']["text"]), 55 | "position": entity_mention["head"]["position"] 56 | }, 57 | "entity_id": entity_mention['entity-id'] 58 | }) 59 | entity_map[entity_mention['entity-id']] = entity_mention 60 | 61 | for event_mention in self.event_mentions: 62 | event_position = event_mention['trigger']['position'] 63 | if text_position[0] <= event_position[0] and event_position[1] <= text_position[1]: 64 | event_arguments = [] 65 | for argument in event_mention['arguments']: 66 | try: 67 | entity_type = entity_map[argument['entity-id']]['entity-type'] 68 | except KeyError: 69 | print('[Warning] The entity in the other sentence is mentioned. This argument will be ignored.') 70 | continue 71 | 72 | event_arguments.append({ 73 | 'role': argument['role'], 74 | 'position': argument['position'], 75 | 'entity-type': entity_type, 76 | 'text': self.clean_text(argument['text']), 77 | }) 78 | 79 | item['golden-event-mentions'].append({ 80 | 'trigger': event_mention['trigger'], 81 | 'arguments': event_arguments, 82 | 'position': event_position, 83 | 'event_type': event_mention['event_type'], 84 | }) 85 | data.append(item) 86 | return data 87 | 88 | def find_correct_offset(self, sgm_text, start_index, text): 89 | offset = 0 90 | for i in range(0, 70): 91 | for j in [-1, 1]: 92 | offset = i * j 93 | if sgm_text[start_index + offset:start_index + offset + len(text)] == text: 94 | return offset 95 | 96 | print('[Warning] fail to find offset! 
(start_index: {}, text: {}, path: {})'.format(start_index, text, self.path)) 97 | return offset 98 | 99 | def fix_wrong_position(self): 100 | for entity_mention in self.entity_mentions: 101 | offset = self.find_correct_offset( 102 | sgm_text=self.sgm_text, 103 | start_index=entity_mention['position'][0], 104 | text=entity_mention['text']) 105 | 106 | entity_mention['position'][0] += offset 107 | entity_mention['position'][1] += offset 108 | entity_mention['head']["position"][0] += offset 109 | entity_mention['head']["position"][1] += offset 110 | 111 | for event_mention in self.event_mentions: 112 | offset1 = self.find_correct_offset( 113 | sgm_text=self.sgm_text, 114 | start_index=event_mention['trigger']['position'][0], 115 | text=event_mention['trigger']['text']) 116 | event_mention['trigger']['position'][0] += offset1 117 | event_mention['trigger']['position'][1] += offset1 118 | 119 | for argument in event_mention['arguments']: 120 | offset2 = self.find_correct_offset( 121 | sgm_text=self.sgm_text, 122 | start_index=argument['position'][0], 123 | text=argument['text']) 124 | argument['position'][0] += offset2 125 | argument['position'][1] += offset2 126 | 127 | def parse_sgm(self, sgm_path): 128 | with open(sgm_path, 'r') as f: 129 | soup = BeautifulSoup(f.read(), features='html.parser') 130 | self.sgm_text = soup.text 131 | 132 | doc_type = soup.doc.doctype.text.strip() 133 | 134 | def remove_tags(selector): 135 | tags = soup.findAll(selector) 136 | for tag in tags: 137 | tag.extract() 138 | 139 | if doc_type == 'WEB TEXT': 140 | remove_tags('poster') 141 | remove_tags('postdate') 142 | remove_tags('subject') 143 | elif doc_type in ['CONVERSATION', 'STORY']: 144 | remove_tags('speaker') 145 | 146 | sents = [] 147 | converted_text = soup.text 148 | 149 | for sent in nltk.sent_tokenize(converted_text): 150 | sents.extend(sent.split('\n\n')) 151 | sents = list(filter(lambda x: len(x) > 5, sents)) 152 | sents = sents[1:] 153 | sents_with_pos = [] 154 | last_pos = 0 155 | for sent in sents: 156 | pos = self.sgm_text.find(sent, last_pos) 157 | last_pos = pos 158 | sents_with_pos.append({ 159 | 'text': sent, 160 | 'position': [pos, pos + len(sent)] 161 | }) 162 | 163 | return sents_with_pos 164 | 165 | def parse_xml(self, xml_path): 166 | entity_mentions, event_mentions = [], [] 167 | tree = ElementTree.parse(xml_path) 168 | root = tree.getroot() 169 | 170 | for child in root[0]: 171 | if child.tag == 'entity': 172 | entity_mentions.extend(self.parse_entity_tag(child)) 173 | elif child.tag in ['value', 'timex2']: 174 | entity_mentions.extend(self.parse_value_timex_tag(child)) 175 | elif child.tag == 'event': 176 | event_mentions.extend(self.parse_event_tag(child)) 177 | 178 | return entity_mentions, event_mentions 179 | 180 | @staticmethod 181 | def parse_entity_tag(node): 182 | entity_mentions = [] 183 | 184 | for child in node: 185 | if child.tag != 'entity_mention': 186 | continue 187 | extent = child[0] 188 | head = child[1] 189 | charset = extent[0] 190 | head_charset = head[0] 191 | 192 | entity_mention = dict() 193 | entity_mention['entity-id'] = child.attrib['ID'] 194 | entity_mention['entity-type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE']) 195 | entity_mention['text'] = charset.text 196 | entity_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 197 | entity_mention["head"] = {"text": head_charset.text, 198 | "position": [int(head_charset.attrib['START']), int(head_charset.attrib['END'])]} 199 | 200 | 
entity_mentions.append(entity_mention) 201 | 202 | return entity_mentions 203 | 204 | @staticmethod 205 | def parse_event_tag(node): 206 | event_mentions = [] 207 | for child in node: 208 | if child.tag == 'event_mention': 209 | event_mention = dict() 210 | event_mention['event_type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE']) 211 | event_mention['arguments'] = [] 212 | for child2 in child: 213 | if child2.tag == 'ldc_scope': 214 | charset = child2[0] 215 | event_mention['text'] = charset.text 216 | event_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 217 | if child2.tag == 'anchor': 218 | charset = child2[0] 219 | event_mention['trigger'] = { 220 | 'text': charset.text, 221 | 'position': [int(charset.attrib['START']), int(charset.attrib['END'])], 222 | } 223 | if child2.tag == 'event_mention_argument': 224 | extent = child2[0] 225 | charset = extent[0] 226 | event_mention['arguments'].append({ 227 | 'text': charset.text, 228 | 'position': [int(charset.attrib['START']), int(charset.attrib['END'])], 229 | 'role': child2.attrib['ROLE'], 230 | 'entity-id': child2.attrib['REFID'], 231 | }) 232 | event_mentions.append(event_mention) 233 | return event_mentions 234 | 235 | @staticmethod 236 | def parse_value_timex_tag(node): 237 | entity_mentions = [] 238 | 239 | for child in node: 240 | extent = child[0] 241 | charset = extent[0] 242 | 243 | entity_mention = dict() 244 | entity_mention['entity-id'] = child.attrib['ID'] 245 | 246 | if 'TYPE' in node.attrib: 247 | entity_mention['entity-type'] = node.attrib['TYPE'] 248 | if 'SUBTYPE' in node.attrib: 249 | entity_mention['entity-type'] += ':{}'.format(node.attrib['SUBTYPE']) 250 | if child.tag == 'timex2_mention': 251 | entity_mention['entity-type'] = 'TIM:time' 252 | 253 | entity_mention['text'] = charset.text 254 | entity_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 255 | 256 | entity_mention["head"] = {"text": charset.text, 257 | "position": [int(charset.attrib['START']), int(charset.attrib['END'])]} 258 | 259 | entity_mentions.append(entity_mention) 260 | 261 | return entity_mentions 262 | 263 | 264 | if __name__ == '__main__': 265 | # parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331') 266 | parser = Parser('./data/ace_2005_td_v7/data/English/un/timex2norm/alt.corel_20041228.0503') 267 | data = parser.get_data() 268 | with open('./output/debug.json', 'w') as f: 269 | json.dump(data, f, indent=2) 270 | 271 | # index = parser.sgm_text.find("Diego Garcia") 272 | # print('index :', index) 273 | # print(parser.sgm_text[1918 - 30:]) 274 | --------------------------------------------------------------------------------
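For reference, a minimal sketch (not part of this repository) of how the character offsets produced by `Parser.parse_sgm()` relate to the raw `.sgm` text: it loads the same document used in the `__main__` debug block of `parser.py` and slices each sentence back out of `sgm_text` by its `position` range.

```python
from parser import Parser

# Minimal sketch: check that sentence character offsets slice back out of the
# raw .sgm text. Assumes the ACE 2005 data is unpacked under ./data as in the README.
parser = Parser('./data/ace_2005_td_v7/data/English/un/timex2norm/alt.corel_20041228.0503')

for sent in parser.sents_with_pos[:5]:
    start, end = sent['position']
    recovered = parser.sgm_text[start:end]
    # Sentence positions are located with str.find() on sgm_text, so these normally match;
    # offsets from the .apf.xml annotations are corrected separately by fix_wrong_position().
    print(recovered == sent['text'], start, end, sent['text'][:60])
```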