├── .gitignore ├── LICENSE ├── README.md ├── data_list.csv ├── main.py ├── output └── sample.json └── parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | venv/ 3 | .idea/ 4 | 5 | data/ace_2005_td_v7 6 | 7 | output/dev.json 8 | output/train.json 9 | output/dev.json 10 | output/debug.json 11 | 12 | test.json 13 | 14 | stanford-corenlp-full-2018-10-05.zip 15 | stanford-corenlp-full-2018-10-05/ 16 | 17 | test/ 18 | analysis/ 19 | logdir/ 20 | baseline/ 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 swyoon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ACE2005 preprocessing 2 | 3 | This is simple code for preprocessing the ACE 2005 corpus for the Event Extraction task. 4 | 5 | Using the existing methods was complicated for me, so I made this project. 6 | 7 | ## Prerequisites 8 | 9 | 1. Prepare the **ACE 2005 dataset**. 10 | 11 | (Download: https://catalog.ldc.upenn.edu/LDC2006T06. Note that the ACE 2005 dataset is not free.) 12 | 13 | 2. Install the packages. 14 | ``` 15 | pip install stanfordcorenlp beautifulsoup4 nltk tqdm 16 | ``` 17 | 18 | 3. Download the Stanford CoreNLP model. 19 | ```bash 20 | wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 21 | unzip stanford-corenlp-full-2018-10-05.zip 22 | ``` 23 | 24 | ## Usage 25 | 26 | Run: 27 | 28 | ```bash 29 | sudo python main.py --data=./data/ace_2005_td_v7/data/English --nlp=./stanford-corenlp-full-2018-10-05 30 | ``` 31 | 32 | - Then you can find the parsed data in the `output` directory. 33 | 34 | - If it is not executed with `sudo`, an error can occur when using `stanford-corenlp`. 35 | 36 | - It takes about 30 minutes to complete the preprocessing. 37 | 38 | ## Output 39 | 40 | ### Format 41 | 42 | I follow the JSON format described in the 43 | [EMNLP2018-JMEE](https://github.com/lx865712528/EMNLP2018-JMEE) 44 | repository, as in the sample below. In addition, I add an entity head for 45 | each entity, because many NLP tasks exploit the head of an entity rather 46 | than its full mention.
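For illustration only (not part of this repository): the minimal sketch below shows how the added head field can be consumed. It loads the bundled `output/sample.json` (the same format shown in the sample further down) and prints each entity's full mention span next to its head span; both are token-index ranges into the sentence's `words` list.

```python
import json

# Minimal sketch: compare each entity's full mention with its head.
# "start"/"end" and "head.start"/"head.end" are token indices into "words".
with open('output/sample.json') as f:
    data = json.load(f)

for sent in data:
    words = sent['words']
    for entity in sent['golden-entity-mentions']:
        mention = ' '.join(words[entity['start']:entity['end']])
        head = ' '.join(words[entity['head']['start']:entity['head']['end']])
        print(entity['entity-type'], '| mention:', mention, '| head:', head)
```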
47 | 48 | If you want to know event types and arguments in detail, read [this document (ACE 2005 event guidelines)](https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/english-events-guidelines-v5.4.3.pdf). 49 | 50 | 51 | **`sample.json`** 52 | ```json 53 | [ 54 | { 55 | "sentence": "Earlier documents in the case have included embarrassing details about perks Welch received as part of his retirement package from GE at a time when corporate scandals were sparking outrage.", 56 | "golden-entity-mentions": [ 57 | { 58 | "text": "Welch", 59 | "entity-type": "PER:Individual", 60 | "head": { 61 | "text": "Welch", 62 | "start": 11, 63 | "end": 12 64 | }, 65 | "entity_id": "APW_ENG_20030325.0786-E24-38", 66 | "start": 11, 67 | "end": 12 68 | }, 69 | { 70 | "text": "his", 71 | "entity-type": "PER:Individual", 72 | "head": { 73 | "text": "his", 74 | "start": 16, 75 | "end": 17 76 | }, 77 | "entity_id": "APW_ENG_20030325.0786-E24-39", 78 | "start": 16, 79 | "end": 17 80 | }, 81 | { 82 | "text": "GE", 83 | "entity-type": "ORG:Commercial", 84 | "head": { 85 | "text": "GE", 86 | "start": 20, 87 | "end": 21 88 | }, 89 | "entity_id": "APW_ENG_20030325.0786-E26-40", 90 | "start": 20, 91 | "end": 21 92 | } 93 | ], 94 | "golden-event-mentions": [ 95 | { 96 | "trigger": { 97 | "text": "retirement", 98 | "start": 17, 99 | "end": 18 100 | }, 101 | "arguments": [ 102 | { 103 | "role": "Person", 104 | "entity-type": "PER:Individual", 105 | "text": "Welch", 106 | "start": 11, 107 | "end": 12 108 | }, 109 | { 110 | "role": "Entity", 111 | "entity-type": "ORG:Commercial", 112 | "text": "GE", 113 | "start": 20, 114 | "end": 21 115 | } 116 | ], 117 | "event_type": "Personnel:End-Position" 118 | } 119 | ], 120 | "stanford-colcc": [ 121 | "ROOT/dep=6/gov=-1", 122 | "amod/dep=0/gov=1", 123 | "nsubj/dep=1/gov=6", 124 | "case/dep=2/gov=4", 125 | "det/dep=3/gov=4", 126 | "nmod:in/dep=4/gov=1", 127 | "aux/dep=5/gov=6", 128 | "amod/dep=7/gov=8", 129 | "dobj/dep=8/gov=6", 130 | "case/dep=9/gov=10", 131 | "nmod:about/dep=10/gov=6", 132 | "nsubj/dep=11/gov=12", 133 | "acl:relcl/dep=12/gov=10", 134 | "case/dep=13/gov=14", 135 | "nmod:as/dep=14/gov=12", 136 | "case/dep=15/gov=18", 137 | "nmod:poss/dep=16/gov=18", 138 | "compound/dep=17/gov=18", 139 | "nmod:of/dep=18/gov=14", 140 | "case/dep=19/gov=20", 141 | "nmod:from/dep=20/gov=12", 142 | "case/dep=21/gov=23", 143 | "det/dep=22/gov=23", 144 | "nmod:at/dep=23/gov=12", 145 | "advmod/dep=24/gov=28", 146 | "amod/dep=25/gov=26", 147 | "nsubj/dep=26/gov=28", 148 | "aux/dep=27/gov=28", 149 | "acl:relcl/dep=28/gov=23", 150 | "dobj/dep=29/gov=28", 151 | "punct/dep=30/gov=6" 152 | ], 153 | "words": [ 154 | "Earlier", 155 | "documents", 156 | "in", 157 | "the", 158 | "case", 159 | "have", 160 | "included", 161 | "embarrassing", 162 | "details", 163 | "about", 164 | "perks", 165 | "Welch", 166 | "received", 167 | "as", 168 | "part", 169 | "of", 170 | "his", 171 | "retirement", 172 | "package", 173 | "from", 174 | "GE", 175 | "at", 176 | "a", 177 | "time", 178 | "when", 179 | "corporate", 180 | "scandals", 181 | "were", 182 | "sparking", 183 | "outrage", 184 | "." 185 | ], 186 | "pos-tags": [ 187 | "JJR", 188 | "NNS", 189 | "IN", 190 | "DT", 191 | "NN", 192 | "VBP", 193 | "VBN", 194 | "JJ", 195 | "NNS", 196 | "IN", 197 | "NNS", 198 | "NNP", 199 | "VBD", 200 | "IN", 201 | "NN", 202 | "IN", 203 | "PRP$", 204 | "NN", 205 | "NN", 206 | "IN", 207 | "NNP", 208 | "IN", 209 | "DT", 210 | "NN", 211 | "WRB", 212 | "JJ", 213 | "NNS", 214 | "VBD", 215 | "VBG", 216 | "NN", 217 | "." 
218 | ], 219 | "lemma": [ 220 | "earlier", 221 | "document", 222 | "in", 223 | "the", 224 | "case", 225 | "have", 226 | "include", 227 | "embarrassing", 228 | "detail", 229 | "about", 230 | "perk", 231 | "Welch", 232 | "receive", 233 | "as", 234 | "part", 235 | "of", 236 | "he", 237 | "retirement", 238 | "package", 239 | "from", 240 | "GE", 241 | "at", 242 | "a", 243 | "time", 244 | "when", 245 | "corporate", 246 | "scandal", 247 | "be", 248 | "spark", 249 | "outrage", 250 | "." 251 | ], 252 | "parse": "(ROOT\n (S\n (NP\n (NP (JJR Earlier) (NNS documents))\n (PP (IN in)\n (NP (DT the) (NN case))))\n (VP (VBP have)\n (VP (VBN included)\n (NP (JJ embarrassing) (NNS details))\n (PP (IN about)\n (NP\n (NP (NNS perks))\n (SBAR\n (S\n (NP (NNP Welch))\n (VP (VBD received)\n (PP (IN as)\n (NP\n (NP (NN part))\n (PP (IN of)\n (NP (PRP$ his) (NN retirement) (NN package)))))\n (PP (IN from)\n (NP (NNP GE)))\n (PP (IN at)\n (NP\n (NP (DT a) (NN time))\n (SBAR\n (WHADVP (WRB when))\n (S\n (NP (JJ corporate) (NNS scandals))\n (VP (VBD were)\n (VP (VBG sparking)\n (NP (NN outrage)))))))))))))))\n (. .)))" 253 | } 254 | ] 255 | ``` 256 | 257 | 258 | ### Data Split 259 | 260 | The parsed data is divided into test/dev/train splits as follows. 261 | ``` 262 | ├── output 263 | │ └── test.json 264 | │ └── dev.json 265 | │ └── train.json 266 | │... 267 | ``` 268 | 269 | This project uses the same data partitioning as previous work ([Yang and Mitchell, 2016](https://www.cs.cmu.edu/~bishan/papers/joint_event_naacl16.pdf); [Nguyen et al., 2016](https://www.aclweb.org/anthology/N16-1034)). The split is specified in `data_list.csv`. 270 | 271 | The table below shows the amount of data parsed by this project. The counts differ slightly from the parsing results reported in the two papers above; the difference seems to arise because there is no agreed-upon rule for splitting sentences within the .sgm files.
272 | 273 | | | Documents | Sentences |Triggers | Arguments | Entity Mentions | 274 | |------- |--------------|--------------|------------|-----------|----------------- | 275 | | Test | 40 | 713 | 422 | 892 | 4226 | 276 | | Dev | 30 | 875 | 492 | 933 | 4050 | 277 | | Train | 529 | 14724 | 4312 | 7811 | 53045 | 278 | -------------------------------------------------------------------------------- /data_list.csv: -------------------------------------------------------------------------------- 1 | type,path test,nw/timex2norm/AFP_ENG_20030401.0476 test,nw/timex2norm/AFP_ENG_20030413.0098 test,nw/timex2norm/AFP_ENG_20030415.0734 test,nw/timex2norm/AFP_ENG_20030417.0004 test,nw/timex2norm/AFP_ENG_20030417.0307 test,nw/timex2norm/AFP_ENG_20030417.0764 test,nw/timex2norm/AFP_ENG_20030418.0556 test,nw/timex2norm/AFP_ENG_20030425.0408 test,nw/timex2norm/AFP_ENG_20030427.0118 test,nw/timex2norm/AFP_ENG_20030428.0720 test,nw/timex2norm/AFP_ENG_20030429.0007 test,nw/timex2norm/AFP_ENG_20030430.0075 test,nw/timex2norm/AFP_ENG_20030502.0614 test,nw/timex2norm/AFP_ENG_20030504.0248 test,nw/timex2norm/AFP_ENG_20030508.0118 test,nw/timex2norm/AFP_ENG_20030508.0357 test,nw/timex2norm/AFP_ENG_20030509.0345 test,nw/timex2norm/AFP_ENG_20030514.0706 test,nw/timex2norm/AFP_ENG_20030519.0049 test,nw/timex2norm/AFP_ENG_20030519.0372 test,nw/timex2norm/AFP_ENG_20030522.0878 test,nw/timex2norm/AFP_ENG_20030527.0616 test,nw/timex2norm/AFP_ENG_20030528.0561 test,nw/timex2norm/AFP_ENG_20030530.0132 test,nw/timex2norm/AFP_ENG_20030601.0262 test,nw/timex2norm/AFP_ENG_20030607.0030 test,nw/timex2norm/AFP_ENG_20030616.0715 test,nw/timex2norm/AFP_ENG_20030617.0846 test,nw/timex2norm/AFP_ENG_20030625.0057 test,nw/timex2norm/AFP_ENG_20030630.0271 test,nw/timex2norm/APW_ENG_20030304.0555 test,nw/timex2norm/APW_ENG_20030306.0191 test,nw/timex2norm/APW_ENG_20030308.0314 test,nw/timex2norm/APW_ENG_20030310.0719 test,nw/timex2norm/APW_ENG_20030311.0775 test,nw/timex2norm/APW_ENG_20030318.0689 test,nw/timex2norm/APW_ENG_20030319.0545 test,nw/timex2norm/APW_ENG_20030322.0119 test,nw/timex2norm/APW_ENG_20030324.0768 test,nw/timex2norm/APW_ENG_20030325.0786 dev,bc/timex2norm/CNN_CF_20030303.1900.02 dev,bc/timex2norm/CNN_IP_20030329.1600.00-2 dev,bc/timex2norm/CNN_IP_20030402.1600.00-1 dev,bc/timex2norm/CNN_IP_20030405.1600.01-1 dev,bc/timex2norm/CNN_IP_20030409.1600.02 dev,un/timex2norm/marcellapr_20050228.2219 dev,un/timex2norm/rec.games.chess.politics_20041216.1047 dev,un/timex2norm/rec.games.chess.politics_20041217.2111 dev,un/timex2norm/soc.org.nonprofit_20050218.1902 dev,wl/timex2norm/FLOPPINGACES_20050217.1237.014 dev,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041116.1347 dev,wl/timex2norm/FLOPPINGACES_20041117.2002.024 dev,wl/timex2norm/FLOPPINGACES_20050203.1953.038 dev,wl/timex2norm/TTRACY_20050223.1049 dev,bn/timex2norm/CNNHL_ENG_20030304_142751.10 dev,bn/timex2norm/CNNHL_ENG_20030424_123502.25 dev,bn/timex2norm/CNNHL_ENG_20030513_220910.32 dev,bn/timex2norm/CNN_ENG_20030304_173120.16 dev,bn/timex2norm/CNN_ENG_20030328_150609.10 dev,bn/timex2norm/CNN_ENG_20030424_070008.15 dev,bn/timex2norm/CNN_ENG_20030512_170454.13 dev,bn/timex2norm/CNN_ENG_20030620_085840.7 dev,nw/timex2norm/AFP_ENG_20030304.0250 dev,nw/timex2norm/AFP_ENG_20030305.0918 dev,nw/timex2norm/AFP_ENG_20030311.0491 dev,nw/timex2norm/AFP_ENG_20030314.0238 dev,nw/timex2norm/AFP_ENG_20030319.0879 dev,nw/timex2norm/AFP_ENG_20030320.0722 dev,nw/timex2norm/AFP_ENG_20030327.0022 dev,nw/timex2norm/AFP_ENG_20030327.0224 train,bc/timex2norm/CNN_CF_20030303.1900.00 
train,bc/timex2norm/CNN_CF_20030303.1900.05 train,bc/timex2norm/CNN_CF_20030303.1900.06-1 train,bc/timex2norm/CNN_CF_20030303.1900.06-2 train,bc/timex2norm/CNN_CF_20030304.1900.02 train,bc/timex2norm/CNN_CF_20030304.1900.04 train,bc/timex2norm/CNN_CF_20030304.1900.06-2 train,bc/timex2norm/CNN_CF_20030305.1900.00-1 train,bc/timex2norm/CNN_CF_20030305.1900.00-2 train,bc/timex2norm/CNN_CF_20030305.1900.00-3 train,bc/timex2norm/CNN_CF_20030305.1900.02 train,bc/timex2norm/CNN_CF_20030305.1900.06-1 train,bc/timex2norm/CNN_CF_20030305.1900.06-2 train,bc/timex2norm/CNN_IP_20030328.1600.07 train,bc/timex2norm/CNN_IP_20030329.1600.00-3 train,bc/timex2norm/CNN_IP_20030329.1600.00-4 train,bc/timex2norm/CNN_IP_20030329.1600.00-5 train,bc/timex2norm/CNN_IP_20030329.1600.00-6 train,bc/timex2norm/CNN_IP_20030329.1600.01-1 train,bc/timex2norm/CNN_IP_20030329.1600.01-3 train,bc/timex2norm/CNN_IP_20030329.1600.02 train,bc/timex2norm/CNN_IP_20030330.1600.05-2 train,bc/timex2norm/CNN_IP_20030330.1600.06 train,bc/timex2norm/CNN_IP_20030402.1600.00-2 train,bc/timex2norm/CNN_IP_20030402.1600.00-3 train,bc/timex2norm/CNN_IP_20030402.1600.00-4 train,bc/timex2norm/CNN_IP_20030402.1600.02-1 train,bc/timex2norm/CNN_IP_20030402.1600.02-2 train,bc/timex2norm/CNN_IP_20030403.1600.00-1 train,bc/timex2norm/CNN_IP_20030403.1600.00-2 train,bc/timex2norm/CNN_IP_20030403.1600.00-3 train,bc/timex2norm/CNN_IP_20030403.1600.00-4 train,bc/timex2norm/CNN_IP_20030404.1600.00-1 train,bc/timex2norm/CNN_IP_20030404.1600.00-2 train,bc/timex2norm/CNN_IP_20030405.1600.00-2 train,bc/timex2norm/CNN_IP_20030405.1600.00-3 train,bc/timex2norm/CNN_IP_20030405.1600.01-2 train,bc/timex2norm/CNN_IP_20030405.1600.01-3 train,bc/timex2norm/CNN_IP_20030405.1600.02 train,bc/timex2norm/CNN_IP_20030406.1600.03 train,bc/timex2norm/CNN_IP_20030407.1600.05 train,bc/timex2norm/CNN_IP_20030408.1600.03 train,bc/timex2norm/CNN_IP_20030408.1600.04 train,bc/timex2norm/CNN_IP_20030409.1600.04 train,bc/timex2norm/CNN_IP_20030410.1600.03-1 train,bc/timex2norm/CNN_IP_20030410.1600.03-2 train,bc/timex2norm/CNN_IP_20030412.1600.03 train,bc/timex2norm/CNN_IP_20030412.1600.05 train,bc/timex2norm/CNN_IP_20030414.1600.04 train,bc/timex2norm/CNN_IP_20030417.1600.06 train,bc/timex2norm/CNN_IP_20030422.1600.05 train,bc/timex2norm/CNN_LE_20030504.1200.01 train,bc/timex2norm/CNN_LE_20030504.1200.02-1 train,bc/timex2norm/CNN_LE_20030504.1200.02-2 train,bn/timex2norm/CNNHL_ENG_20030312_150218.13 train,bn/timex2norm/CNNHL_ENG_20030331_193419.9 train,bn/timex2norm/CNNHL_ENG_20030402_133449.22 train,bn/timex2norm/CNNHL_ENG_20030402_193443.5 train,bn/timex2norm/CNNHL_ENG_20030403_133453.21 train,bn/timex2norm/CNNHL_ENG_20030403_193455.30 train,bn/timex2norm/CNNHL_ENG_20030407_193547.5 train,bn/timex2norm/CNNHL_ENG_20030411_230640.38 train,bn/timex2norm/CNNHL_ENG_20030415_193729.5 train,bn/timex2norm/CNNHL_ENG_20030416_133739.13 train,bn/timex2norm/CNNHL_ENG_20030416_133739.9 train,bn/timex2norm/CNNHL_ENG_20030416_193742.26 train,bn/timex2norm/CNNHL_ENG_20030416_193742.7 train,bn/timex2norm/CNNHL_ENG_20030416_230741.33 train,bn/timex2norm/CNNHL_ENG_20030425_183518.12 train,bn/timex2norm/CNNHL_ENG_20030428_123600.14 train,bn/timex2norm/CNNHL_ENG_20030429_220618.15 train,bn/timex2norm/CNNHL_ENG_20030430_220712.37 train,bn/timex2norm/CNNHL_ENG_20030505_220734.25 train,bn/timex2norm/CNNHL_ENG_20030513_183907.5 train,bn/timex2norm/CNNHL_ENG_20030513_220910.11 train,bn/timex2norm/CNNHL_ENG_20030519_124020.23 train,bn/timex2norm/CNNHL_ENG_20030523_221118.14 
train,bn/timex2norm/CNNHL_ENG_20030526_221156.39 train,bn/timex2norm/CNNHL_ENG_20030603_230307.3 train,bn/timex2norm/CNNHL_ENG_20030604_230238.5 train,bn/timex2norm/CNNHL_ENG_20030609_133335.37 train,bn/timex2norm/CNNHL_ENG_20030610_133347.6 train,bn/timex2norm/CNNHL_ENG_20030610_230438.14 train,bn/timex2norm/CNNHL_ENG_20030611_133445.24 train,bn/timex2norm/CNNHL_ENG_20030616_230155.28 train,bn/timex2norm/CNNHL_ENG_20030616_230155.7 train,bn/timex2norm/CNNHL_ENG_20030618_230303.36 train,bn/timex2norm/CNNHL_ENG_20030618_230303.6 train,bn/timex2norm/CNNHL_ENG_20030624_133331.33 train,bn/timex2norm/CNNHL_ENG_20030624_230338.34 train,bn/timex2norm/CNNHL_ENG_20030625_193346.7 train,bn/timex2norm/CNNHL_ENG_20030625_230351.4 train,bn/timex2norm/CNN_ENG_20030305_170125.1 train,bn/timex2norm/CNN_ENG_20030306_070606.18 train,bn/timex2norm/CNN_ENG_20030306_083604.6 train,bn/timex2norm/CNN_ENG_20030312_083725.3 train,bn/timex2norm/CNN_ENG_20030312_223733.14 train,bn/timex2norm/CNN_ENG_20030313_083739.0 train,bn/timex2norm/CNN_ENG_20030318_140851.8 train,bn/timex2norm/CNN_ENG_20030320_153434.7 train,bn/timex2norm/CNN_ENG_20030325_150531.10 train,bn/timex2norm/CNN_ENG_20030325_220534.6 train,bn/timex2norm/CNN_ENG_20030327_163556.20 train,bn/timex2norm/CNN_ENG_20030329_170349.7 train,bn/timex2norm/CNN_ENG_20030331_123648.4 train,bn/timex2norm/CNN_ENG_20030331_193655.14 train,bn/timex2norm/CNN_ENG_20030401_073033.14 train,bn/timex2norm/CNN_ENG_20030401_233449.5 train,bn/timex2norm/CNN_ENG_20030402_190500.11 train,bn/timex2norm/CNN_ENG_20030403_060032.0 train,bn/timex2norm/CNN_ENG_20030403_080032.9 train,bn/timex2norm/CNN_ENG_20030403_090032.1 train,bn/timex2norm/CNN_ENG_20030403_180511.16 train,bn/timex2norm/CNN_ENG_20030403_183513.1 train,bn/timex2norm/CNN_ENG_20030404_073033.4 train,bn/timex2norm/CNN_ENG_20030404_163526.10 train,bn/timex2norm/CNN_ENG_20030407_080037.12 train,bn/timex2norm/CNN_ENG_20030407_130604.10 train,bn/timex2norm/CNN_ENG_20030407_170605.7 train,bn/timex2norm/CNN_ENG_20030408_083034.11 train,bn/timex2norm/CNN_ENG_20030408_123613.0 train,bn/timex2norm/CNN_ENG_20030408_153616.9 train,bn/timex2norm/CNN_ENG_20030408_200618.14 train,bn/timex2norm/CNN_ENG_20030409_180633.8 train,bn/timex2norm/CNN_ENG_20030410_183644.8 train,bn/timex2norm/CNN_ENG_20030411_193701.3 train,bn/timex2norm/CNN_ENG_20030411_233701.11 train,bn/timex2norm/CNN_ENG_20030414_130735.7 train,bn/timex2norm/CNN_ENG_20030415_103039.0 train,bn/timex2norm/CNN_ENG_20030415_173752.0 train,bn/timex2norm/CNN_ENG_20030415_180754.5 train,bn/timex2norm/CNN_ENG_20030415_183752.14 train,bn/timex2norm/CNN_ENG_20030416_100042.7 train,bn/timex2norm/CNN_ENG_20030416_160804.4 train,bn/timex2norm/CNN_ENG_20030416_180808.15 train,bn/timex2norm/CNN_ENG_20030416_190806.4 train,bn/timex2norm/CNN_ENG_20030417_063039.0 train,bn/timex2norm/CNN_ENG_20030417_073039.2 train,bn/timex2norm/CNN_ENG_20030418_063040.1 train,bn/timex2norm/CNN_ENG_20030418_083040.11 train,bn/timex2norm/CNN_ENG_20030418_130831.5 train,bn/timex2norm/CNN_ENG_20030418_163834.14 train,bn/timex2norm/CNN_ENG_20030421_090007.11 train,bn/timex2norm/CNN_ENG_20030421_120508.13 train,bn/timex2norm/CNN_ENG_20030421_120508.17 train,bn/timex2norm/CNN_ENG_20030421_133510.6 train,bn/timex2norm/CNN_ENG_20030422_083005.10 train,bn/timex2norm/CNN_ENG_20030422_213527.4 train,bn/timex2norm/CNN_ENG_20030423_180539.2 train,bn/timex2norm/CNN_ENG_20030424_073006.4 train,bn/timex2norm/CNN_ENG_20030424_113549.11 train,bn/timex2norm/CNN_ENG_20030424_173553.8 
train,bn/timex2norm/CNN_ENG_20030424_183556.7 train,bn/timex2norm/CNN_ENG_20030425_063006.5 train,bn/timex2norm/CNN_ENG_20030425_133605.6 train,bn/timex2norm/CNN_ENG_20030426_160621.0 train,bn/timex2norm/CNN_ENG_20030428_130651.4 train,bn/timex2norm/CNN_ENG_20030428_173654.13 train,bn/timex2norm/CNN_ENG_20030428_193655.2 train,bn/timex2norm/CNN_ENG_20030429_083016.5 train,bn/timex2norm/CNN_ENG_20030429_110706.7 train,bn/timex2norm/CNN_ENG_20030429_143706.14 train,bn/timex2norm/CNN_ENG_20030429_170710.4 train,bn/timex2norm/CNN_ENG_20030429_190711.14 train,bn/timex2norm/CNN_ENG_20030430_063016.14 train,bn/timex2norm/CNN_ENG_20030430_093016.0 train,bn/timex2norm/CNN_ENG_20030430_160723.6 train,bn/timex2norm/CNN_ENG_20030501_063017.15 train,bn/timex2norm/CNN_ENG_20030501_160459.0 train,bn/timex2norm/CNN_ENG_20030502_080020.7 train,bn/timex2norm/CNN_ENG_20030502_093018.6 train,bn/timex2norm/CNN_ENG_20030505_090022.1 train,bn/timex2norm/CNN_ENG_20030506_053020.14 train,bn/timex2norm/CNN_ENG_20030506_160524.18 train,bn/timex2norm/CNN_ENG_20030506_163523.22 train,bn/timex2norm/CNN_ENG_20030507_060023.1 train,bn/timex2norm/CNN_ENG_20030507_160538.15 train,bn/timex2norm/CNN_ENG_20030507_170539.0 train,bn/timex2norm/CNN_ENG_20030508_170552.18 train,bn/timex2norm/CNN_ENG_20030508_210555.5 train,bn/timex2norm/CNN_ENG_20030509_090025.5 train,bn/timex2norm/CNN_ENG_20030509_123601.13 train,bn/timex2norm/CNN_ENG_20030512_190454.7 train,bn/timex2norm/CNN_ENG_20030513_080020.2 train,bn/timex2norm/CNN_ENG_20030513_113501.6 train,bn/timex2norm/CNN_ENG_20030513_160506.16 train,bn/timex2norm/CNN_ENG_20030514_130518.5 train,bn/timex2norm/CNN_ENG_20030515_063019.6 train,bn/timex2norm/CNN_ENG_20030515_073019.7 train,bn/timex2norm/CNN_ENG_20030515_193533.6 train,bn/timex2norm/CNN_ENG_20030516_090022.7 train,bn/timex2norm/CNN_ENG_20030516_123543.8 train,bn/timex2norm/CNN_ENG_20030524_143511.4 train,bn/timex2norm/CNN_ENG_20030525_143522.8 train,bn/timex2norm/CNN_ENG_20030525_160525.13 train,bn/timex2norm/CNN_ENG_20030526_133535.4 train,bn/timex2norm/CNN_ENG_20030526_180540.6 train,bn/timex2norm/CNN_ENG_20030526_183538.3 train,bn/timex2norm/CNN_ENG_20030527_195948.3 train,bn/timex2norm/CNN_ENG_20030527_215946.12 train,bn/timex2norm/CNN_ENG_20030528_082823.9 train,bn/timex2norm/CNN_ENG_20030528_125956.8 train,bn/timex2norm/CNN_ENG_20030528_165958.16 train,bn/timex2norm/CNN_ENG_20030528_172957.18 train,bn/timex2norm/CNN_ENG_20030528_195959.20 train,bn/timex2norm/CNN_ENG_20030529_085826.10 train,bn/timex2norm/CNN_ENG_20030529_130011.6 train,bn/timex2norm/CNN_ENG_20030530_130025.12 train,bn/timex2norm/CNN_ENG_20030602_072826.1 train,bn/timex2norm/CNN_ENG_20030602_102826.13 train,bn/timex2norm/CNN_ENG_20030602_105829.2 train,bn/timex2norm/CNN_ENG_20030602_133012.9 train,bn/timex2norm/CNN_ENG_20030603_095830.17 train,bn/timex2norm/CNN_ENG_20030603_133025.7 train,bn/timex2norm/CNN_ENG_20030604_092828.7 train,bn/timex2norm/CNN_ENG_20030604_102828.6 train,bn/timex2norm/CNN_ENG_20030605_065831.18 train,bn/timex2norm/CNN_ENG_20030605_085831.13 train,bn/timex2norm/CNN_ENG_20030605_105831.11 train,bn/timex2norm/CNN_ENG_20030605_193002.8 train,bn/timex2norm/CNN_ENG_20030605_223004.4 train,bn/timex2norm/CNN_ENG_20030607_170312.6 train,bn/timex2norm/CNN_ENG_20030607_173310.4 train,bn/timex2norm/CNN_ENG_20030610_085833.10 train,bn/timex2norm/CNN_ENG_20030610_095857.4 train,bn/timex2norm/CNN_ENG_20030610_105832.1 train,bn/timex2norm/CNN_ENG_20030610_123040.9 train,bn/timex2norm/CNN_ENG_20030610_130042.17 
train,bn/timex2norm/CNN_ENG_20030610_133041.17 train,bn/timex2norm/CNN_ENG_20030611_102832.3 train,bn/timex2norm/CNN_ENG_20030611_102832.4 train,bn/timex2norm/CNN_ENG_20030611_175950.5 train,bn/timex2norm/CNN_ENG_20030612_072835.2 train,bn/timex2norm/CNN_ENG_20030612_160005.13 train,bn/timex2norm/CNN_ENG_20030612_173004.10 train,bn/timex2norm/CNN_ENG_20030612_173004.2 train,bn/timex2norm/CNN_ENG_20030614_173123.4 train,bn/timex2norm/CNN_ENG_20030616_130059.25 train,bn/timex2norm/CNN_ENG_20030617_065838.21 train,bn/timex2norm/CNN_ENG_20030617_105836.4 train,bn/timex2norm/CNN_ENG_20030617_112838.4 train,bn/timex2norm/CNN_ENG_20030617_173115.14 train,bn/timex2norm/CNN_ENG_20030617_173115.22 train,bn/timex2norm/CNN_ENG_20030617_193116.10 train,bn/timex2norm/CNN_ENG_20030618_065839.11 train,bn/timex2norm/CNN_ENG_20030618_150128.5 train,bn/timex2norm/CNN_ENG_20030618_150128.6 train,bn/timex2norm/CNN_ENG_20030618_193127.17 train,bn/timex2norm/CNN_ENG_20030619_115954.10 train,bn/timex2norm/CNN_ENG_20030619_115954.4 train,bn/timex2norm/CNN_ENG_20030619_125955.10 train,bn/timex2norm/CNN_ENG_20030620_095840.4 train,bn/timex2norm/CNN_ENG_20030620_170011.14 train,bn/timex2norm/CNN_ENG_20030621_115841.16 train,bn/timex2norm/CNN_ENG_20030621_160254.25 train,bn/timex2norm/CNN_ENG_20030622_173306.9 train,bn/timex2norm/CNN_ENG_20030624_065843.24 train,bn/timex2norm/CNN_ENG_20030624_082841.12 train,bn/timex2norm/CNN_ENG_20030624_140104.22 train,bn/timex2norm/CNN_ENG_20030624_153103.16 train,bn/timex2norm/CNN_ENG_20030624_153103.17 train,bn/timex2norm/CNN_ENG_20030625_210122.0 train,bn/timex2norm/CNN_ENG_20030625_220123.3 train,bn/timex2norm/CNN_ENG_20030626_193133.8 train,bn/timex2norm/CNN_ENG_20030627_065846.3 train,bn/timex2norm/CNN_ENG_20030627_130145.6 train,bn/timex2norm/CNN_ENG_20030630_075848.7 train,bn/timex2norm/CNN_ENG_20030630_085848.18 train,cts/timex2norm/fsh_29097 train,cts/timex2norm/fsh_29105 train,cts/timex2norm/fsh_29121 train,cts/timex2norm/fsh_29138 train,cts/timex2norm/fsh_29139 train,cts/timex2norm/fsh_29141 train,cts/timex2norm/fsh_29171 train,cts/timex2norm/fsh_29187 train,cts/timex2norm/fsh_29191 train,cts/timex2norm/fsh_29192 train,cts/timex2norm/fsh_29195 train,cts/timex2norm/fsh_29226 train,cts/timex2norm/fsh_29272 train,cts/timex2norm/fsh_29302 train,cts/timex2norm/fsh_29303 train,cts/timex2norm/fsh_29326 train,cts/timex2norm/fsh_29336 train,cts/timex2norm/fsh_29344 train,cts/timex2norm/fsh_29348 train,cts/timex2norm/fsh_29350 train,cts/timex2norm/fsh_29361 train,cts/timex2norm/fsh_29388 train,cts/timex2norm/fsh_29395 train,cts/timex2norm/fsh_29505 train,cts/timex2norm/fsh_29520 train,cts/timex2norm/fsh_29521 train,cts/timex2norm/fsh_29526 train,cts/timex2norm/fsh_29581_1 train,cts/timex2norm/fsh_29586 train,cts/timex2norm/fsh_29592 train,cts/timex2norm/fsh_29601 train,cts/timex2norm/fsh_29622 train,cts/timex2norm/fsh_29628 train,cts/timex2norm/fsh_29630 train,cts/timex2norm/fsh_29770 train,cts/timex2norm/fsh_29774 train,cts/timex2norm/fsh_29782_2 train,cts/timex2norm/fsh_29783 train,cts/timex2norm/fsh_29786 train,nw/timex2norm/APW_ENG_20030326.0190 train,nw/timex2norm/APW_ENG_20030327.0376 train,nw/timex2norm/APW_ENG_20030331.0410 train,nw/timex2norm/APW_ENG_20030403.0862 train,nw/timex2norm/APW_ENG_20030404.0439 train,nw/timex2norm/APW_ENG_20030406.0191 train,nw/timex2norm/APW_ENG_20030407.0030 train,nw/timex2norm/APW_ENG_20030408.0090 train,nw/timex2norm/APW_ENG_20030409.0013 train,nw/timex2norm/APW_ENG_20030410.0906 train,nw/timex2norm/APW_ENG_20030411.0304 
train,nw/timex2norm/APW_ENG_20030412.0531 train,nw/timex2norm/APW_ENG_20030414.0392 train,nw/timex2norm/APW_ENG_20030415.0742 train,nw/timex2norm/APW_ENG_20030416.0581 train,nw/timex2norm/APW_ENG_20030417.0555 train,nw/timex2norm/APW_ENG_20030418.0084 train,nw/timex2norm/APW_ENG_20030419.0358 train,nw/timex2norm/APW_ENG_20030422.0469 train,nw/timex2norm/APW_ENG_20030422.0485 train,nw/timex2norm/APW_ENG_20030423.0079 train,nw/timex2norm/APW_ENG_20030424.0532 train,nw/timex2norm/APW_ENG_20030424.0698 train,nw/timex2norm/APW_ENG_20030502.0470 train,nw/timex2norm/APW_ENG_20030502.0686 train,nw/timex2norm/APW_ENG_20030508.0772 train,nw/timex2norm/APW_ENG_20030510.0228 train,nw/timex2norm/APW_ENG_20030513.0139 train,nw/timex2norm/APW_ENG_20030519.0367 train,nw/timex2norm/APW_ENG_20030519.0548 train,nw/timex2norm/APW_ENG_20030520.0081 train,nw/timex2norm/APW_ENG_20030520.0757 train,nw/timex2norm/APW_ENG_20030527.0232 train,nw/timex2norm/APW_ENG_20030602.0037 train,nw/timex2norm/APW_ENG_20030603.0303 train,nw/timex2norm/APW_ENG_20030610.0010 train,nw/timex2norm/APW_ENG_20030610.0554 train,nw/timex2norm/APW_ENG_20030619.0383 train,nw/timex2norm/NYT_ENG_20030403.0008 train,nw/timex2norm/NYT_ENG_20030602.0074 train,nw/timex2norm/NYT_ENG_20030630.0079 train,nw/timex2norm/XIN_ENG_20030314.0208 train,nw/timex2norm/XIN_ENG_20030317.0177 train,nw/timex2norm/XIN_ENG_20030324.0191 train,nw/timex2norm/XIN_ENG_20030327.0202 train,nw/timex2norm/XIN_ENG_20030408.0341 train,nw/timex2norm/XIN_ENG_20030415.0379 train,nw/timex2norm/XIN_ENG_20030423.0011 train,nw/timex2norm/XIN_ENG_20030425.0184 train,nw/timex2norm/XIN_ENG_20030509.0137 train,nw/timex2norm/XIN_ENG_20030513.0002 train,nw/timex2norm/XIN_ENG_20030523.0202 train,nw/timex2norm/XIN_ENG_20030609.0118 train,nw/timex2norm/XIN_ENG_20030610.0299 train,nw/timex2norm/XIN_ENG_20030616.0274 train,nw/timex2norm/XIN_ENG_20030624.0085 train,un/timex2norm/Austin-Grad-Community_20050212.2454 train,un/timex2norm/Integritas-Group-Community-Forum_20050110.0557 train,un/timex2norm/alt.atheism_20041104.2428 train,un/timex2norm/alt.books.tom-clancy_20050130.1848 train,un/timex2norm/alt.collecting.autographs_20050224.2438 train,un/timex2norm/alt.corel_20041228.0503 train,un/timex2norm/alt.gossip.celebrities_20041118.2331 train,un/timex2norm/alt.gossip.celebrities_20050218.0826 train,un/timex2norm/alt.obituaries_20041121.1339 train,un/timex2norm/alt.politics.economics_20041206.1835 train,un/timex2norm/alt.politics_20050124.0640 train,un/timex2norm/alt.religion.mormon_20050103.0854 train,un/timex2norm/alt.support.divorce_20050113.2451 train,un/timex2norm/alt.sys.pc-clone.dell_20050226.2350 train,un/timex2norm/alt.vacation.las-vegas_20050109.0133 train,un/timex2norm/aus.cars_20041206.0903 train,un/timex2norm/misc.invest.marketplace_20050208.2406 train,un/timex2norm/misc.kids.pregnancy_20050120.0404 train,un/timex2norm/misc.legal.moderated_20041202.1648 train,un/timex2norm/misc.legal.moderated_20050129.2225 train,un/timex2norm/misc.survivalism_20050210.0232 train,un/timex2norm/misc.taxes_20050218.1250 train,un/timex2norm/rec.arts.mystery_20050219.1126 train,un/timex2norm/rec.arts.sf.written.robert-jordan_20050208.1350 train,un/timex2norm/rec.boats_20050130.1006 train,un/timex2norm/rec.music.makers.guitar.acoustic_20041228.1628 train,un/timex2norm/rec.music.phish_20041215.1554 train,un/timex2norm/rec.music.phish_20050217.1804 train,un/timex2norm/rec.parks.theme_20050217.2019 train,un/timex2norm/rec.sport.disc_20050209.2202 train,un/timex2norm/rec.travel.cruises_20050216.1636 
train,un/timex2norm/rec.travel.cruises_20050222.0313 train,un/timex2norm/rec.travel.europe_20050101.1800 train,un/timex2norm/rec.travel.usa-canada_20050128.0121 train,un/timex2norm/seattle.politics_20050122.2412 train,un/timex2norm/soc.culture.china_20050203.0639 train,un/timex2norm/soc.culture.hmong_20050210.1130 train,un/timex2norm/soc.culture.indian_20041104.2348 train,un/timex2norm/soc.culture.iraq_20050211.0445 train,un/timex2norm/soc.culture.jewish_20050130.2105 train,un/timex2norm/soc.history.war.world-war-ii_20050127.2403 train,un/timex2norm/soc.history.what-if_20050129.1404 train,un/timex2norm/talk.politics.misc_20050216.1337 train,un/timex2norm/uk.gay-lesbian-bi_20050127.0311 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1144 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041101.1806 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041201.2313 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041203.1959 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041208.2133 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041215.2302 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041218.0146 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041218.1004 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041223.1449 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20041226.1712 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050105.1344 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050106.1310 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050107.2012 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050109.1627 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050113.1400 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050114.1922 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050116.2149 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050124.1354 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050125.0136 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050203.1356 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050205.1954 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050208.1142 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050213.2123 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050224.1207 train,wl/timex2norm/AGGRESSIVEVOICEDAILY_20050224.2252 train,wl/timex2norm/BACONSREBELLION_20050123.1639 train,wl/timex2norm/BACONSREBELLION_20050125.1108 train,wl/timex2norm/BACONSREBELLION_20050127.1017 train,wl/timex2norm/BACONSREBELLION_20050204.1326 train,wl/timex2norm/BACONSREBELLION_20050205.1919 train,wl/timex2norm/BACONSREBELLION_20050206.1345 train,wl/timex2norm/BACONSREBELLION_20050209.0721 train,wl/timex2norm/BACONSREBELLION_20050210.0728 train,wl/timex2norm/BACONSREBELLION_20050214.0944 train,wl/timex2norm/BACONSREBELLION_20050216.1536 train,wl/timex2norm/BACONSREBELLION_20050216.1618 train,wl/timex2norm/BACONSREBELLION_20050216.1632 train,wl/timex2norm/BACONSREBELLION_20050217.0744 train,wl/timex2norm/BACONSREBELLION_20050218.0848 train,wl/timex2norm/BACONSREBELLION_20050218.1214 train,wl/timex2norm/BACONSREBELLION_20050222.1348 train,wl/timex2norm/BACONSREBELLION_20050227.1238 train,wl/timex2norm/FLOPPINGACES_20041113.1528.042 train,wl/timex2norm/FLOPPINGACES_20041114.1240.039 train,wl/timex2norm/FLOPPINGACES_20041115.1613.032 train,wl/timex2norm/FLOPPINGACES_20041116.0833.027 train,wl/timex2norm/FLOPPINGACES_20041228.0927.010 train,wl/timex2norm/FLOPPINGACES_20041230.1844.003 train,wl/timex2norm/FLOPPINGACES_20050101.2244.048 train,wl/timex2norm/GETTINGPOLITICAL_20050105.0127.001 train,wl/timex2norm/HEALINGIRAQ_20041108.1942.05 train,wl/timex2norm/MARKBACKER_20041103.1300 train,wl/timex2norm/MARKBACKER_20041108.1507 train,wl/timex2norm/MARKBACKER_20041112.0707 
train,wl/timex2norm/MARKBACKER_20041117.0723 train,wl/timex2norm/MARKBACKER_20041117.1107 train,wl/timex2norm/MARKBACKER_20041119.1002 train,wl/timex2norm/MARKBACKER_20041128.1641 train,wl/timex2norm/MARKBACKER_20041202.0711 train,wl/timex2norm/MARKBACKER_20041206.0733 train,wl/timex2norm/MARKBACKER_20041216.0656 train,wl/timex2norm/MARKBACKER_20041217.1639 train,wl/timex2norm/MARKBACKER_20041220.0919 train,wl/timex2norm/MARKBACKER_20050103.0829 train,wl/timex2norm/MARKBACKER_20050105.1526 train,wl/timex2norm/MARKBACKER_20050105.1632 train,wl/timex2norm/MARKBACKER_20050217.0647 train,wl/timex2norm/MARKETVIEW_20041209.1401 train,wl/timex2norm/MARKETVIEW_20041211.1845 train,wl/timex2norm/MARKETVIEW_20041212.1447 train,wl/timex2norm/MARKETVIEW_20041213.0722 train,wl/timex2norm/MARKETVIEW_20041215.2128 train,wl/timex2norm/MARKETVIEW_20041217.0801 train,wl/timex2norm/MARKETVIEW_20041219.1509 train,wl/timex2norm/MARKETVIEW_20041220.1537 train,wl/timex2norm/MARKETVIEW_20050105.1901 train,wl/timex2norm/MARKETVIEW_20050120.1641 train,wl/timex2norm/MARKETVIEW_20050126.0711 train,wl/timex2norm/MARKETVIEW_20050127.0716 train,wl/timex2norm/MARKETVIEW_20050201.0748 train,wl/timex2norm/MARKETVIEW_20050204.1322 train,wl/timex2norm/MARKETVIEW_20050204.1337 train,wl/timex2norm/MARKETVIEW_20050204.1736 train,wl/timex2norm/MARKETVIEW_20050205.1358 train,wl/timex2norm/MARKETVIEW_20050206.1951 train,wl/timex2norm/MARKETVIEW_20050206.2009 train,wl/timex2norm/MARKETVIEW_20050207.0746 train,wl/timex2norm/MARKETVIEW_20050208.2033 train,wl/timex2norm/MARKETVIEW_20050208.2059 train,wl/timex2norm/MARKETVIEW_20050209.1923 train,wl/timex2norm/MARKETVIEW_20050210.2138 train,wl/timex2norm/MARKETVIEW_20050212.1607 train,wl/timex2norm/MARKETVIEW_20050212.1717 train,wl/timex2norm/MARKETVIEW_20050214.2115 train,wl/timex2norm/MARKETVIEW_20050215.1858 train,wl/timex2norm/MARKETVIEW_20050216.2120 train,wl/timex2norm/MARKETVIEW_20050217.2115 train,wl/timex2norm/MARKETVIEW_20050222.0729 train,wl/timex2norm/MARKETVIEW_20050222.1919 train,wl/timex2norm/MARKETVIEW_20050225.0541 train,wl/timex2norm/MARKETVIEW_20050226.1307 train,wl/timex2norm/MARKETVIEW_20050226.1444 train,wl/timex2norm/MARKETVIEW_20050228.2211 train,wl/timex2norm/OIADVANTAGE_20041224.1007 train,wl/timex2norm/OIADVANTAGE_20050103.0944 train,wl/timex2norm/OIADVANTAGE_20050105.0922 train,wl/timex2norm/OIADVANTAGE_20050108.1323 train,wl/timex2norm/OIADVANTAGE_20050109.1947 train,wl/timex2norm/OIADVANTAGE_20050110.1009 train,wl/timex2norm/OIADVANTAGE_20050203.1000 train,wl/timex2norm/OIADVANTAGE_20050203.2102 train,wl/timex2norm/OIADVANTAGE_20050204.1155 train,bc/timex2norm/CNN_CF_20030304.1900.01 train,un/timex2norm/marcellapr_20050211.2013 train,wl/timex2norm/BACONSREBELLION_20050222.0817 train,wl/timex2norm/BACONSREBELLION_20050226.1317 train,bn/timex2norm/CNN_ENG_20030626_203133.11 train,bn/timex2norm/CNN_ENG_20030605_153000.9 train,bn/timex2norm/CNN_ENG_20030411_070039.21 train,bn/timex2norm/CNNHL_ENG_20030410_193626.13 train,nw/timex2norm/AFP_ENG_20030330.0211 train,nw/timex2norm/AFP_ENG_20030323.0020 -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import re 4 | from parser import Parser 5 | import json 6 | from stanfordcorenlp import StanfordCoreNLP 7 | import argparse 8 | from tqdm import tqdm 9 | import traceback 10 | 11 | 12 | def get_data_paths(ace2005_path): 13 | test_files, dev_files, train_files = [], 
[], [] 14 | with open('./data_list.csv', mode='r') as csv_file: 15 | rows = csv_file.readlines() 16 | for row in rows[1:]: 17 | items = row.replace('\n', '').split(',') 18 | data_type = items[0] 19 | name = items[1] 20 | 21 | path = os.path.join(ace2005_path, name) 22 | if data_type == 'test': 23 | test_files.append(path) 24 | elif data_type == 'dev': 25 | dev_files.append(path) 26 | elif data_type == 'train': 27 | train_files.append(path) 28 | return test_files, dev_files, train_files 29 | 30 | 31 | def find_token_index(tokens, start_pos, end_pos, phrase): 32 | start_idx, end_idx = -1, -1 33 | for idx, token in enumerate(tokens): 34 | if token['characterOffsetBegin'] <= start_pos: 35 | start_idx = idx 36 | 37 | assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens) 38 | chars = '' 39 | 40 | def remove_punc(s): 41 | s = re.sub(r'[^\w]', '', s) 42 | return s 43 | 44 | for i in range(0, len(tokens) - start_idx): 45 | chars += remove_punc(tokens[start_idx + i]['originalText']) 46 | if remove_punc(phrase) in chars: 47 | end_idx = start_idx + i + 1 48 | break 49 | 50 | assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars:{}".format(end_idx, end_pos, phrase, tokens, chars) 51 | return start_idx, end_idx 52 | 53 | 54 | def verify_result(data): 55 | def remove_punctuation(s): 56 | for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']: 57 | s = s.replace(c, '') 58 | s = re.sub(r'[^\w]', '', s) 59 | return s 60 | 61 | def check_diff(words, phrase): 62 | return remove_punctuation(phrase) not in remove_punctuation(words) 63 | 64 | for item in data: 65 | words = item['words'] 66 | for entity_mention in item['golden-entity-mentions']: 67 | if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')): 68 | print('============================') 69 | print('[Warning] entity has invalid start/end') 70 | print('Expected: ', entity_mention['text']) 71 | print('Actual:', words[entity_mention['start']:entity_mention['end']]) 72 | print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words)) 73 | 74 | for event_mention in item['golden-event-mentions']: 75 | trigger = event_mention['trigger'] 76 | if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')): 77 | print('============================') 78 | print('[Warning] trigger has invalid start/end') 79 | print('Expected: ', trigger['text']) 80 | print('Actual:', words[trigger['start']:trigger['end']]) 81 | print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words)) 82 | for argument in event_mention['arguments']: 83 | if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')): 84 | print('============================') 85 | print('[Warning] argument has invalid start/end') 86 | print('Expected: ', argument['text']) 87 | print('Actual:', words[argument['start']:argument['end']]) 88 | print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words)) 89 | 90 | print('Complete verification') 91 | 92 | 93 | def preprocessing(data_type, files): 94 | result = [] 95 | event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0 96 | 97 | print('=' * 20) 98 | print('[preprocessing] type: ', data_type) 99 | for file in tqdm(files): 100 | parser = Parser(path=file) 101 | 102 | entity_count += len(parser.entity_mentions) 103 | 
event_count += len(parser.event_mentions) 104 | sent_count += len(parser.sents_with_pos) 105 | 106 | for item in parser.get_data(): 107 | data = dict() 108 | data['sentence'] = item['sentence'] 109 | data['golden-entity-mentions'] = [] 110 | data['golden-event-mentions'] = [] 111 | 112 | try: 113 | nlp_res_raw = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'}) 114 | nlp_res = json.loads(nlp_res_raw) 115 | except Exception as e: 116 | print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.') 117 | print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1') 118 | continue 119 | 120 | tokens = nlp_res['sentences'][0]['tokens'] 121 | 122 | if len(nlp_res['sentences']) >= 2: 123 | # TODO: issue where the sentence segmentation of NTLK and StandfordCoreNLP do not match 124 | # This error occurred so little that it was temporarily ignored (< 20 sentences). 125 | continue 126 | 127 | data['stanford-colcc'] = [] 128 | for dep in nlp_res['sentences'][0]['enhancedPlusPlusDependencies']: 129 | data['stanford-colcc'].append('{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1)) 130 | 131 | data['words'] = list(map(lambda x: x['word'], tokens)) 132 | data['pos-tags'] = list(map(lambda x: x['pos'], tokens)) 133 | data['lemma'] = list(map(lambda x: x['lemma'], tokens)) 134 | data['parse'] = nlp_res['sentences'][0]['parse'] 135 | 136 | sent_start_pos = item['position'][0] 137 | 138 | for entity_mention in item['golden-entity-mentions']: 139 | position = entity_mention['position'] 140 | start_idx, end_idx = find_token_index( 141 | tokens=tokens, 142 | start_pos=position[0] - sent_start_pos, 143 | end_pos=position[1] - sent_start_pos + 1, 144 | phrase=entity_mention['text'], 145 | ) 146 | 147 | entity_mention['start'] = start_idx 148 | entity_mention['end'] = end_idx 149 | 150 | del entity_mention['position'] 151 | 152 | # head 153 | head_position = entity_mention["head"]["position"] 154 | 155 | head_start_idx, head_end_idx = find_token_index( 156 | tokens=tokens, 157 | start_pos=head_position[0] - sent_start_pos, 158 | end_pos=head_position[1] - sent_start_pos + 1, 159 | phrase=entity_mention["head"]["text"] 160 | ) 161 | 162 | entity_mention["head"]["start"] = head_start_idx 163 | entity_mention["head"]["end"] = head_end_idx 164 | del entity_mention["head"]["position"] 165 | 166 | data['golden-entity-mentions'].append(entity_mention) 167 | 168 | for event_mention in item['golden-event-mentions']: 169 | # same event mention can be shared 170 | event_mention = copy.deepcopy(event_mention) 171 | position = event_mention['trigger']['position'] 172 | start_idx, end_idx = find_token_index( 173 | tokens=tokens, 174 | start_pos=position[0] - sent_start_pos, 175 | end_pos=position[1] - sent_start_pos + 1, 176 | phrase=event_mention['trigger']['text'], 177 | ) 178 | 179 | event_mention['trigger']['start'] = start_idx 180 | event_mention['trigger']['end'] = end_idx 181 | del event_mention['trigger']['position'] 182 | del event_mention['position'] 183 | 184 | arguments = [] 185 | argument_count += len(event_mention['arguments']) 186 | for argument in event_mention['arguments']: 187 | position = argument['position'] 188 | start_idx, end_idx = find_token_index( 189 | tokens=tokens, 190 | start_pos=position[0] - sent_start_pos, 191 | end_pos=position[1] - sent_start_pos + 1, 192 | phrase=argument['text'], 193 | ) 194 | 195 | argument['start'] = 
start_idx 196 | argument['end'] = end_idx 197 | del argument['position'] 198 | 199 | arguments.append(argument) 200 | 201 | event_mention['arguments'] = arguments 202 | data['golden-event-mentions'].append(event_mention) 203 | 204 | result.append(data) 205 | 206 | print('======[Statistics]======') 207 | print('sent :', sent_count) 208 | print('event :', event_count) 209 | print('entity :', entity_count) 210 | print('argument:', argument_count) 211 | 212 | verify_result(result) 213 | with open('output/{}.json'.format(data_type), 'w') as f: 214 | json.dump(result, f, indent=2) 215 | 216 | 217 | if __name__ == '__main__': 218 | parser = argparse.ArgumentParser() 219 | parser.add_argument('--data', help="Path of ACE2005 English data", default='./data/ace_2005_td_v7/data/English') 220 | parser.add_argument('--nlp', help="Standford Core Nlp path", default='./stanford-corenlp-full-2018-10-05') 221 | args = parser.parse_args() 222 | test_files, dev_files, train_files = get_data_paths(args.data) 223 | 224 | with StanfordCoreNLP(args.nlp, memory='8g', timeout=60000) as nlp: 225 | # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'}) 226 | # print(res) 227 | preprocessing('dev', dev_files) 228 | preprocessing('test', test_files) 229 | preprocessing('train', train_files) 230 | -------------------------------------------------------------------------------- /output/sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "sentence": "Earlier documents in the case have included embarrassing details about perks Welch received as part of his retirement package from GE at a time when corporate scandals were sparking outrage.", 4 | "golden-entity-mentions": [ 5 | { 6 | "text": "Welch", 7 | "entity-type": "PER:Individual", 8 | "head": { 9 | "text": "Welch", 10 | "start": 11, 11 | "end": 12 12 | }, 13 | "entity_id": "APW_ENG_20030325.0786-E24-38", 14 | "start": 11, 15 | "end": 12 16 | }, 17 | { 18 | "text": "his", 19 | "entity-type": "PER:Individual", 20 | "head": { 21 | "text": "his", 22 | "start": 16, 23 | "end": 17 24 | }, 25 | "entity_id": "APW_ENG_20030325.0786-E24-39", 26 | "start": 16, 27 | "end": 17 28 | }, 29 | { 30 | "text": "GE", 31 | "entity-type": "ORG:Commercial", 32 | "head": { 33 | "text": "GE", 34 | "start": 20, 35 | "end": 21 36 | }, 37 | "entity_id": "APW_ENG_20030325.0786-E26-40", 38 | "start": 20, 39 | "end": 21 40 | } 41 | ], 42 | "golden-event-mentions": [ 43 | { 44 | "trigger": { 45 | "text": "retirement", 46 | "start": 17, 47 | "end": 18 48 | }, 49 | "arguments": [ 50 | { 51 | "role": "Person", 52 | "entity-type": "PER:Individual", 53 | "text": "Welch", 54 | "start": 11, 55 | "end": 12 56 | }, 57 | { 58 | "role": "Entity", 59 | "entity-type": "ORG:Commercial", 60 | "text": "GE", 61 | "start": 20, 62 | "end": 21 63 | } 64 | ], 65 | "event_type": "Personnel:End-Position" 66 | } 67 | ], 68 | "stanford-colcc": [ 69 | "ROOT/dep=6/gov=-1", 70 | "amod/dep=0/gov=1", 71 | "nsubj/dep=1/gov=6", 72 | "case/dep=2/gov=4", 73 | "det/dep=3/gov=4", 74 | "nmod:in/dep=4/gov=1", 75 | "aux/dep=5/gov=6", 76 | "amod/dep=7/gov=8", 77 | "dobj/dep=8/gov=6", 78 | "case/dep=9/gov=10", 79 | "nmod:about/dep=10/gov=6", 80 | "nsubj/dep=11/gov=12", 81 | "acl:relcl/dep=12/gov=10", 82 | "case/dep=13/gov=14", 83 | "nmod:as/dep=14/gov=12", 84 | "case/dep=15/gov=18", 85 | "nmod:poss/dep=16/gov=18", 86 | "compound/dep=17/gov=18", 87 | "nmod:of/dep=18/gov=14", 88 | 
"case/dep=19/gov=20", 89 | "nmod:from/dep=20/gov=12", 90 | "case/dep=21/gov=23", 91 | "det/dep=22/gov=23", 92 | "nmod:at/dep=23/gov=12", 93 | "advmod/dep=24/gov=28", 94 | "amod/dep=25/gov=26", 95 | "nsubj/dep=26/gov=28", 96 | "aux/dep=27/gov=28", 97 | "acl:relcl/dep=28/gov=23", 98 | "dobj/dep=29/gov=28", 99 | "punct/dep=30/gov=6" 100 | ], 101 | "words": [ 102 | "Earlier", 103 | "documents", 104 | "in", 105 | "the", 106 | "case", 107 | "have", 108 | "included", 109 | "embarrassing", 110 | "details", 111 | "about", 112 | "perks", 113 | "Welch", 114 | "received", 115 | "as", 116 | "part", 117 | "of", 118 | "his", 119 | "retirement", 120 | "package", 121 | "from", 122 | "GE", 123 | "at", 124 | "a", 125 | "time", 126 | "when", 127 | "corporate", 128 | "scandals", 129 | "were", 130 | "sparking", 131 | "outrage", 132 | "." 133 | ], 134 | "pos-tags": [ 135 | "JJR", 136 | "NNS", 137 | "IN", 138 | "DT", 139 | "NN", 140 | "VBP", 141 | "VBN", 142 | "JJ", 143 | "NNS", 144 | "IN", 145 | "NNS", 146 | "NNP", 147 | "VBD", 148 | "IN", 149 | "NN", 150 | "IN", 151 | "PRP$", 152 | "NN", 153 | "NN", 154 | "IN", 155 | "NNP", 156 | "IN", 157 | "DT", 158 | "NN", 159 | "WRB", 160 | "JJ", 161 | "NNS", 162 | "VBD", 163 | "VBG", 164 | "NN", 165 | "." 166 | ], 167 | "lemma": [ 168 | "earlier", 169 | "document", 170 | "in", 171 | "the", 172 | "case", 173 | "have", 174 | "include", 175 | "embarrassing", 176 | "detail", 177 | "about", 178 | "perk", 179 | "Welch", 180 | "receive", 181 | "as", 182 | "part", 183 | "of", 184 | "he", 185 | "retirement", 186 | "package", 187 | "from", 188 | "GE", 189 | "at", 190 | "a", 191 | "time", 192 | "when", 193 | "corporate", 194 | "scandal", 195 | "be", 196 | "spark", 197 | "outrage", 198 | "." 199 | ], 200 | "parse": "(ROOT\n (S\n (NP\n (NP (JJR Earlier) (NNS documents))\n (PP (IN in)\n (NP (DT the) (NN case))))\n (VP (VBP have)\n (VP (VBN included)\n (NP (JJ embarrassing) (NNS details))\n (PP (IN about)\n (NP\n (NP (NNS perks))\n (SBAR\n (S\n (NP (NNP Welch))\n (VP (VBD received)\n (PP (IN as)\n (NP\n (NP (NN part))\n (PP (IN of)\n (NP (PRP$ his) (NN retirement) (NN package)))))\n (PP (IN from)\n (NP (NNP GE)))\n (PP (IN at)\n (NP\n (NP (DT a) (NN time))\n (SBAR\n (WHADVP (WRB when))\n (S\n (NP (JJ corporate) (NNS scandals))\n (VP (VBD were)\n (VP (VBG sparking)\n (NP (NN outrage)))))))))))))))\n (. 
.)))" 201 | } 202 | ] 203 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | from xml.etree import ElementTree 2 | from bs4 import BeautifulSoup 3 | import nltk 4 | import json 5 | import re 6 | 7 | 8 | class Parser: 9 | def __init__(self, path): 10 | self.path = path 11 | self.entity_mentions = [] 12 | self.event_mentions = [] 13 | self.sentences = [] 14 | self.sgm_text = '' 15 | 16 | self.entity_mentions, self.event_mentions = self.parse_xml(path + '.apf.xml') 17 | self.sents_with_pos = self.parse_sgm(path + '.sgm') 18 | self.fix_wrong_position() 19 | 20 | @staticmethod 21 | def clean_text(text): 22 | return text.replace('\n', ' ') 23 | 24 | def get_data(self): 25 | data = [] 26 | for sent in self.sents_with_pos: 27 | item = dict() 28 | 29 | item['sentence'] = self.clean_text(sent['text']) 30 | item['position'] = sent['position'] 31 | text_position = sent['position'] 32 | 33 | for i, s in enumerate(item['sentence']): 34 | if s != ' ': 35 | item['position'][0] += i 36 | break 37 | 38 | item['sentence'] = item['sentence'].strip() 39 | 40 | entity_map = dict() 41 | item['golden-entity-mentions'] = [] 42 | item['golden-event-mentions'] = [] 43 | 44 | for entity_mention in self.entity_mentions: 45 | entity_position = entity_mention['position'] 46 | 47 | if text_position[0] <= entity_position[0] and entity_position[1] <= text_position[1]: 48 | 49 | item['golden-entity-mentions'].append({ 50 | 'text': self.clean_text(entity_mention['text']), 51 | 'position': entity_position, 52 | 'entity-type': entity_mention['entity-type'], 53 | 'head': { 54 | "text": self.clean_text(entity_mention['head']["text"]), 55 | "position": entity_mention["head"]["position"] 56 | }, 57 | "entity_id": entity_mention['entity-id'] 58 | }) 59 | entity_map[entity_mention['entity-id']] = entity_mention 60 | 61 | for event_mention in self.event_mentions: 62 | event_position = event_mention['trigger']['position'] 63 | if text_position[0] <= event_position[0] and event_position[1] <= text_position[1]: 64 | event_arguments = [] 65 | for argument in event_mention['arguments']: 66 | try: 67 | entity_type = entity_map[argument['entity-id']]['entity-type'] 68 | except KeyError: 69 | print('[Warning] The entity in the other sentence is mentioned. This argument will be ignored.') 70 | continue 71 | 72 | event_arguments.append({ 73 | 'role': argument['role'], 74 | 'position': argument['position'], 75 | 'entity-type': entity_type, 76 | 'text': self.clean_text(argument['text']), 77 | }) 78 | 79 | item['golden-event-mentions'].append({ 80 | 'trigger': event_mention['trigger'], 81 | 'arguments': event_arguments, 82 | 'position': event_position, 83 | 'event_type': event_mention['event_type'], 84 | }) 85 | data.append(item) 86 | return data 87 | 88 | def find_correct_offset(self, sgm_text, start_index, text): 89 | offset = 0 90 | for i in range(0, 70): 91 | for j in [-1, 1]: 92 | offset = i * j 93 | if sgm_text[start_index + offset:start_index + offset + len(text)] == text: 94 | return offset 95 | 96 | print('[Warning] fail to find offset! 
(start_index: {}, text: {}, path: {})'.format(start_index, text, self.path)) 97 | return offset 98 | 99 | def fix_wrong_position(self): 100 | for entity_mention in self.entity_mentions: 101 | offset = self.find_correct_offset( 102 | sgm_text=self.sgm_text, 103 | start_index=entity_mention['position'][0], 104 | text=entity_mention['text']) 105 | 106 | entity_mention['position'][0] += offset 107 | entity_mention['position'][1] += offset 108 | entity_mention['head']["position"][0] += offset 109 | entity_mention['head']["position"][1] += offset 110 | 111 | for event_mention in self.event_mentions: 112 | offset1 = self.find_correct_offset( 113 | sgm_text=self.sgm_text, 114 | start_index=event_mention['trigger']['position'][0], 115 | text=event_mention['trigger']['text']) 116 | event_mention['trigger']['position'][0] += offset1 117 | event_mention['trigger']['position'][1] += offset1 118 | 119 | for argument in event_mention['arguments']: 120 | offset2 = self.find_correct_offset( 121 | sgm_text=self.sgm_text, 122 | start_index=argument['position'][0], 123 | text=argument['text']) 124 | argument['position'][0] += offset2 125 | argument['position'][1] += offset2 126 | 127 | def parse_sgm(self, sgm_path): 128 | with open(sgm_path, 'r') as f: 129 | soup = BeautifulSoup(f.read(), features='html.parser') 130 | self.sgm_text = soup.text 131 | 132 | doc_type = soup.doc.doctype.text.strip() 133 | 134 | def remove_tags(selector): 135 | tags = soup.findAll(selector) 136 | for tag in tags: 137 | tag.extract() 138 | 139 | if doc_type == 'WEB TEXT': 140 | remove_tags('poster') 141 | remove_tags('postdate') 142 | remove_tags('subject') 143 | elif doc_type in ['CONVERSATION', 'STORY']: 144 | remove_tags('speaker') 145 | 146 | sents = [] 147 | converted_text = soup.text 148 | 149 | for sent in nltk.sent_tokenize(converted_text): 150 | sents.extend(sent.split('\n\n')) 151 | sents = list(filter(lambda x: len(x) > 5, sents)) 152 | sents = sents[1:] 153 | sents_with_pos = [] 154 | last_pos = 0 155 | for sent in sents: 156 | pos = self.sgm_text.find(sent, last_pos) 157 | last_pos = pos 158 | sents_with_pos.append({ 159 | 'text': sent, 160 | 'position': [pos, pos + len(sent)] 161 | }) 162 | 163 | return sents_with_pos 164 | 165 | def parse_xml(self, xml_path): 166 | entity_mentions, event_mentions = [], [] 167 | tree = ElementTree.parse(xml_path) 168 | root = tree.getroot() 169 | 170 | for child in root[0]: 171 | if child.tag == 'entity': 172 | entity_mentions.extend(self.parse_entity_tag(child)) 173 | elif child.tag in ['value', 'timex2']: 174 | entity_mentions.extend(self.parse_value_timex_tag(child)) 175 | elif child.tag == 'event': 176 | event_mentions.extend(self.parse_event_tag(child)) 177 | 178 | return entity_mentions, event_mentions 179 | 180 | @staticmethod 181 | def parse_entity_tag(node): 182 | entity_mentions = [] 183 | 184 | for child in node: 185 | if child.tag != 'entity_mention': 186 | continue 187 | extent = child[0] 188 | head = child[1] 189 | charset = extent[0] 190 | head_charset = head[0] 191 | 192 | entity_mention = dict() 193 | entity_mention['entity-id'] = child.attrib['ID'] 194 | entity_mention['entity-type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE']) 195 | entity_mention['text'] = charset.text 196 | entity_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 197 | entity_mention["head"] = {"text": head_charset.text, 198 | "position": [int(head_charset.attrib['START']), int(head_charset.attrib['END'])]} 199 | 200 | 
entity_mentions.append(entity_mention) 201 | 202 | return entity_mentions 203 | 204 | @staticmethod 205 | def parse_event_tag(node): 206 | event_mentions = [] 207 | for child in node: 208 | if child.tag == 'event_mention': 209 | event_mention = dict() 210 | event_mention['event_type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE']) 211 | event_mention['arguments'] = [] 212 | for child2 in child: 213 | if child2.tag == 'ldc_scope': 214 | charset = child2[0] 215 | event_mention['text'] = charset.text 216 | event_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 217 | if child2.tag == 'anchor': 218 | charset = child2[0] 219 | event_mention['trigger'] = { 220 | 'text': charset.text, 221 | 'position': [int(charset.attrib['START']), int(charset.attrib['END'])], 222 | } 223 | if child2.tag == 'event_mention_argument': 224 | extent = child2[0] 225 | charset = extent[0] 226 | event_mention['arguments'].append({ 227 | 'text': charset.text, 228 | 'position': [int(charset.attrib['START']), int(charset.attrib['END'])], 229 | 'role': child2.attrib['ROLE'], 230 | 'entity-id': child2.attrib['REFID'], 231 | }) 232 | event_mentions.append(event_mention) 233 | return event_mentions 234 | 235 | @staticmethod 236 | def parse_value_timex_tag(node): 237 | entity_mentions = [] 238 | 239 | for child in node: 240 | extent = child[0] 241 | charset = extent[0] 242 | 243 | entity_mention = dict() 244 | entity_mention['entity-id'] = child.attrib['ID'] 245 | 246 | if 'TYPE' in node.attrib: 247 | entity_mention['entity-type'] = node.attrib['TYPE'] 248 | if 'SUBTYPE' in node.attrib: 249 | entity_mention['entity-type'] += ':{}'.format(node.attrib['SUBTYPE']) 250 | if child.tag == 'timex2_mention': 251 | entity_mention['entity-type'] = 'TIM:time' 252 | 253 | entity_mention['text'] = charset.text 254 | entity_mention['position'] = [int(charset.attrib['START']), int(charset.attrib['END'])] 255 | 256 | entity_mention["head"] = {"text": charset.text, 257 | "position": [int(charset.attrib['START']), int(charset.attrib['END'])]} 258 | 259 | entity_mentions.append(entity_mention) 260 | 261 | return entity_mentions 262 | 263 | 264 | if __name__ == '__main__': 265 | # parser = Parser('./data/ace_2005_td_v7/data/English/un/fp2/alt.gossip.celebrities_20041118.2331') 266 | parser = Parser('./data/ace_2005_td_v7/data/English/un/timex2norm/alt.corel_20041228.0503') 267 | data = parser.get_data() 268 | with open('./output/debug.json', 'w') as f: 269 | json.dump(data, f, indent=2) 270 | 271 | # index = parser.sgm_text.find("Diego Garcia") 272 | # print('index :', index) 273 | # print(parser.sgm_text[1918 - 30:]) 274 | --------------------------------------------------------------------------------
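For reference, a minimal sketch (not part of this repository) of how the character offsets produced by `Parser.parse_sgm()` relate to the raw `.sgm` text: it loads the same document used in the `__main__` debug block of `parser.py` and slices each sentence back out of `sgm_text` by its `position` range.

```python
from parser import Parser

# Minimal sketch: check that sentence character offsets slice back out of the
# raw .sgm text. Assumes the ACE 2005 data is unpacked under ./data as in the README.
parser = Parser('./data/ace_2005_td_v7/data/English/un/timex2norm/alt.corel_20041228.0503')

for sent in parser.sents_with_pos[:5]:
    start, end = sent['position']
    recovered = parser.sgm_text[start:end]
    # Sentence positions are located with str.find() on sgm_text, so these normally match;
    # offsets from the .apf.xml annotations are corrected separately by fix_wrong_position().
    print(recovered == sent['text'], start, end, sent['text'][:60])
```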