├── .gitignore
├── LICENSE
├── README.md
├── data
    ├── crisis_event_timeline.zip
    └── timeline17_event_timeline.zip
├── event_graph_construction
    ├── aida_timetable.py
    ├── bm25.py
    ├── event_coref_cross.py
    ├── ie_aida.sh
    ├── temporal_filter.py
    └── time_expression.py
└── overview.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | bak/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Timeline Summarization based on Event Graph Compression via Time-Aware Optimal Transport
 2 | 
 3 | Table of Contents
 4 | =================
 5 |   * [Overview](#overview)
 6 |   * [Data](#data)
 7 |   * [Reference](#reference)
 8 | 
 9 | ## Overview
10 | Data and code for the paper ["Timeline Summarization based on Event Graph Compression via Time-Aware Optimal Transport"](https://aclanthology.org/2021.emnlp-main.519/). The code will be released soon.
11 | 
12 | <p align="center">
13 |   <img src="./overview.png" alt="Photo" style="width: 100%;"/>
14 | </p>
15 | 
16 | 
17 | ## Data
18 | ### Timeline Datasets
19 | Please find the timeline dataset in [timeline](http://www.l3s.de/~gtran/timeline/). 
20 | 
21 | ### Event Graphs extracted from the timeline datasets
22 | 
23 | Event graphs are extracted using scripts under `event_graph_construction`. Please find the event graphs in [timeline_event_graphs](https://uofi.box.com/s/juxquy21z0z9a2ckw5okvrg8ay5i7apn).
24 | 
25 | ### Event Graphs from unlabeled data
26 | 
27 | 1. Unlabeled data are news articles collected from the Voice of America (VOA) website. Please find the raw data in [voa_news](https://uofi.box.com/s/rcylt88xxjra5iyvru8g9luhrya09trh).
28 | 
29 | 2. The event graphs extracted from these articles are available in [voa_events](https://uofi.box.com/s/ptp10x620p6m1k3mav79ap08je9fw6fn).
30 | 
31 | 
32 | ## Reference
33 | ```
34 | @article{li2021timeline,
35 |   author    = {Manling Li and Tengfei Ma and Mo Yu and Lingfei Wu and Tian Gao and Heng Ji and Kathleen McKeown},
36 |   title     = {Timeline Summarization based on Event Graph Compression
37 | via Time-Aware Optimal Transport},
38 |   journal   = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
39 |   year      = {2021},
40 |   url       = {https://aclanthology.org/2021.emnlp-main.519/},
41 | }
42 | ```
43 | 


--------------------------------------------------------------------------------
/data/crisis_event_timeline.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/limanling/event-graph-summarization/172d58c5b9463ae2fe60aab5fbe782bc1b2f1fe6/data/crisis_event_timeline.zip


--------------------------------------------------------------------------------
/data/timeline17_event_timeline.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/limanling/event-graph-summarization/172d58c5b9463ae2fe60aab5fbe782bc1b2f1fe6/data/timeline17_event_timeline.zip


--------------------------------------------------------------------------------
/event_graph_construction/aida_timetable.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import re
 4 | 
 5 | def get_date(text):
 6 |     pattern_date = re.compile('[0-9]{4}-[0-9]{2}-[0-9]{2}')
 7 | 
 8 |     date_match = pattern_date.search(text)
 9 |     if date_match: # if line.startswith('--------------'):
10 |         # date line
11 |         return date_match.group()
12 |     else:
13 |         return None
14 | 
if __name__ == '__main__':
    # Build a `docid<TAB>date` table for every file in an input directory,
    # taking each date from the file name.
    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional. Without it argparse treats the
    # argument as required and the `default` below is never used.
    parser.add_argument('rsd_dir', type=str, nargs='?', default='/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline17/oneie_input', help='input_dir')
    args = parser.parse_args()
    rsd_dir = args.rsd_dir

    # The table is written next to the input directory as
    # `<rsd_dir>.timetable.tab`.
    with open(rsd_dir+'.timetable.tab', 'w') as writer:
        writer.write('%s\t%s\n' % ('docid', 'date'))
        # os.listdir order is arbitrary; sort so the table is reproducible.
        for rsd_file in sorted(os.listdir(rsd_dir)):
            # `date` is None (written as the text "None") when the file name
            # contains no YYYY-MM-DD date.
            date = get_date(rsd_file)
            writer.write('%s\t%s\n' % (rsd_file, date))


--------------------------------------------------------------------------------
/event_graph_construction/bm25.py:
--------------------------------------------------------------------------------
  1 | from dateutil import parser
  2 | import dateutil
  3 | import os
  4 | import ujson as json
  5 | import sys
  6 | 
  7 | from gensim.summarization.bm25 import BM25
  8 | 
  9 | from nltk.corpus import stopwords
 10 | from nltk.tokenize import word_tokenize
 11 | from nltk.tokenize import sent_tokenize
 12 | 
 13 | from datetime import *
 14 | 
 15 | from collections import OrderedDict, defaultdict
 16 | 
 17 | def get_dates_timeline(timeline_file, timeline_output):
 18 |     # find date range of input timeline
 19 |     min_date = date.today()
 20 |     max_date = date(1800, 1, 1)
 21 |     date_num = 0
 22 |     date_last = date.today()
 23 |     timeline_writer = open(timeline_output.replace(' ','_'), 'w')
 24 |     for line in open(timeline_file):
 25 |         line = line.rstrip()
 26 |         if line.startswith('Our Standards:') or 'Min Read' in line or '</body>' == line or '<body>' == line or line.startswith('<title>') or line.startswith('By Reuters Staff') or line.startswith('Reporting by '):
 27 |             continue
 28 |         if ':' in line[:30]:
 29 |             date_str = line.split(':')[0]
 30 |         elif '- ' in line[:30]:
 31 |             date_str = line.split('-')[0]
 32 |         elif '– ' in line[:30]:
 33 |             date_str = line.split('–')[0]
 34 |         elif ' (' in line[:30]:
 35 |             date_str = line.split('(')[0]
 36 |         elif len(line) < 20:
 37 |             date_str = line
 38 |         # elif line.startswith('On '):
 39 |         else:
 40 |             date_str = ''
 41 |             # # print(line)
 42 |             # continue
 43 |         if 'Min Read' in date_str or '</body>' in date_str or '<body>' in date_str:
 44 |             continue
 45 |         
 46 |         if len(date_str) > 0:
 47 |             date_str_raw = date_str
 48 |             try:
 49 |                 date_str = date_str.strip().split('(')[0].replace('On ', '').split(' to')[0]
 50 |                 d = parser.parse(date_str, default=date_last, fuzzy=True, fuzzy_with_tokens=False)
 51 |                 date_last = d
 52 |                 if len(date_str) == 4:
 53 |                     # save year info, but delete this row
 54 |                     continue
 55 |                 # print(d, date_last)
 56 |                 # d = d.datetime.date()
 57 |                 # print('aaaa', d, d.year, date_last)
 58 |                 # d= datetime.date(d.strftime("%Y-%m-%d"))
 59 |                 if min_date > d:
 60 |                     min_date = d
 61 |                 if max_date < d:
 62 |                     max_date = d  
 63 |                 date_num += 1
 64 |                 timeline_writer.write('--------------------------------\n')
 65 |                 timeline_writer.write(d.strftime("%Y-%m-%d"))
 66 |                 timeline_writer.write('\n')
 67 |                 content_str = line.replace(date_str_raw, '').strip('-').strip(':').strip()
 68 |                 if len(content_str) > 0:
 69 |                     timeline_writer.write('\n'.join(sent_tokenize(content_str)))
 70 |                     timeline_writer.write('\n')
 71 |             except:
 72 |                 try:
 73 |                     date_str = date_str.split(',')[-1]
 74 |                     d = parser.parse(date_str, default=date_last, fuzzy=True, fuzzy_with_tokens=False)
 75 |                     date_last = d
 76 |                     if len(date_str) == 4:
 77 |                         # save year info, but delete this row
 78 |                         continue
 79 |                     # print(d, date_last)
 80 |                     # d = d.datetime.date()
 81 |                     # print('aaaa', d, d.year, date_last)
 82 |                     # d= datetime.date(d.strftime("%Y-%m-%d"))
 83 |                     if min_date > d:
 84 |                         min_date = d
 85 |                     if max_date < d:
 86 |                         max_date = d  
 87 |                     date_num += 1
 88 |                     timeline_writer.write('--------------------------------\n')
 89 |                     timeline_writer.write(d.strftime("%Y-%m-%d"))
 90 |                     timeline_writer.write('\n')
 91 |                     content_str = line.replace(date_str_raw, '').strip('-').strip(':').strip()
 92 |                     if len(content_str) > 0:
 93 |                         timeline_writer.write('\n'.join(sent_tokenize(content_str)))
 94 |                         timeline_writer.write('\n')
 95 |                 except: #dateutil.parser._parser.ParserError:
 96 |                     # print('CANNOT PARSE DATE', date_str)
 97 |                     # pass
 98 |                     # print(sys.exc_info())
 99 |                     content_str = line.strip('-').strip(':').strip()
100 |                     if len(content_str) > 0:
101 |                         timeline_writer.write('\n'.join(sent_tokenize(content_str)))
102 |                         timeline_writer.write('\n')
103 |         else:
104 |             content_str = line.strip('-').strip(':').strip()
105 |             if len(content_str) > 0:
106 |                 timeline_writer.write('\n'.join(sent_tokenize(content_str)))
107 |                 timeline_writer.write('\n')
108 |     # Timeline of Paris attacks and investigation_idUSKBN0TB0XZ20151122.txt
109 |     return min_date, max_date, date_num
110 | 
def rewrite_tl(timeline_input, timeline_output):
    """Copy each timeline file, dropping everything up to its first separator.

    For every file in *timeline_input*, the text up to and including the
    first occurrence of ``'---\\n'`` is removed and the remainder is written
    to a file of the same name in *timeline_output*. A file that contains no
    such marker is copied unchanged.

    Args:
        timeline_input: directory holding the files to read.
        timeline_output: existing directory that receives the rewritten files.
    """
    marker = '---\n'
    for timeline_file in os.listdir(timeline_input):
        with open(os.path.join(timeline_input, timeline_file)) as reader:
            timeline_content = reader.read()
        # partition() reports whether the marker was found, which avoids the
        # `find() + 4` pitfall of slicing off 3 characters when it is absent.
        _, found, remainder = timeline_content.partition(marker)
        if found:
            timeline_content = remainder
        with open(os.path.join(timeline_output, timeline_file), 'w') as writer:
            writer.write(timeline_content)
117 | 
118 | 
def get_datestr(date):
    """Format *date* as a ``YYYY-MM-DD`` string using its ``strftime``."""
    iso_day_format = "%Y-%m-%d"
    return date.strftime(iso_day_format)
122 | 
123 | # get the input candidates of that time period
def get_candidate(min_date, max_date, all_doc, all_doc_date_sorted):
    """Collect the documents dated within 15 days of a timeline's date range.

    Args:
        min_date: first date of the timeline (supports ``timedelta``
            arithmetic and ``strftime``).
        max_date: last date of the timeline (same requirements).
        all_doc: mapping from a ``YYYY-MM-DD`` string to the list of document
            paths for that day.
        all_doc_date_sorted: the keys of *all_doc* in ascending order; the
            scan stops at the first key past the window.

    Returns:
        ``(corpus, corpus_id)``: two parallel lists, the text of each
        candidate and its path. For paths containing ``voa_v2_processed``
        the whole file is used (outer newlines stripped); for paths
        containing ``voa_v1_processed`` only the first line is used. Paths
        matching neither are skipped entirely so the two lists stay aligned.
    """
    window = timedelta(days = 15)
    min_date_ext = (min_date - window).strftime("%Y-%m-%d")
    max_date_ext = (max_date + window).strftime("%Y-%m-%d")

    corpus = []
    corpus_id = []
    # Zero-padded YYYY-MM-DD strings compare lexicographically in date order.
    for date_doc in all_doc_date_sorted:
        if date_doc > max_date_ext:
            break
        if date_doc < min_date_ext:
            continue
        for doc_path in all_doc[date_doc]:
            if 'voa_v2_processed' in doc_path:
                with open(doc_path) as reader:
                    text = reader.read().strip('\n')
            elif 'voa_v1_processed' in doc_path:
                # readline() yields '' for an empty file instead of raising.
                with open(doc_path) as reader:
                    text = reader.readline().rstrip('\n')
            else:
                continue
            # Append text and id together: callers index both lists with the
            # same position.
            corpus.append(text)
            corpus_id.append(doc_path)

    return corpus, corpus_id
147 | 
def get_all_doc_voa_v1(all_doc_dir, all_doc_list, doc_date_dict):
    """Index the files of *all_doc_dir* by the date embedded in their names.

    Names are read at fixed character offsets, as in
    ``VOA_EN_NW_2009_11_01_406231_0.rsd`` (year at 10-13, month at 15-16,
    day at 18-19). Entries whose name starts with '.' are ignored.

    Both dictionaries are updated in place and returned:
    *all_doc_list* maps ``YYYY-MM-DD`` to a list of full paths, and
    *doc_date_dict* maps each full path to its ``YYYY-MM-DD`` string.
    """
    date_slices = ((10, 14), (15, 17), (18, 20))
    for file_name in os.listdir(all_doc_dir):
        if file_name.startswith('.'):
            continue
        year, month, day = (int(file_name[lo:hi]) for lo, hi in date_slices)
        day_key = date(year, month, day).strftime("%Y-%m-%d")
        full_path = os.path.join(all_doc_dir, file_name)
        all_doc_list.setdefault(day_key, list()).append(full_path)
        doc_date_dict[full_path] = day_key
    return all_doc_list, doc_date_dict
166 | 
def get_all_doc_voa_v2(all_doc_head_dir, all_doc_list, doc_date_dict):
    """Index ``<area>/head_rsd`` files under *all_doc_head_dir* by date.

    The date is taken from the dot-separated fields of the file name, in
    month, day, year order, as in ``VOA_ENG_NW.12.10.2019.309_head.rsd.txt``.
    Names using the ``VOA_ENG_NW_`` prefix are first rewritten to the
    ``VOA_ENG_NW.`` form. Hidden entries and names starting with
    ``VOA_ENG_NW_None`` are ignored.

    Both dictionaries are updated in place and returned:
    *all_doc_list* maps ``YYYY-MM-DD`` to a list of full paths, and
    *doc_date_dict* maps each full path to its ``YYYY-MM-DD`` string.
    """
    for area in os.listdir(all_doc_head_dir):
        if area.startswith('.'):
            continue
        head_dir = os.path.join(all_doc_head_dir, area, 'head_rsd')
        for file_name in os.listdir(head_dir):
            if file_name.startswith(('.', 'VOA_ENG_NW_None')):
                continue
            if file_name.startswith('VOA_ENG_NW_'):
                dotted_name = file_name.replace('VOA_ENG_NW_', 'VOA_ENG_NW.')
            else:
                # VOA_ENG_NW.12.10.2019.309_head.rsd.txt
                dotted_name = file_name
            fields = dotted_name.split('.')
            year, month, day = int(fields[3]), int(fields[1]), int(fields[2])
            day_key = date(year, month, day).strftime("%Y-%m-%d")
            full_path = os.path.join(head_dir, file_name)
            all_doc_list.setdefault(day_key, list()).append(full_path)
            doc_date_dict[full_path] = day_key
    return all_doc_list, doc_date_dict
196 | 
197 | # use BM25 to rank the candidates
stop_words = set(stopwords.words('english'))
def simple_tok(sent):
    """Tokenize *sent* with ``word_tokenize`` and drop English stop words.

    Stop-word matching is done on the lower-cased token; the tokens that are
    kept are returned with their original casing.
    """
    return [token for token in word_tokenize(sent) if token.lower() not in stop_words]
207 | 
def bm25(corpus, corpus_ids, query, output_dir, doc_date_dict, timeline_name, topk=300, min_score=100):
    """Rank candidate documents against a query with BM25 and export strong hits.

    Args:
        corpus: list of document texts to rank.
        corpus_ids: document paths, parallel to *corpus*.
        query: query text (the caller passes the raw timeline).
        output_dir: root directory for exported articles.
        doc_date_dict: mapping from document path to its ``YYYY-MM-DD`` date.
        timeline_name: name of the timeline; spaces become underscores in the
            export directory name.
        topk: accepted for backward compatibility but currently unused; the
            complete ranking is returned.
        min_score: minimum BM25 score a document needs to be exported.

    Returns:
        ``(best_docs_ids, best_docs_content)``: the document paths in
        descending score order, and a mapping from path to
        ``(score, text)``. For an empty *corpus* two empty lists are returned.

    Side effects:
        For each document scoring at least *min_score*, the matching article
        (path derived by swapping ``head_rsd`` for ``article_rsd`` and
        ``_head.rsd.txt`` for ``.rsd.txt``) is sentence-split and written to
        ``<output_dir>/<timeline_name>/<date>/<file name>``.
    """
    if len(corpus) == 0:
        return list(), list()
    tok_corpus = [simple_tok(s) for s in corpus]
    # Named `ranker` so the local does not shadow this function.
    ranker = BM25(tok_corpus)
    scores = ranker.get_scores(simple_tok(query))

    best_docs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    best_docs_ids = list()
    best_docs_content = defaultdict()
    timeline_dir = os.path.join(output_dir, timeline_name.replace(' ','_'))
    for b in best_docs:
        doc_path = corpus_ids[b]
        score = scores[b]
        best_docs_ids.append(doc_path)
        best_docs_content[doc_path] = (score, corpus[b])
        if score < min_score:
            continue
        day_dir = os.path.join(timeline_dir, doc_date_dict[doc_path])
        os.makedirs(day_dir, exist_ok=True)
        out_name = doc_path.split('/')[-1].replace('head_rsd', 'article_rsd').replace(' ','_')
        content_path = doc_path.replace('_head.rsd.txt', '.rsd.txt').replace('head_rsd', 'article_rsd')
        # The output file is created before the article is read, so a missing
        # article leaves an empty file behind (unchanged best-effort behaviour).
        with open(os.path.join(day_dir, out_name), 'w') as writer:
            try:
                with open(content_path) as reader:
                    content_str = reader.read()
                writer.write('\n'.join(sent_tokenize(content_str)))
            except Exception:
                print('cannot find file', content_path)
    return best_docs_ids, best_docs_content
241 | 
242 | 
if __name__ == '__main__':
    # Pipeline: index all VOA documents by date, normalise every timeline,
    # then retrieve and export BM25 candidates for each usable timeline.
    input_timeline = '/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline/cleaned'
    input_doc_dir_v1 = '/shared/nas/data/m1/manling2/mmqa/data/voa_v1_processed/article/rsd'
    input_doc_head_dir_v2 = '/shared/nas/data/m1/manling2/mmqa/data/voa_v2_processed'
    output_dir = '/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline/bm25_merge'
    output_timline_std = '/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline/clean_format'
    output_input_std = '/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline/input_format'
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(output_timline_std, exist_ok=True)
    os.makedirs(output_timline_std+'_tmp', exist_ok=True)
    os.makedirs(output_input_std, exist_ok=True)

    all_doc_list = dict()
    doc_date_dict = dict()
    all_doc_list, doc_date_dict = get_all_doc_voa_v1(input_doc_dir_v1, all_doc_list, doc_date_dict)
    print('all_doc_list', len(all_doc_list))
    all_doc_list, doc_date_dict = get_all_doc_voa_v2(input_doc_head_dir_v2, all_doc_list, doc_date_dict)
    print('all_doc_list', len(all_doc_list))
    all_doc_date_sorted = sorted(all_doc_list)
    with open('/shared/nas/data/m1/manling2/ibm/graph_sum_text/src/timeline/dataset/all_doc_date_sorted_merge.json', 'w') as writer:
        json.dump(all_doc_date_sorted, writer, indent=4)

    valid_timeline = list()
    for timeline_file in os.listdir(input_timeline):
        if timeline_file.startswith('.'):
            continue
        min_date, max_date, date_num = get_dates_timeline(os.path.join(input_timeline, timeline_file), os.path.join(output_timline_std+'_tmp', timeline_file))
        date_difference = (max_date - min_date).days
        print(timeline_file, date_difference, date_num)

        # Timelines spanning more than 1000 days are not used.
        if date_difference > 1000:
            continue

        corpus, corpus_id = get_candidate(min_date, max_date, all_doc_list, all_doc_date_sorted)
        print('candidate_size', min_date, max_date, len(corpus_id))
        # Too few candidate documents in the window to be worth ranking.
        if len(corpus_id) < 5:
            continue
        with open(os.path.join(input_timeline, timeline_file)) as reader:
            query = reader.read()
        best_docs_ids, best_docs_content = bm25(corpus, corpus_id, query, output_input_std, doc_date_dict, timeline_file, topk=date_num*30)
        with open(os.path.join(output_dir, timeline_file+'bm25.json'), 'w') as writer:
            json.dump(best_docs_content, writer, indent=4)

        valid_timeline.append(timeline_file)

    # rewrite_tl copies the whole tmp directory, so run it once after every
    # timeline has been normalised instead of once per timeline.
    rewrite_tl(output_timline_std+'_tmp', output_timline_std)

    print('valid_timeline', len(valid_timeline))


--------------------------------------------------------------------------------
/event_graph_construction/event_coref_cross.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import json
  4 | import requests
  5 | from os.path import join
  6 | import argparse
  7 | import shutil
  8 | 
  9 | BASE_PATH = '.'
 10 | 
 11 | def read_data(json_file):
 12 |     f = open(json_file, 'r')
 13 |     data = f.read()
 14 |     f.close()
 15 |     return data
 16 | 
 17 | def save_json_format(entity_cs_file, event_cs_file, relation_cs_file,
 18 |     json_file_dir, edl_cs_file, edl_tab_file, ltf_dir):
 19 |     data = {
 20 |         "oneie": {
 21 |             "en": {
 22 |                 "cs": {
 23 |                     "entity": "",
 24 |                     "event": "",
 25 |                     "relation": ""
 26 |                 },
 27 |                 "json": {}
 28 |             },
 29 |             "es": {
 30 |                 "cs": {
 31 |                     "entity": "",
 32 |                     "event": "",
 33 |                     "relation": ""
 34 |                 },
 35 |                 "json": {}
 36 |             }
 37 |         },
 38 |         "edl": {
 39 |             "en": {
 40 |                 "cs": "",
 41 |                 "tab": ""
 42 |             },
 43 |             "es": {
 44 |                 "cs": "",
 45 |                 "tab": ""
 46 |             }
 47 |         },
 48 |         "coref": {},
 49 |         "temporal_relation": {},
 50 |         "translation": {},
 51 |         "graph_g": "",
 52 |         "ext": {
 53 |             "en": "{}",
 54 |             "es": "{}"
 55 |         },
 56 |         "matching": {},
 57 |         "data": {
 58 |             "en": [],
 59 |             "es": []
 60 |         },
 61 |         "relation_enrichment": ""
 62 |     }
 63 |     # data = dict()
 64 | 
 65 |     # # oneie
 66 |     # data['oneie']['en']['bio']['nam'] = open(bio_nam_file).read()
 67 |     # data['oneie']['en']['bio']['nam+nom+pro'] = open(bio_all_file).read()
 68 |     # data['oneie']['en']['bio']['nom'] = open(bio_nom_file).read()
 69 |     # data['oneie']['en']['bio']['pro'] = open(bio_pro_file).read()
 70 |     # data['oneie']['en']['cfet'] = open(cfet_file).read()
 71 |     data['oneie']['en']['cs']['entity'] = open(entity_cs_file).read()
 72 |     data['oneie']['en']['cs']['event'] = open(event_cs_file).read()
 73 |     data['oneie']['en']['cs']['relation'] = open(relation_cs_file).read()
 74 |     # data['oneie']['en']['json'] = dict()
 75 |     for json_file in os.listdir(json_file_dir):
 76 |         data['oneie']['en']['json'][json_file] = open(os.path.join(json_file_dir, json_file)).read()
 77 |     # data['oneie']['en']['tab']['nam'] = open(tab_nam_file).read()
 78 |     # data['oneie']['en']['tab']['nam+nom+pro'] = open(tab_all_file).read()
 79 |     # data['oneie']['en']['tab']['nom'] = open(tab_nom_file).read()
 80 |     # data['oneie']['en']['tab']['pro'] = open(tab_pro_file).read()
 81 | 
 82 |     # edl
 83 |     data['edl']['en']['cs'] = open(edl_cs_file).read()
 84 |     data['edl']['en']['tab'] = open(edl_tab_file).read()
 85 | 
 86 |     # ltf data
 87 |     for ltf_file in os.listdir(ltf_dir):
 88 |         ltf_content = open(os.path.join(ltf_dir, ltf_file)).read()
 89 |         data['data']['en'].append(ltf_content)
 90 |     
 91 |     return data
 92 | 
 93 | 
 94 | def clean_event_cs(event_cs_str):
 95 |     lines = list()
 96 |     # revise the event id
 97 |     event_cs_str = event_cs_str.replace('::Event', ':Event')
 98 |     # remove `modality`
 99 |     for line in event_cs_str.split('\n'):
100 |         if 'modality' not in line:
101 |             lines.append(line)
102 |     return '\n'.join(lines)
103 | 
if __name__ == '__main__':
    # Command line: <input_dir> <port>. All inputs and outputs live under
    # input_dir; the request is sent to http://localhost:<port>/process.
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir', type=str, help='The dataset directory.')
    parser.add_argument('port', type=str, help='The port.')
    args = parser.parse_args()
    input_dir = args.input_dir
    port = args.port

    entity_cs_file = os.path.join(input_dir, 'merge/cs/entity.cs')
    event_cs_file = os.path.join(input_dir, 'merge/cs/event.cs')
    relation_cs_file = os.path.join(input_dir, 'merge/cs/relation.cs')
    json_file_dir = os.path.join(input_dir, 'merge/json')
    edl_cs_file = os.path.join(input_dir, 'edl/en.linking.cs')
    edl_tab_file = os.path.join(input_dir, 'edl/en.linking.tab')
    ltf_dir = os.path.join(input_dir, 'ltf')
    output_file = os.path.join(input_dir, 'coref.txt')
    output_file_entity = os.path.join(input_dir, 'entity_coref.cs')
    output_file_relation = os.path.join(input_dir, 'relation_coref.cs')
    output_file_event = os.path.join(input_dir, 'event_coref.cs')

    input_data = save_json_format(entity_cs_file, event_cs_file, relation_cs_file,
        json_file_dir, edl_cs_file, edl_tab_file, ltf_dir)
    response = requests.post('http://localhost:%s/process' % port, json={'data': input_data})
    # Persist the raw reply first so a failed run can still be inspected.
    with open(output_file, 'w') as f:
        f.write(response.text)
    # Stop with the HTTP status instead of an unrelated JSON/KeyError further
    # down when the service answered with an error.
    response.raise_for_status()
    ans = json.loads(response.text)
    with open(output_file_entity, 'w') as f:
        f.write(ans['entity.cs'])
    with open(output_file_relation, 'w') as f:
        f.write(ans['relation.cs'])
    with open(output_file_event, 'w') as f:
        # The event output is post-processed: ids rewritten, modality lines dropped.
        f.write(clean_event_cs(ans['event.cs']))
139 | 
140 |     # # rewrite event_corefer.cs
141 |     # for event_corefer_file in glob.glob('/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline17/oneie_input/*/event_coref.cs'):
142 |     #     # remove the modality line
143 |     #     event_corefer_string = open(event_corefer_file).read()
144 |     #     with open(event_corefer_file.replace('.cs', '_fix.cs'), 'w') as f:
145 |     #         f.write(clean_event_cs(event_corefer_string))
146 | 
147 | 
148 | # KAIROS_LIB=/shared/nas/data/m1/manling2/aida_docker_test/ta2-pipeline-local/output


--------------------------------------------------------------------------------
/event_graph_construction/ie_aida.sh:
--------------------------------------------------------------------------------
  1 | data_root=$1
  2 | export CUDA_VISIBLE_DEVICES=$2
  3 | # data_root="/shared/nas/data/m1/manling2/ibm/graph_sum_text/data/timeline17/oneie_timeline"
  4 | ltf_source=${data_root}/ltf
  5 | rsd_source=${data_root}/rsd
  6 | parent_child_tab_path=${data_root}/parent_children.tab
  7 | lang="en"
  8 | 
  9 | rsd_file_list=${data_root}/rsd_lst
 10 | core_nlp_output_path=${data_root}/corenlp
 11 | edl_tab_nam_filename=${lang}.nam.tab
 12 | edl_tab_nom_filename=${lang}.nom.tab
 13 | edl_tab_pro_filename=${lang}.pro.tab
 14 | edl_output_dir=${data_root}/edl
 15 | edl_tab_link=${edl_output_dir}/${lang}.linking.tab
 16 | edl_tab_link_fb=${edl_output_dir}/${lang}.linking.freebase.tab
 17 | edl_tab_final=${edl_output_dir}/merged_final.tab
 18 | edl_cs_coarse=${edl_output_dir}/merged.cs
 19 | edl_cs_oneie=${data_root}/merge/cs/entity.cs
 20 | filler_coarse=${edl_output_dir}/filler_${lang}.cs
 21 | filler_coarse_color=${edl_output_dir}/filler_${lang}_all.cs
 22 | relation_cs_oneie=${data_root}/merge/cs/relation.cs # final cs output for relation
 23 | relation_result_dir=${data_root}/relation   # final cs output file path
 24 | relation_cs_coarse=${relation_result_dir}/${lang}.rel.cs # final cs output for relation
 25 | new_relation_coarse=${relation_result_dir}/new_relation_${lang}.cs
 26 | event_result_dir=${data_root}/event
 27 | event_coarse_oneie=${data_root}/merge/cs/event.cs
 28 | event_coarse_without_time=${event_result_dir}/event_rewrite.cs
 29 | event_corefer=${data_root}/event_coref.cs
 30 | event_corefer_idfix=${data_root}/event_coref_idfix.cs
 31 | event_corefer_time=${data_root}/event_coref_timenorm.cs
 32 | event_corefer_timesimple=${data_root}/event_coref_timesimple.cs
 33 | event_corefer_timeorder=${data_root}/event_order.cs
 34 | event_corefer_timeorder_filter=${data_root}/event_order_filter.cs
 35 | entity_corefer=${data_root}/entity_coref.cs
 36 | merged_cs=${data_root}/${lang}${source}_full.cs
 37 | timetable_tab=${data_root}/rsd.timetable.tab
 38 | 
 39 | # oneie
 40 | docker run --rm -i -v ${data_root}:${data_root} -w /oneie --gpus device=$2 limteng/oneie_aida_m36 \
 41 |     /opt/conda/bin/python \
 42 |     /oneie/predict.py -i ${ltf_source} -o ${data_root} -l ${lang}
 43 | 
 44 | 
 45 | 
 46 | # # stanford nlp
 47 | docker run --rm -v ${data_root}:${data_root} -w `pwd` -i limanling/uiuc_ie_m36 \
 48 |     /opt/conda/envs/py36/bin/python \
 49 |     /aida_utilities/dir_readlink.py ${rsd_source} ${rsd_file_list} 
 50 | python aida_timetable.py ${rsd_source}
 51 | docker run --rm -v ${data_root}:${data_root} -w /stanford-corenlp-aida_0 -i limanling/aida-tools \
 52 |     java -mx50g -cp '/stanford-corenlp-aida_0/*' edu.stanford.nlp.pipeline.StanfordCoreNLP \
 53 |     $* -annotators 'tokenize,ssplit,pos,lemma,ner' \
 54 |     -outputFormat json \
 55 |     -filelist ${rsd_file_list} \
 56 |     -ner.docdate.useMappingFile ${timetable_tab} \
 57 |     -properties StanfordCoreNLP_${lang}.properties \
 58 |     -outputDirectory ${core_nlp_output_path}
 59 | 
 60 | # # echo "** Linking entities to KB **"
 61 | wget http://159.89.180.81/demo/resources/edl_data.tar.gz -P ./data
 62 | tar zxvf ./data/edl_data.tar.gz -C ./data
 63 | docker run -d --rm -v ${PWD}/edl_data/db:/data/db --name db mongo:4.2
 64 | docker run -v ${PWD}/edl_data:/data \
 65 |     -v ${data_root}:/testdata_${lang} \
 66 |     --link db:mongo panx27/edl \
 67 |     python ./projs/docker_aida19/aida19.py \
 68 |     ${lang} \
 69 |     /testdata_${lang}/merge/mention/${edl_tab_nam_filename} \
 70 |     /testdata_${lang}/merge/mention/${edl_tab_nom_filename} \
 71 |     /testdata_${lang}/merge/mention/${edl_tab_pro_filename} \
 72 |     /testdata_${lang}/edl \
 73 |     m36
 74 | 
 75 | # # coreference
 76 | python event_coref_cross.py ${data_root} 22222
 77 | 
 78 | ## rewrite relation and event
 79 | docker run --rm -v ${data_root}:${data_root} -i limanling/uiuc_ie_m36 \
 80 |     /opt/conda/envs/py36/bin/python \
 81 |     /aida_utilities/rewrite_entity_id.py \
 82 |     ${edl_cs_oneie} ${relation_cs_oneie} ${event_coarse_oneie} \
 83 |     ${entity_corefer} ${relation_cs_coarse} ${event_coarse_without_time}
 84 | 
 85 | # temporal order
 86 | docker run --rm -v ${data_root}:${data_root} --gpus device=$2  -w /roberta_temporal_relation -i wenhycs/uiuc_kairos_temporal_relation \
 87 |     python kairos_temporal_relation_pipeline.py \
 88 |     --ltf_path ${ltf_source} \
 89 |     --event_cold_start_filename ${event_corefer} \
 90 |     --output_filename ${event_corefer_timeorder} \
 91 |     --add_sharing_arg
 92 | 
 93 | python temporal_filter.py \
 94 |     --input_cs ${event_corefer_timeorder} --filtered_output_cs ${event_corefer_timeorder_filter}
 95 | 
 96 | # # Filler Extraction & new relation
 97 | docker run --rm -v ${data_root}:${data_root} -i limanling/uiuc_ie_m36 \
 98 |     /opt/conda/envs/py36/bin/python \
 99 |     /entity/aida_filler/extract_filler_relation.py \
100 |     --corenlp_dir ${core_nlp_output_path} \
101 |     --ltf_dir ${ltf_source} \
102 |     --edl_path ${entity_corefer} \
103 |     --text_dir ${rsd_source} \
104 |     --path_relation ${new_relation_coarse} \
105 |     --path_filler ${filler_coarse} \
106 |     --lang ${lang}
107 | 
108 | ## Add time expression
109 | python time_expression.py \
110 |     ${ltf_source} ${filler_coarse} ${event_corefer} ${event_corefer_timesimple}
111 | 
112 | # 4-tuple
113 | docker run -i --rm -v ${data_root}:${data_root} \
114 |     -v ${parent_child_tab_path}:${parent_child_tab_path} \
115 |     -w /EventTimeArg --gpus device=$2 wenhycs/uiuc_event_time \
116 |     python aida_event_time_pipeline.py \
117 |     --time_cold_start_filename ${filler_coarse} \
118 |     --event_cold_start_filename ${event_corefer} \
119 |     --read_cs_event \
120 |     --parent_children_filename ${parent_child_tab_path} \
121 |     --ltf_path ${ltf_source} \
122 |     --output_filename ${event_corefer_time} \
123 |     --use_dct_as_default \
124 |     --lang ${lang}
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/event_graph_construction/temporal_filter.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import csv
  3 | import json
  4 | import copy
  5 | import logging
  6 | # import configargparse
  7 | import argparse
  8 | 
  9 | from math import ceil
 10 | from tqdm import tqdm
 11 | from queue import Queue
 12 | from datetime import datetime
 13 | from typing import List, Optional
 14 | from collections import defaultdict
 15 | from itertools import product
 16 | 
 17 | 
 18 | # def _get_validated_args(input_args: Optional[List[str]] = None):
 19 | #     parser = configargparse.ArgumentParser(
 20 | #         config_file_parser_class=configargparse.YAMLConfigFileParser
 21 | #     )
 22 | 
 23 | #     parser.add_argument("--input_cs", type=str, default="/shared/nas/data/m1/wen17/tmp/20201011_kairos_backpack_roadside_temporal_relation/temporal_relation.cs",
 24 | #                         help="The input file for temporal relations.")
 25 | #     parser.add_argument("--input_es_cs", type=str, default="None",
 26 | #                         help="The input file for Spanish temporal relations.")
 27 | #     parser.add_argument("--filtered_output_cs", type=str, default="./outputs/filtered_temporal_relation.cs",
 28 | #                         help="The output file for filtered temporal relations.")
 29 | #     parser.add_argument("--do_augmentation", action="store_true",
 30 | #                         help="Do relation augmentation (find the closure).")
 31 | #     parser.add_argument("--augmented_output_cs", type=str, default="/shared/nas/data/m1/wen17/tmp/20201011_kairos_backpack_roadside_temporal_relation/augmented_temporal_relation.cs",
 32 | #                         help="The output file for augmented temporal relations.")
 33 | #     parser.add_argument("--event_cs", type=str, default="/shared/nas/data/m1/tuanml2/kairos/output/backpack_roadside_ied/event.cs",
 34 | #                         help="The input file for cold start format events.")
 35 | 
 36 | #     args = parser.parse_args(input_args)
 37 |     
 38 | #     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
 39 | #                         datefmt="%m/%d/%Y %H:%M:%S",
 40 | #                         level=logging.INFO)
 41 |     
 42 | #     return args
 43 | 
 44 | 
 45 | 
 46 | def load_temporal_cs(input_cs):
 47 |     temporal_rels = []
 48 |     for line in open(input_cs):
 49 |         line = line.rstrip('\n')
 50 |         tabs = line.split('\t')
 51 |         if tabs[1] == "TEMPORAL_BEFORE":
 52 |             events = [tabs[0], tabs[2]]
 53 |         elif tabs[1] == "TEMPORAL_AFTER":
 54 |             events = [tabs[2], tabs[0]]
 55 |         else:
 56 |             continue
 57 |         confidence = float(tabs[3])
 58 |         temporal_rels.append([events, confidence])
 59 |     return temporal_rels
 60 | 
 61 | 
 62 | def remove_conflict_temporal_relations(temporal_rels):
 63 |     edges = defaultdict(list)
 64 |     def can_reach(x, y):
 65 |         vis.add(x)
 66 |         for edge in edges[x]:
 67 |             if edge == y:
 68 |                 return True
 69 |             else:
 70 |                 if not edge in vis:
 71 |                     is_reached = can_reach(edge, y)
 72 |                     if is_reached:
 73 |                         return True
 74 |         return False
 75 | 
 76 |     temporal_rels = sorted(temporal_rels, key=lambda x: x[1], reverse=True)
 77 |     final_temporal_rels = []
 78 |     for rel in tqdm(temporal_rels, total=len(temporal_rels)):
 79 |         vis = set()
 80 |         # Remove self loop
 81 |         if rel[0][1] == rel[0][0]:
 82 |             continue
 83 |         if (not can_reach(rel[0][1], rel[0][0])) and not (rel[0][1] in edges[rel[0][0]]):
 84 |             edges[rel[0][0]].append(rel[0][1])
 85 |             final_temporal_rels.append(rel)
 86 | 
 87 |     return final_temporal_rels
 88 | 
def augment_temporal_relations(temporal_rels):
    """Add relations implied by chaining the existing ones.

    For every node ``k``, each predecessor ``i`` of ``k`` is connected to each
    successor ``j`` of ``k`` unless ``i -> j`` already exists.

    New relations are appended to ``temporal_rels`` itself with confidence 0,
    and that same list object is returned, so the caller's list is modified
    in place.
    """
    # node -> set of direct successors
    edges = defaultdict(set)
    # node -> set of direct predecessors
    inverse_edges = defaultdict(set)
    nodes = set()
    for rel in temporal_rels:
        edges[rel[0][0]].add(rel[0][1])
        inverse_edges[rel[0][1]].add(rel[0][0])
        nodes.add(rel[0][0])
        nodes.add(rel[0][1])
    # k is the intermediate node; nodes is a set, so the order in which new
    # relations get appended follows the set's iteration order.
    for k in tqdm(nodes, total=len(nodes)):
        for i in inverse_edges[k]:
            if k in edges[i]:
                for j in edges[k]:
                    if not (j in edges[i]):
                        # i -> k and k -> j exist, so record i -> j and keep
                        # both adjacency maps up to date.
                        temporal_rels.append([[i, j], 0])
                        edges[i].add(j)
                        inverse_edges[j].add(i)
    return temporal_rels
107 | 
108 | 
109 | 
def topological_sort(temporal_rels, nodes):
    """Order event ids so that every "earlier" event precedes its successors.

    Uses in-degree counting with a FIFO queue: nodes without incoming
    relations are emitted first (in the order given by ``nodes``), and a
    successor is emitted once all of its incoming relations are consumed.
    Nodes that sit on a cycle never reach in-degree zero and are left out.

    Args:
        temporal_rels: list of ``[[earlier, later], confidence]`` entries.
        nodes: iterable of event ids used to seed the queue.

    Returns:
        List of event ids in topological order.
    """
    successors = defaultdict(list)
    pending = defaultdict(int)
    for rel in temporal_rels:
        pair = rel[0]
        successors[pair[0]].append(pair[1])
        pending[pair[1]] += 1

    ready = Queue()
    for candidate in list(nodes):
        if pending[candidate] == 0:
            ready.put(candidate)

    ordering = []
    while not ready.empty():
        current = ready.get()
        ordering.append(current)
        for follower in successors[current]:
            pending[follower] -= 1
            if pending[follower] == 0:
                ready.put(follower)
    return ordering
132 | 
133 | 
def get_connected_components(sorted_event_ids, temporal_rels):
    """Group events into components by following relations forward.

    Events are processed in the order of ``sorted_event_ids``. For each event
    a depth-first walk along outgoing relations looks for a node that already
    belongs to a component; if one is found the event joins it, otherwise a
    new component is opened. The event and every not-yet-assigned node
    reachable from it are then added to that component.

    Args:
        sorted_event_ids: event ids, e.g. in topological order.
        temporal_rels: list of ``[[earlier, later], confidence]`` entries.

    Returns:
        A list of components, each a list of event ids sorted by their
        position in ``sorted_event_ids``.
    """
    successors = defaultdict(list)
    for rel in temporal_rels:
        successors[rel[0][0]].append(rel[0][1])

    owner = dict()   # event id -> index of its component
    groups = []      # component index -> set of member ids

    def lookup(node, seen):
        # First component id found on a forward walk from node, or -1.
        seen.add(node)
        if node in owner:
            return owner[node]
        for follower in successors[node]:
            if follower in seen:
                continue
            found = lookup(follower, seen)
            if found != -1:
                return found
        return -1

    def claim(node, group_id, seen):
        # Assign node and all unassigned nodes reachable from it.
        seen.add(node)
        groups[group_id].add(node)
        owner[node] = group_id
        for follower in successors[node]:
            if follower not in seen and follower not in owner:
                claim(follower, group_id, seen)

    for event in sorted_event_ids:
        group_id = lookup(event, set())
        if group_id == -1:
            group_id = len(groups)
            groups.append(set())
        claim(event, group_id, set())

    rank = {event: position for position, event in enumerate(sorted_event_ids)}
    return [sorted(members, key=lambda member: rank[member]) for members in groups]
177 | 
178 | 
def load_event_cs(filename):
    """Parse an event cold-start file into a per-event dictionary.

    Each line is tab-separated, ``<event_id>  <field>  <value>  ...``:

    * ``type`` lines set ``events[event_id]["type"]``.
    * ``canonical_mention.actual`` lines set ``events[event_id]["mention_text"]``
      (surrounding double quotes removed).
    * Any other line whose value starts with ``:Entity_EDL`` is recorded as an
      argument ``{"role": <field>, "id": <value>}``.

    Lines with fewer than three fields (for example blank lines) are skipped.
    The ``arguments`` list is created on demand, so an argument line that
    appears before the event's ``type`` line no longer raises ``KeyError``.

    Returns:
        ``defaultdict(dict)`` keyed by event id.
    """
    events = defaultdict(dict)
    with open(filename) as cs_file:
        for line in cs_file:
            splits = line.strip("\n").split("\t")
            if len(splits) < 3:
                continue
            event_id, field, value = splits[0], splits[1], splits[2]
            if field == "type":
                events[event_id]["type"] = value
                events[event_id].setdefault("arguments", [])
            elif field == "canonical_mention.actual":
                events[event_id]["mention_text"] = value.strip("\"")
            elif value.startswith(":Entity_EDL"):
                events[event_id].setdefault("arguments", []).append(
                    {"role": field, "id": value})
    return events
201 | 
def id_normalize(id_raw, language):
    """Return ``id_raw`` unchanged; ``language`` is accepted but not used."""
    return id_raw
204 | 
205 | 
def parse_offset_str(offset_str):
    """Split ``"<doc_id>:<start>-<end>"`` into ``(doc_id, start, end)``.

    The last ``:`` separates the document id from the span and the last ``-``
    separates start from end; both numbers are returned as ``int``.
    """
    colon_at = offset_str.rfind(':')
    dash_at = offset_str.rfind('-')
    first = int(offset_str[colon_at + 1:dash_at])
    last = int(offset_str[dash_at + 1:])
    return offset_str[:colon_at], first, last
211 | 
212 | 
def load_mention(tabs, info_dict, validate_offset, ltf_dir):
    """Append one mention taken from a split CS line to ``info_dict['mention']``.

    ``tabs`` is read as ``[id, field, quoted_text, offset, confidence]``. The
    stored entry is ``[mention_type, text, doc_id, start, end]`` where
    ``mention_type`` is the field name without ``.actual`` and ``text`` is the
    third field without its first and last character.

    ``validate_offset`` and ``ltf_dir`` are accepted but not used.
    """
    span = tabs[3]
    kind = tabs[1].replace(".actual", "")
    # The confidence is parsed (so a malformed value still raises) but it is
    # not stored.
    _confidence = float(tabs[4])
    surface = tabs[2][1:-1]
    doc_id, start, end = parse_offset_str(span)
    info_dict.setdefault('mention', list()).append([kind, surface, doc_id, start, end])
226 | 
227 | 
def load_canonical_mention(tabs, info_dict, validate_offset, ltf_dir):
    """Store the canonical mention of a split CS line in ``info_dict``.

    ``tabs`` is read as ``[id, field, quoted_text, offset, confidence]``.
    ``info_dict["canonical_mention"]`` is overwritten with
    ``[mention_type, text, doc_id, start, end]`` where ``mention_type`` is the
    field name without ``.actual`` and ``text`` is the third field without its
    first and last character.

    ``validate_offset`` and ``ltf_dir`` are accepted but not used.
    """
    span = tabs[3]
    kind = tabs[1].replace(".actual", "")
    # The confidence is parsed (so a malformed value still raises) but it is
    # not stored.
    _confidence = float(tabs[4])
    surface = tabs[2][1:-1]
    doc_id, start, end = parse_offset_str(span)
    info_dict["canonical_mention"] = [kind, surface, doc_id, start, end]
241 | 
242 | 
def get_events(filename, language='en'):
    """Load event information and arguments from a cold-start file.

    Only lines starting with ``::Event`` are considered. Per event id:

    * ``type`` lines store the part of the value after the last ``#`` in
      ``evt_info[evt_id]['type']``.
    * Fields containing ``canonical_mention`` go through
      ``load_canonical_mention``; other fields containing ``mention`` go
      through ``load_mention``.
    * Lines with more than three fields whose value contains ``Entity`` or
      ``Filler`` are recorded as arguments in
      ``evt_args[evt_id][role][arg_id]`` as
      ``(doc_id, start, end, confidence)`` tuples.
    * Every other line is ignored.

    Args:
        filename: path of the event cold-start file.
        language: passed to ``id_normalize``.

    Returns:
        ``(evt_info, evt_args)``.
    """
    logging.info("***** Loading events *****")

    evt_info = defaultdict(lambda : defaultdict())
    evt_args = defaultdict(lambda : defaultdict(lambda: defaultdict(list)))
    # Context manager so the input handle is always closed.
    with open(filename) as cs_file:
        for line in cs_file:
            line = line.rstrip('\n')
            if not line.startswith('::Event'):
                continue
            tabs = line.split('\t')
            evt_id = id_normalize(tabs[0], language)
            if tabs[1] == 'type':
                evt_info[evt_id]['type'] = tabs[2].split('#')[-1]
            elif 'canonical_mention' in tabs[1]:
                load_canonical_mention(tabs, evt_info[evt_id], False, None)
            elif 'mention' in tabs[1]:
                load_mention(tabs, evt_info[evt_id], False, None)
            elif len(tabs) > 3 and ('Entity' in tabs[2] or 'Filler' in tabs[2]):
                role = tabs[1].split('#')[-1].replace(".actual", "") # no other label than ".actual" for now
                arg_id = id_normalize(tabs[2], language)
                doc_id, start, end = parse_offset_str(tabs[3])
                arg_confidence = float(tabs[4])
                evt_args[evt_id][role][arg_id].append( (doc_id, start, end, arg_confidence) )
            # Anything else is deliberately not loaded.
    return evt_info, evt_args
284 | 
285 | 
def main(input_args: Optional[List[str]] = None):
    """Filter conflicting temporal relations and optionally augment them.

    Steps:
        1. Load relations from ``--input_cs`` (plus ``--input_es_cs`` unless it
           is the literal string ``"None"``).
        2. Drop conflicting relations and write the survivors to
           ``--filtered_output_cs`` with confidence capped at 1.0.
        3. With ``--do_augmentation``, add implied relations and write the
           result to ``--augmented_output_cs`` (confidence written as is).

    Args:
        input_args: argument list to parse; ``None`` falls back to
            ``sys.argv[1:]`` (argparse default).

    Returns:
        The text written last: the augmented file content when augmentation
        is enabled, otherwise the filtered file content.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_cs", type=str, default="/shared/nas/data/m1/wen17/tmp/20201011_kairos_backpack_roadside_temporal_relation/temporal_relation.cs",
                        help="The input file for temporal relations.")
    parser.add_argument("--input_es_cs", type=str, default="None",
                        help="The input file for Spanish temporal relations.")
    parser.add_argument("--filtered_output_cs", type=str, default="./outputs/filtered_temporal_relation.cs",
                        help="The output file for filtered temporal relations.")
    parser.add_argument("--do_augmentation", action="store_true",
                        help="Do relation augmentation (find the closure).")
    parser.add_argument("--augmented_output_cs", type=str, default="/shared/nas/data/m1/wen17/tmp/20201011_kairos_backpack_roadside_temporal_relation/augmented_temporal_relation.cs",
                        help="The output file for augmented temporal relations.")
    parser.add_argument("--event_cs", type=str, default="/shared/nas/data/m1/tuanml2/kairos/output/backpack_roadside_ied/event.cs",
                        help="The input file for cold start format events.")
    # Honour the caller-supplied argument list; it used to be ignored.
    args = parser.parse_args(input_args)

    temporal_rels = load_temporal_cs(input_cs=args.input_cs)

    if args.input_es_cs != "None":
        logging.info("Loading Spanish")
        es_temporal_rels = load_temporal_cs(input_cs=args.input_es_cs)
        # TODO: Try more confidence adjustment for English and Spanish
        temporal_rels = temporal_rels + es_temporal_rels

    # temporal_rels_cleanup
    logging.info(f"Before filter: {len(temporal_rels)}")
    temporal_rels = remove_conflict_temporal_relations(temporal_rels=temporal_rels)
    logging.info(f"After filter: {len(temporal_rels)}")
    filtered_lines = [
        "\t".join([event_i, "TEMPORAL_BEFORE", event_j, str(min(confidence, 1.0))]) + "\n"
        for (event_i, event_j), confidence in temporal_rels
    ]
    with open(args.filtered_output_cs, "w") as f:
        f.writelines(filtered_lines)
    results = "".join(filtered_lines)

    if args.do_augmentation:
        temporal_rels = augment_temporal_relations(temporal_rels=temporal_rels)
        logging.info(f"After Augment: {len(temporal_rels)}")
        augmented_lines = [
            "\t".join([event_i, "TEMPORAL_BEFORE", event_j, str(confidence)]) + "\n"
            for (event_i, event_j), confidence in temporal_rels
        ]
        with open(args.augmented_output_cs, "w") as f:
            f.writelines(augmented_lines)
        results = "".join(augmented_lines)
    return results
335 | 
def post_processing_main(input_args: Optional[List[str]] = None):
    """Alternate entry point: forward ``input_args`` to ``main`` and return its result."""
    return main(input_args)

# Run the filter when this file is executed as a script.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/event_graph_construction/time_expression.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import xml.etree.ElementTree as ET
  3 | import argparse
  4 | 
  5 | def location_sentence_id(ltf_dict_with_file_id, start_offset, end_offset):
  6 |     for one_entry in ltf_dict_with_file_id:
  7 |         sentence_id = one_entry[0]
  8 |         start_offset_in_reference = one_entry[1]
  9 |         end_offset_in_reference = one_entry[2]
 10 |         if start_offset >= start_offset_in_reference and end_offset-1 <= end_offset_in_reference:
 11 |             return sentence_id
 12 |     return 0
 13 | 
 14 | 
 15 | 
 16 | def time_ex_locator_dict_generator(ltf_dict, time_ex_path):
 17 |     time_expression_dict = dict()
 18 |     black_list = list()
 19 |     for one_line in open(time_ex_path, 'r', encoding = 'utf-8'):
 20 |         one_line = one_line.strip()
 21 |         one_line_list = one_line.split('\t')
 22 |         if len(one_line_list) == 3:
 23 |             if one_line_list[2] != 'TME':
 24 |                 black_list.append(one_line_list[0])
 25 |             continue
 26 |         if one_line_list[0] in black_list:
 27 |             continue
 28 |         if '_mention' in one_line:
 29 |             continue
 30 |         # print(one_line)
 31 |         search_key = one_line_list[3]
 32 |         filler_id = one_line_list[0]
 33 |         file_id = search_key.split(':')[0]
 34 |         if file_id not in time_expression_dict:
 35 |             time_expression_dict[file_id] = dict()
 36 |         start_offset = int(search_key.split(':')[1].split('-')[0])
 37 |         end_offset = int(search_key.split(':')[1].split('-')[1])
 38 |         sentence_id = location_sentence_id(ltf_dict[file_id], start_offset, end_offset)
 39 |         if sentence_id == 0:
 40 |             continue
 41 |         if sentence_id not in time_expression_dict[file_id]:
 42 |             time_expression_dict[file_id][sentence_id] = list()
 43 |         time_expression_dict[file_id][sentence_id].append((filler_id, search_key))
 44 |     return time_expression_dict
 45 | 
 46 | 
 47 | parser = argparse.ArgumentParser()
 48 | parser.add_argument('ltf_source_path', type=str, help='ltf_dir')
 49 | parser.add_argument('filler_path', type=str, help='filler_path')
 50 | parser.add_argument('event_input_path', type=str, help='event_input_path')
 51 | parser.add_argument('event_output_path', type=str, help='event_output_path')
 52 | 
 53 | args = parser.parse_args()
 54 | 
 55 | ltf_source_path = args.ltf_source_path
 56 | event_input_path = args.event_input_path
 57 | event_output_path = args.event_output_path
 58 | time_ex_path = args.filler_path
 59 | ltf_dict = dict()
 60 | for one_file in os.listdir(ltf_source_path):
 61 |     if one_file.startswith('.'):
 62 |         continue
 63 |     if not one_file.endswith('.ltf.xml'):
 64 |         continue
 65 |     one_ltf_path = os.path.join(ltf_source_path, one_file)
 66 |     one_file_id = one_file.replace('.ltf.xml', '')
 67 |     if one_file_id not in ltf_dict:
 68 |         ltf_dict[one_file_id] = list()
 69 |     # try:
 70 |     # print(one_ltf_path)
 71 |     one_root = ET.parse(one_ltf_path).getroot()
 72 |     for one_seg in one_root[0][0].findall('SEG'):
 73 |         segment_id = one_seg.attrib['id']
 74 |         start_char = int(one_seg.attrib['start_char'])
 75 |         end_char = int(one_seg.attrib['end_char'])
 76 |         ltf_dict[one_file_id].append((segment_id, start_char, end_char))
 77 |     # except:
 78 |     #     print(one_ltf_path)
 79 | 
 80 | time_ex_locator_dict = time_ex_locator_dict_generator(ltf_dict, time_ex_path)
 81 | 
 82 | event_type_dict = dict()
 83 | to_write_list = list()
 84 | for one_line in open(event_input_path, 'r', encoding = 'utf-8'):
 85 |     one_line = one_line.strip()
 86 |     one_line_list = one_line.split('\t')
 87 |     if len(one_line_list) == 3:
 88 |         event_type_dict[one_line_list[0]] = one_line_list[-1]
 89 |         to_write_list.append(one_line)
 90 |         continue
 91 |     to_write_list.append(one_line)
 92 |     if 'canonical_mention' in one_line_list[1]:
 93 |         event_mention_id = one_line_list[0]
 94 |         search_key = one_line_list[3]
 95 |         file_id = search_key.split(':')[0]
 96 |         start_offset = int(search_key.split(':')[1].split('-')[0])
 97 |         end_offset = int(search_key.split(':')[1].split('-')[1])
 98 |         sentence_id = location_sentence_id(ltf_dict[file_id], start_offset, end_offset)
 99 |         if file_id not in time_ex_locator_dict:
100 |             continue
101 |         if sentence_id not in time_ex_locator_dict[file_id]:
102 |             continue
103 |         offset_difference = 0
104 |         temp_string = ''
105 |         for one_entry in time_ex_locator_dict[file_id][sentence_id]:
106 |             current_argument_type = '%s_Time.actual' % event_type_dict[event_mention_id]
107 |             argument_start_offset = int(one_entry[1].split(':')[1].split('-')[0])
108 |             if offset_difference == 0:
109 |                 offset_difference = abs(argument_start_offset-start_offset)
110 |             elif offset_difference < abs(argument_start_offset-start_offset):
111 |                 offset_difference = abs(argument_start_offset-start_offset)
112 |             else:
113 |                 continue
114 |             temp_string = "%s\t%s\t%s\t%s\t1.0" % (event_mention_id,
115 |                                                    current_argument_type,
116 |                                                    one_entry[0],
117 |                                                    one_entry[1])
118 |         if temp_string == '':
119 |             continue
120 |         to_write_list.append(temp_string)
121 | 
122 | f_w = open(event_output_path, 'w', encoding = 'utf-8')
123 | f_w.write('\n'.join(to_write_list))
124 | f_w.close()
125 | 


--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/limanling/event-graph-summarization/172d58c5b9463ae2fe60aab5fbe782bc1b2f1fe6/overview.png


--------------------------------------------------------------------------------