├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── component
    ├── BETTER
    │   ├── README.md
    │   └── joint
    │   │   ├── .gitignore
    │   │   ├── CRF_util.py
    │   │   ├── EventPipeline.py
    │   │   ├── JsonBuilder.py
    │   │   ├── README.md
    │   │   ├── all_liz
    │   │       ├── BETTER_pos2idx.pickle
    │   │       └── pos_emb.npy
    │   │   ├── dataset.py
    │   │   ├── eval.py
    │   │   ├── event_pipeline_demo.py
    │   │   ├── generate_data
    │   │       ├── all_uw.comb.pkl
    │   │       ├── contextualized_features.py
    │   │       ├── contextualized_features_bert.py
    │   │       ├── json_to_pkl_newformat.py
    │   │       ├── util.py
    │   │       └── uw_json_to_pkl_ace.py
    │   │   ├── main.py
    │   │   ├── main_biaffine.py
    │   │   ├── neural_model.py
    │   │   ├── out_pkl_to_json_eval.py
    │   │   ├── requirements.txt
    │   │   ├── saved_args.json
    │   │   ├── score.py
    │   │   ├── split_event.py
    │   │   ├── train.py
    │   │   ├── train_biaffine.py
    │   │   ├── train_ssvm.py
    │   │   └── util.py
    ├── Duration
    │   ├── .gitignore
    │   ├── Mu_test_data
    │   │   ├── dev_ace.pred.json
    │   │   ├── dev_tbd.pred.json
    │   │   ├── test_ace.pred.json
    │   │   ├── test_tbd.pred.json
    │   │   └── train_tbd.pred.json
    │   ├── README.md
    │   ├── UDS_T_data
    │   │   ├── first10.tsv
    │   │   ├── first10_preprocessed.jsonl
    │   │   └── time_eng_ud_v1.2_2015_10_30.tsv
    │   ├── inference_api.py
    │   ├── input_data
    │   │   ├── sample_document.txt
    │   │   └── sample_fig2.txt
    │   ├── input_data_conllu
    │   │   ├── sample_document.txt.output
    │   │   └── sample_fig2.txt.output
    │   ├── main.py
    │   ├── main_new.py
    │   ├── model_ckpt
    │   │   └── model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth
    │   ├── predictions
    │   │   ├── .~lock.sample_document.txt.output_timeline.csv#
    │   │   ├── README_predictions.txt
    │   │   ├── sample_document.txt.output_predictions.csv
    │   │   └── sample_document.txt.output_timeline.csv
    │   ├── predictions_new
    │   │   ├── sample_document.txt.output_predictions.csv
    │   │   ├── sample_document.txt.output_timeline.csv
    │   │   ├── sample_fig2.txt.output_predictions.csv
    │   │   └── sample_fig2.txt.output_timeline.csv
    │   ├── preprocess.py
    │   ├── preprocess_udst.py
    │   ├── readme_eval.txt
    │   ├── requirements.txt
    │   ├── run_jupyter.sh
    │   ├── scripts
    │   │   ├── __init__.py
    │   │   ├── debugging-2.ipynb
    │   │   ├── debugging.ipynb
    │   │   ├── elmo_files
    │   │   │   └── elmo_2x4096_512_2048cnn_2xhighway_options.json
    │   │   ├── run_document_timeline.bash
    │   │   ├── run_input_data.sh
    │   │   ├── run_model.py
    │   │   ├── src
    │   │   │   └── factslab
    │   │   │   │   └── factslab
    │   │   │   │       ├── __init__.py
    │   │   │   │       └── pytorch
    │   │   │   │           ├── __init__.py
    │   │   │   │           ├── childsumtreelstm.py
    │   │   │   │           ├── mlpregression.py
    │   │   │   │           ├── rnnregression.py
    │   │   │   │           ├── roberta_extract.py
    │   │   │   │           ├── simplemlpregression.py
    │   │   │   │           ├── temporalmodule.py
    │   │   │   │           └── transformer_regression.py
    │   │   ├── timelinemodule.py
    │   │   └── utils.py
    │   └── utils_duration.py
    ├── NegationDetection
    │   ├── .gitignore
    │   ├── README.md
    │   └── train.py
    ├── REST_service
    │   └── main.py
    ├── TempRel
    │   ├── .gitignore
    │   ├── code
    │   │   ├── joint_model.py
    │   │   └── run.sh
    │   ├── conda_env.txt
    │   └── other
    │   │   └── pos_tags.txt
    └── component_envs
    │   ├── env_temprel.yml
    │   ├── req_better.txt
    │   └── req_biomed.txt
├── env.yml
├── env_coref.yml
└── project
    ├── APIs
        ├── coref.py
        ├── coref_pre.py
        ├── main.py
        ├── test_on_ace_data.py
        └── test_on_raw_text.py
    ├── manage.py
    ├── project
        ├── __init__.py
        ├── settings.py
        ├── urls.py
        ├── views.py
        └── wsgi.py
    ├── statics
        ├── css
        │   ├── index.css
        │   └── loading.gif
        └── js
        │   ├── Tracking.js
        │   ├── annotation.js
        │   ├── index.js
        │   ├── main.js
        │   ├── security.js
        │   └── temporal.js
    └── templates
        └── index.html


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Project specific
  2 | component/EventDemo
  3 | project/static
  4 | project/tmp
  5 | project/bert*
  6 | project/xlnet*
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # IDE
 17 | .idea
 18 | 
 19 | # Distribution / packaging
 20 | .Python
 21 | build/
 22 | develop-eggs/
 23 | dist/
 24 | downloads/
 25 | eggs/
 26 | .eggs/
 27 | lib/
 28 | lib64/
 29 | parts/
 30 | sdist/
 31 | var/
 32 | wheels/
 33 | pip-wheel-metadata/
 34 | share/python-wheels/
 35 | *.egg-info/
 36 | .installed.cfg
 37 | *.egg
 38 | MANIFEST
 39 | 
 40 | # PyInstaller
 41 | #  Usually these files are written by a python script from a template
 42 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 43 | *.manifest
 44 | *.spec
 45 | 
 46 | # Installer logs
 47 | pip-log.txt
 48 | pip-delete-this-directory.txt
 49 | 
 50 | # Unit test / coverage reports
 51 | htmlcov/
 52 | .tox/
 53 | .nox/
 54 | .coverage
 55 | .coverage.*
 56 | .cache
 57 | nosetests.xml
 58 | coverage.xml
 59 | *.cover
 60 | *.py,cover
 61 | .hypothesis/
 62 | .pytest_cache/
 63 | 
 64 | # Translations
 65 | *.mo
 66 | *.pot
 67 | 
 68 | # Django stuff:
 69 | *.log
 70 | local_settings.py
 71 | db.sqlite3
 72 | db.sqlite3-journal
 73 | 
 74 | # Flask stuff:
 75 | instance/
 76 | .webassets-cache
 77 | 
 78 | # Scrapy stuff:
 79 | .scrapy
 80 | 
 81 | # Sphinx documentation
 82 | docs/_build/
 83 | 
 84 | # PyBuilder
 85 | target/
 86 | 
 87 | # Jupyter Notebook
 88 | .ipynb_checkpoints
 89 | 
 90 | # IPython
 91 | profile_default/
 92 | ipython_config.py
 93 | 
 94 | # pyenv
 95 | .python-version
 96 | 
 97 | # pipenv
 98 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 99 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | #   install all needed dependencies.
102 | #Pipfile.lock
103 | 
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 | 
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 | 
111 | # SageMath parsed files
112 | *.sage.py
113 | 
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 | 
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 | 
127 | # Rope project settings
128 | .ropeproject
129 | 
130 | # mkdocs documentation
131 | /site
132 | 
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 | 
138 | # Pyre type checker
139 | .pyre/
140 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "component/BioMedEventEx"]
2 | 	path = component/BioMedEventEx
3 | 	url = https://github.com/PlusLabNLP/GEANet-BioMed-Event-Extraction.git
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # [NAACL'21] EventPlus: A Temporal Event Understanding Pipeline
  2 | 
  3 | This is the codebase for the system demo EventPlus: A Temporal Event Understanding Pipeline in NAACL 2021.
  4 | 
  5 | Please refer to our paper for details. [[PDF]](https://www.aclweb.org/anthology/2021.naacl-demos.7.pdf) [[Talk]](https://youtu.be/KPXpKeVIuag) [[Demo]](https://kairos-event.isi.edu/)
  6 | 
  7 | ## Quick Start
  8 | 
  9 | 0 - Clone the codebase with all submodules
 10 | 
 11 | ```
 12 | git clone --recurse-submodules https://github.com/PlusLabNLP/EventPlus.git
 13 | # or use following commands
 14 | git clone https://github.com/PlusLabNLP/EventPlus.git
 15 | git submodule init
 16 | git submodule update
 17 | ```
 18 | 
 19 | 1 - Environment Installation
 20 | 
 21 | Change prefix (last line) of `env.yml` to fit your path, then run
 22 | 
 23 | ```
 24 | conda env create -f env.yml
 25 | conda activate event-pipeline
 26 | pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz
 27 | python -m spacy download en_core_web_sm
 28 | pip install git+https://github.com/hltcoe/PredPatt.git
 29 | ```
 30 | 
 31 | 2 - Download trained model for components
 32 | 
 33 | For `component/BETTER` module, download the trained model [[Link]](https://drive.google.com/file/d/19_W6azeG5KRQxLDICswqwIFX0QOjxh_L/view?usp=sharing), unzip and place it under `component/BETTER/joint/worked_model_ace`.
 34 | 
 35 | For `component/TempRel` module, download the trained model [[Link]](https://drive.google.com/file/d/1vyeAqtDmBp98NCuEMCFvrnJ8oBuNuMr3/view?usp=sharing), unzip and place it under `component/TempRel/models`.
 36 | 
 37 | For `component/Duration` module, download `scripts` zip file [[Link]](https://drive.google.com/file/d/1s1uLcQjjFdfcto3BZ3aRi8pPzLf9KELe/view?usp=sharing), unzip and place it under `component/Duration/scripts`.
 38 | 
 39 | For `component/NegationDetection` module, download the trained model [[Link]](https://drive.google.com/file/d/1FLAHrWy3eF23Kb7Ql4k_f1a5lCQ5m1L0/view?usp=sharing), unzip and place it under `component/NegationDetection/models`.
 40 | 
 41 | 3 - In background: Run REST API for event duration detection module for faster processing
 42 | ```
 43 | (optional) tmux new -s duration_rest_api
 44 | conda activate event-pipeline
 45 | cd component/REST_service
 46 | python main.py
 47 | (optional) exit tmux window
 48 | ```
 49 | 
 50 | 4 - Application 1: Raw Text Annotation. The input is a multi-line raw text file, and the output pickle and JSON files will be saved to the designated paths.
 51 | ```
 52 | cd YOUR_PREFERRED_PATH/project
 53 | python APIs/test_on_raw_text.py -data YOUR_RAW_TEXT_FILE -save_path SAVE_PICKLE_PATH -save_path_json SAVE_JSON_PATH -negation_detection
 54 | ```
 55 | 
 56 | 5 - Application 2: Web App for Interaction and Visualization. A web app will be started where users can input a piece of text and get the annotation results and a visualization.
 57 | ```
 58 | cd YOUR_PREFERRED_PATH/project
 59 | tmux new -s serve
 60 | python manage.py runserver 8080
 61 | ```
 62 | 
 63 | ## Components
 64 | 
 65 | The code for data processing and incorporating different components is in `project/APIs/main.py`. Please refer to README file of each component for more details about training and inference.
 66 | 
 67 | 1- Event Extraction on ACE Ontology: `component/BETTER`
 68 |  
 69 | 2- Joint Event Trigger and Temporal Relation Extraction: `component/TempRel` for inference, [this codebase](https://github.com/rujunhan/EMNLP-2019) for training
 70 | 
 71 | 3- Event Duration Detection: `component/Duration`
 72 | 
 73 | 4- Negation and Speculation Cue Detection and Scope Resolution: `component/NegationDetection`
 74 | 
 75 | 5- Biomedical Event Extraction: `component/BioMedEventEx` for inference, [this codebase](https://github.com/PlusLabNLP/GEANet-BioMed-Event-Extraction) for training
 76 | 
 77 | ## Quick Start with ISI shared NAS
 78 | 
 79 | If you are using the system on a machine with access to ISI shared NAS, you could directly activate environment and copy the code and start using it right away!
 80 | 
 81 | ```
 82 | # 1 - Environment Installation: Activate existing environment
 83 | conda activate /nas/home/mingyuma/miniconda3/envs/event-pipeline-dev
 84 | 
 85 | # 2 - Prepare Components (Submodules): Copy the whole codebase
 86 | cp -R /nas/home/mingyuma/event-pipeline/event-pipeline-dev YOUR_PREFERRED_PATH
 87 | 
 88 | # 3 - In background: Run REST API for event duration detection module for faster processing
 89 | (optional) tmux new -s duration_rest_api
 90 | conda activate /nas/home/mingyuma/miniconda3/envs/event-pipeline-dev
 91 | cd component/REST_service
 92 | python main.py
 93 | (optional) exit tmux window
 94 | 
 95 | # To use it for raw text annotation or web app, please follow step 4 and 5 in quick start section.
 96 | ```
 97 | 
 98 | ## Deployment as Web Service
 99 | 
100 | Here are instructions for deploying the web application on a server.
101 | 
102 | ### Set up web server
103 | 
104 | ```
105 | pip install uwsgi
106 | ```
107 | 
108 | If you encounter errors like `error while loading shared libraries libssl.so.1.1`, refer to [this link](https://www.bswen.com/2018/11/others-Openssl-version-cause-error-when-loading-shared-libraries-libssl.so.1.1.html) and run the following:
109 | 
110 | ```
111 | export LD_LIBRARY_PATH=/nas/home/mingyuma/miniconda3/envs/event-pipeline/lib:$LD_LIBRARY_PATH
112 | ```
113 | 
114 | ### Server port setting
115 | 
116 | External port: 443 (for HTTPS)
117 | 
118 | Django will forward traffic from 443 port to internal 8080 port
119 | 
120 | Internal port
121 | * 8080: run Django main process
122 | * 17000: run service for duration (if we run a REST API for duration module, but now the newer version doesn't need such a separate service)
123 | 
124 | ## Citation
125 | 
126 | ```
127 | @inproceedings{ma-etal-2021-eventplus,
128 |     title = "{E}vent{P}lus: A Temporal Event Understanding Pipeline",
129 |     author = "Ma, Mingyu Derek  and
130 |       Sun, Jiao  and
131 |       Yang, Mu  and
132 |       Huang, Kung-Hsiang  and
133 |       Wen, Nuan  and
134 |       Singh, Shikhar  and
135 |       Han, Rujun  and
136 |       Peng, Nanyun",
137 |     booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations",
138 |     month = jun,
139 |     year = "2021",
140 |     address = "Online",
141 |     publisher = "Association for Computational Linguistics",
142 |     url = "https://www.aclweb.org/anthology/2021.naacl-demos.7",
143 |     pages = "56--65",
144 |     abstract = "We present EventPlus, a temporal event understanding pipeline that integrates various state-of-the-art event understanding components including event trigger and type detection, event argument detection, event duration and temporal relation extraction. Event information, especially event temporal knowledge, is a type of common sense knowledge that helps people understand how stories evolve and provides predictive hints for future events. EventPlus as the first comprehensive temporal event understanding pipeline provides a convenient tool for users to quickly obtain annotations about events and their temporal information for any user-provided document. Furthermore, we show EventPlus can be easily adapted to other domains (e.g., biomedical domain). We make EventPlus publicly available to facilitate event-related information extraction and downstream applications.",
145 | }
146 | ```


--------------------------------------------------------------------------------
/component/BETTER/README.md:
--------------------------------------------------------------------------------
 1 | # BETTER_project
 2 | 
 3 | ## Prerequisites
 4 | 
 5 | - Install [Git LFS](https://github.com/git-lfs/git-lfs/wiki/Installation).
 6 | - Clone the repository. If your repository is already cloned, pull files
 7 |   with `git-lfs pull`.
 8 | - When adding big binary, JSON, etc files track them using Git LFS: `git
 9 | lfs track <filename>` or `git lfs track "<folder name>/**"` to track
10 | everything under that folder.
11 | 
12 | 
13 | ## Run code
14 | 
15 | **See the README under `joint` folder.** 
16 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/.gitignore:
--------------------------------------------------------------------------------
 1 | glove/*.txt
 2 | logs/*
 3 | run_jobs/*
 4 | results_biaffine*.pkl
 5 | *.log
 6 | slurm*
 7 | tmp/*
 8 | exp_argus/*
 9 | *.sh
10 | __pycache__
11 | worked_model_ace


--------------------------------------------------------------------------------
/component/BETTER/joint/CRF_util.py:
--------------------------------------------------------------------------------
  1 | # Assert the torchcrf version is 0.7.2
  2 | # allennlp version is 0.9.1
  3 | import torch
  4 | import heapq
  5 | import numpy as np
  6 | import time
  7 | from torch.nn.utils.rnn import pad_sequence
  8 | from allennlp.nn.util import viterbi_decode
  9 | 
 10 | def calculate_prob_byObser(crf_obj, emissions, observation, mask):
 11 |     '''
 12 |     Given padded sequence of crf_path, calculate corresponding score for path
 13 |     Args:
 14 |         crf_obj : torchcrf object
 15 |         emissions (`~torch.Tensor`): Emission score tensor of size
 16 |             ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
 17 |             ``(batch_size, seq_length, num_tags)`` otherwise.
 18 |         observation (`~torch.Tensor`): ``size (seq_length, batch_size)`` if ``batch_first is ``False``,
 19 |             ``(batch_size, seq_length)`` otherwise.
 20 |         mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
 21 |             if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
 22 |     Returns:
 23 |         torch.FloatTensor in size (batch) # log prob.
 24 |     '''
 25 |     if mask is None:
 26 |         mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)
 27 | 
 28 |     if crf_obj.batch_first:
 29 |         emissions = emissions.transpose(0, 1)
 30 |         mask = mask.transpose(0, 1)
 31 |         obser = observation.transpose(0, 1)
 32 |     numerator = crf_obj._compute_score(emissions, obser, mask)
 33 |     denominator = crf_obj._compute_normalizer(emissions, mask)
 34 |     return numerator - denominator
 35 | 
 36 | def pad_seq(best_path, seq_length, batch_first=True, padding_value=0):
 37 |     assert batch_first
 38 |     batch = []
 39 |     for path in best_path:
 40 |         ori_len = len(path)
 41 |         pads = [padding_value]*(seq_length-ori_len)
 42 |         batch.append(path+pads)
 43 |     return torch.LongTensor(batch)
 44 | 
def kViterbi(crf_obj, emissions, topK, mask):
    """
    Find the k-best tag sequences using a modified Viterbi algorithm.

    Args:
        crf_obj : torchcrf object
        emissions (`~torch.Tensor`): Emission score tensor of size
            ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
            ``(batch_size, seq_length, num_tags)`` otherwise.
        topK (int): How many paths to return per sequence (must be >= 1).
        mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
            if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
    Returns:
        A pair ``(paths, probs)``.
        * ``topK == 1``: ``paths`` is the result of ``crf_obj.decode`` and ``probs``
          is the squeezed output of ``calculate_prob_byObser`` for those paths.
        * ``topK > 1``: ``paths[b]`` is a list of ``topK`` tag lists for batch item
          ``b`` (each as long as that item's unmasked length) and ``probs`` is the
          value returned by ``recalculate_probs`` for those paths.
    """
    assert topK >=1
    if topK == 1:
        # Shortcut: defer to the CRF's own 1-best decoder.
        # size(1) is the sequence length only in the batch-first layout;
        # pad_seq asserts batch_first, so this branch requires a batch-first CRF.
        seq_length = emissions.size(1)
        best_path = crf_obj.decode(emissions, mask)
        observation = pad_seq(best_path, seq_length, crf_obj.batch_first, 0)
        best_probs = calculate_prob_byObser(crf_obj, emissions, observation, mask)
        return best_path, best_probs.squeeze()

    crf_obj._validate(emissions, mask=mask)
    if mask is None:
        mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)
    if crf_obj.batch_first:
        # From here on everything is time-major.
        emissions = emissions.transpose(0, 1)
        mask = mask.transpose(0, 1)
    normalizer = crf_obj._compute_normalizer(emissions, mask)
    # ===============start main part========================
    # emissions: (seq_length, batch_size, num_tags)
    # mask: (seq_length, batch_size)
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == crf_obj.num_tags
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # Start transition and first emission
    # score has size (seq_length, batch_size, num_tags, topK): for time step t
    # and batch item b, the value at tag i and slot j stores the score of the
    # j-th best tag sequence so far that ends with tag i.
    #
    # pre_states[t, b, i, j] saves the previous tag on the j-th best path that
    # ends with tag i at step t.
    #
    # Only slot 0 is filled at t=0; slots 1..topK-1 keep the value 0 from new_zeros.
    # NOTE(review): those zero-valued slots still enter the candidate heap at
    # t=1 — confirm whether they were meant to be -inf so they cannot be picked.
    score = emissions.new_zeros((seq_length, batch_size, crf_obj.num_tags, topK))
    score[0,:,:,0] = crf_obj.start_transitions + emissions[0] # batch x num_tags

    pre_states = np.zeros((seq_length, batch_size, crf_obj.num_tags, topK), int)
    for i in range(crf_obj.num_tags):
        for b in range(batch_size):
            for k in range(topK):
                pre_states[0,b,i,k] = i # should be start transition

    # rank[t, b, i, j]: which slot of the predecessor state (at step t-1) the
    # j-th best path into tag i came from; needed when several of the top-k
    # paths pass through the same predecessor state.
    rank = np.zeros((seq_length, batch_size, crf_obj.num_tags, topK), int)
    for t in range(1, seq_length):
        next_score_list = []
        for k in range(topK):
            broadcast_score = score[t-1,:,:,k].unsqueeze(2) #(batch_size, num_tags, 1)
            broadcast_emissions = emissions[t].unsqueeze(1) #(batch_size, 1, num_tags)

            # Compute the score tensor of size (batch_size, num_tags, num_tags)
            # where for each sample, entry at row i and column j is the score of
            # the k-th best path ending in tag i, extended by the transition
            # from tag i to tag j and the emission of tag j at step t.
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + crf_obj.transitions + broadcast_emissions
            next_score_list.append(next_score)

        for b in range(batch_size):
            if mask[t,b]:
                for cur_state in range(crf_obj.num_tags):
                    # Min-heap over negated scores == max-heap over scores,
                    # covering every (predecessor tag, slot) candidate.
                    h = []
                    for pre_state in range(crf_obj.num_tags):
                        for k in range(topK):
                            heapq.heappush(h, (-1*next_score_list[k][b, pre_state, cur_state], pre_state))

                    # Get the sorted list
                    h_sorted = [heapq.heappop(h) for _ in range(topK)] #get topK path into cur_state
                    # We need to keep a ranking if a path crosses a state more than once
                    rankDict = dict()
                    # Retain the topK scoring paths
                    for k in range(topK):
                        # score[t] is still zero here, so this stores the (un-negated) candidate score.
                        score[t, b, cur_state, k] = score[t, b, cur_state, k] + (h_sorted[k][0].data * -1)
                        pre_states[t, b, cur_state, k] = h_sorted[k][1]
                        state = h_sorted[k][1]
                        # First time a predecessor appears it gets rank 0, then 1, 2, ...
                        if state in rankDict:
                            rankDict[state] = rankDict[state]+1
                        else:
                            rankDict[state] = 0
                        rank[t, b, cur_state, k] = rankDict[state]
            else:
                # Masked position: carry the scores forward unchanged.
                # pre_states/rank are not updated here; the backtrack below
                # starts at seq_ends[b], so it never reads masked steps.
                for cur_state in range(crf_obj.num_tags):
                    for k in range(topK):
                        score[t, b, cur_state, k]=score[t-1, b, cur_state, k]


    batch_path = []
    # NOTE(review): batch_path_prob is filled below but never returned or read;
    # the returned probabilities come from recalculate_probs instead.
    batch_path_prob = []
    seq_ends = mask.long().sum(dim=0) - 1 # index of last unmasked step per batch item, e.g. seq_ends=8 when 9 steps are unmasked
    for b in range(batch_size):
        # Rank all (final tag, slot) pairs by score plus end transition.
        h = []
        for cur_state in range(crf_obj.num_tags):
            for k in range(topK):
                heapq.heappush(h, ( -1 * (score[seq_ends[b], b, cur_state, k]+crf_obj.end_transitions[cur_state]),
                                   cur_state, k))
        h_sorted = [heapq.heappop(h) for _ in range(topK)]
        k_list = np.zeros((topK, seq_ends[b]+1), int) # k x 9
        k_list_probs = list()
        for k in range(topK):
            prob = h_sorted[k][0]
            state = h_sorted[k][1]
            rankK = h_sorted[k][2]

            k_list_probs.append((prob*-1)-(normalizer[b]))
            k_list[k][seq_ends[b]] = state # assign index 8 == last one
            # Walk the back-pointers from the last step to the first.
            for t in range(seq_ends[b]-1, -1, -1): # t = 7,6,5,4,3,2,1,0
                nextState = k_list[k][t+1]
                preState = pre_states[t+1, b, nextState, rankK]
                k_list[k][t] = preState
                rankK = rank[t+1,b,nextState,rankK]
        batch_path.append(k_list.tolist())
        batch_path_prob.append(k_list_probs)
    # recalculate_probs expects tensors in the CRF's own layout, so undo the
    # time-major transpose for batch-first CRFs.
    if crf_obj.batch_first:
        batch_probs = recalculate_probs(crf_obj, batch_path, emissions.transpose(0,1), mask.transpose(0,1), topK)
    else:
        batch_probs = recalculate_probs(crf_obj, batch_path, emissions, mask, topK)
    return batch_path, batch_probs
174 | 
def recalculate_probs(crf_obj, batch_path, emissions, mask, topK):
    '''
    Re-score the k-best paths with the CRF, one rank at a time.

    batch_path: list (batch) of list (k) of tag-index lists.
    emissions and mask must use the same batch_first layout as crf_obj.

    Returns the per-rank results of calculate_prob_byObser stacked on dim 0
    and then transposed on dims (0, 1).
    '''
    batch_axis = 0 if crf_obj.batch_first else 1
    n_items = emissions.size(batch_axis)

    per_rank = []
    for rank_idx in range(topK):
        # Collect the rank_idx-th best path of every batch item.
        tag_seqs = [torch.LongTensor(batch_path[item][rank_idx])
                    for item in range(n_items)]
        padded = pad_sequence(tag_seqs,
                              batch_first=crf_obj.batch_first,
                              padding_value=0)
        per_rank.append(calculate_prob_byObser(crf_obj, emissions, padded, mask))

    stacked = torch.stack(per_rank, dim=0)
    return stacked.transpose(0, 1)
196 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/JsonBuilder.py:
--------------------------------------------------------------------------------
  1 | 
  2 | class JsonBuilder:
  3 |     def __init__(self, B2I_trigger, B2I_argument, B2I_ner):
  4 |         self.B2I_trigger = B2I_trigger
  5 |         self.B2I_argument = B2I_argument
  6 |         self.B2I_ner = B2I_ner
  7 | 
  8 |     def iob_to_obj(self, y, B2I):
  9 |         '''
 10 |         B2I : {'B-AGENT': 'I-AGENT', 'B-PATIENT': 'I-PATIENT'}
 11 |         '''
 12 |         obj = []
 13 |         in_obj = False
 14 |         curr_obj = []
 15 |         curr_I = None
 16 |         for i in range(len(y)):
 17 |             # end of obj
 18 |             if in_obj:
 19 |                 if y[i] != curr_I:
 20 |                     obj.append(curr_obj + [i-1])
 21 |                     curr_obj = []
 22 |                     curr_I = None
 23 |                     in_obj = False
 24 |                 else:
 25 |                     if i == len(y) - 1:
 26 |                         obj.append(curr_obj + [i])
 27 |             # beginning of obj
 28 |             if y[i] in B2I:
 29 |                 curr_obj = [y[i][2:], i]
 30 |                 curr_I = B2I[y[i]]
 31 |                 in_obj = True
 32 |                 if i == len(y) - 1:
 33 |                     obj.append(curr_obj + [i])
 34 |         return obj
 35 |     def from_preds(self, input_sent, y_preds_t, y_preds_e, y_preds_ner):
 36 |         assert len(y_preds_t) == len(y_preds_e)
 37 |         preds = []
 38 |         for y_pred_t, y_pred_e in zip(y_preds_t, y_preds_e):
 39 |             preds.append({
 40 |                 'trigger': y_pred_t,
 41 |                 'argument': y_pred_e
 42 |                 })
 43 |         ner = self.iob_to_obj(y_preds_ner[0], self.B2I_ner)
 44 |         ner = [[x[1], x[2], x[0]] for x in ner]  # convert the order for each ner obj
 45 |         out = []
 46 |         events_pred = self.convert_out_dicts_to_event_dicts(preds, input_sent)
 47 |         out.append({'tokens': input_sent,
 48 |                     'events': events_pred,
 49 |                     'ner': ner
 50 |             })
 51 |         return out
 52 | 
 53 | 
 54 |     def convert_out_dicts_to_event_dicts(self, sel_preds, input_sent):
 55 |         '''
 56 |         `sel_preds` contain sent-level prediction
 57 |         return a list of dicts, which will be used to create the BetterEvent objs
 58 |         `data_type`, currently support choose from ['local', 'ssvm']
 59 |         '''
 60 | 
 61 |         out_dicts = []
 62 |         cnt = 1
 63 |         for event in sel_preds:
 64 |             out_dict = {}
 65 |             # sent_id = event['sent_id']
 66 | 
 67 |             tri_seq = event['trigger']
 68 |             trigger_objs = self.iob_to_obj(tri_seq, self.B2I_trigger)
 69 |             if len(trigger_objs) == 0:
 70 |                 continue
 71 |             else:
 72 | 
 73 |                 event_type = trigger_objs[0][0]
 74 |                 out_dict['event_type'] = event_type
 75 |                 trigger_span_dicts = self.get_span_dicts_from_objs(trigger_objs, input_sent, task='trigger')
 76 |                 arg_objs = self.iob_to_obj(event['argument'], self.B2I_argument)
 77 |                 argu_span_dicts = self.get_span_dicts_from_objs(arg_objs, input_sent, task='argument')
 78 |                 out_dict['triggers'] = trigger_span_dicts
 79 | 
 80 |                 out_dict['arguments'] = argu_span_dicts
 81 | 
 82 |             cnt += 1
 83 |             out_dicts.append(out_dict)
 84 |         return out_dicts
 85 | 
 86 |     def get_span_dicts_from_objs(self, objs, input_sent, task='trigger'):
 87 |         span_dicts = []
 88 |         for obj in objs:
 89 |             role = obj[0]
 90 |             l_idx = obj[1]
 91 |             r_idx = obj[2]
 92 |             text = input_sent[l_idx] if r_idx == l_idx \
 93 |                     else ' '.join(input_sent[l_idx:r_idx+1])
 94 |             if task == 'trigger':
 95 |                 span_dict = {'event_type': role,
 96 |                              'text': text,
 97 |                              'start_token': l_idx,
 98 |                              'end_token': r_idx
 99 |                         }
100 |             elif task == 'argument':
101 |                 span_dict = {'role': role,
102 |                              'text': text,
103 |                              'start_token': l_idx,
104 |                              'end_token': r_idx
105 |                         }
106 |             span_dicts.append(span_dict)
107 |         return span_dicts
108 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/README.md:
--------------------------------------------------------------------------------
 1 | # Model for BETTER Project 
 2 | ## Event Extraction system API
 3 | 
 4 | ### Download pretrained models
  5 | The pretrained models are [here](https://drive.google.com/file/d/19_W6azeG5KRQxLDICswqwIFX0QOjxh_L/view?usp=sharing). Download and unzip them. There should then be a `worked_model_ace` folder under `joint`.
 6 | 
 7 | ### Run code
 8 | 
 9 | ```
10 | python event_pipeline_demo.py
11 | ```
12 | 
13 | For the core code that calls the event extraction system, see lines 127-137 in `event_pipeline_demo.py`. The expected output is:
14 | ```
15 | [{'tokens': ['Orders', 'went', 'out', 'today', 'to', 'deploy', '17,000', 'U.S.', 'Army', 'soldiers', 'in', 'the', 'Persian', 'Gulf', 'region', '.'], 'events': [{'event_type': 'Movement:Transport', 'triggers': [{'event_type': 'Movement:Transport', 'text': 'deploy', 'start_token': 5, 'end_token': 5}], 'arguments': [{'role': 'Artifact', 'text': 'soldiers', 'start_token': 9, 'end_token': 9}, {'role': 'Destination', 'text': 'region', 'start_token': 14, 'end_token': 14}]}], 'ner': [[7, 7, 'GPE'], [8, 8, 'ORG'], [9, 9, 'PER'], [12, 13, 'LOC'], [14, 14, 'LOC']]}]
16 | ```
17 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/all_liz/BETTER_pos2idx.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/all_liz/BETTER_pos2idx.pickle


--------------------------------------------------------------------------------
/component/BETTER/joint/all_liz/pos_emb.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/all_liz/pos_emb.npy


--------------------------------------------------------------------------------
/component/BETTER/joint/dataset.py:
--------------------------------------------------------------------------------
  1 | from torch.utils import data
  2 | from torch.nn.utils.rnn import pad_sequence
  3 | import torch
  4 | import pickle
  5 | from generate_data.contextualized_features_bert import bert_token
  6 | 
# Fill values handed to pad_sequence(padding_value=...) by the collate
# functions in this module.
TOKEN_PAD_ID = 0  # pads token-id sequences and GloVe-index sequences
POS_PAD_ID = 6  # pads POS-tag id sequences; NOTE(review): presumably a reserved index in args.pos2idx - confirm
TRI_PAD_ID = 0  # pads trigger label id sequences
ARGU_PAD_ID = 0  # pads the per-sentence argument/entity label id sequences
 11 | 
 12 | 
 13 | class EventDataset(data.Dataset):
 14 |     'Characterizes a dataset for PyTorch'
 15 |     def __init__(self, pkl_file, args):
 16 |         self.args = args
 17 |         # load data
 18 |         with open(pkl_file, 'rb') as handle:
 19 |             self.data = pickle.load(handle)
 20 | 
 21 |         # preprocessing
 22 |         new_data = list()
 23 |         for i in range(len(self.data)):
 24 |             out = list()
 25 |             if args.use_bert:
 26 |                 out.append(self.data[i]['contextual_feature'])
 27 |             elif args.finetune_bert:
 28 |                 sent_bert_tokens, sent_bert_ids, orig_to_tok_map = bert_token(self.data[i]['tokens'], args.bert_tokenizer)
 29 |                 out.append(sent_bert_ids)
 30 | 
 31 |             else:
 32 |                 if args.lower:
 33 |                     out.append([args.word2idx[x.lower()] for x in self.data[i]['tokens']])
 34 |                 else:
 35 |                     out.append([args.word2idx[x] for x in self.data[i]['tokens']])
 36 |             out.append([args.pos2idx[x] for x in self.data[i]['pos_tag']])
 37 |             if args.trigger_type:
 38 |                 out.append([args._label_to_id_t[x] for x in self.data[i]['sent_tri_label_type']])
 39 |             else:
 40 |                 out.append([args._label_to_id_t[x] for x in self.data[i]['trigger_label']])
 41 |             # if args.decode_w_ents_mask is False:
 42 |                 # out.append([args._label_to_id_e_sent[x] for x in self.data[i]['argu_label']])
 43 |             # elif args.decode_w_ents_mask is True:
 44 |             out.append([args._label_to_id_e_sent[x] for x in self.data[i]['ent_label']])     # now this item is argument candidates, instead of arguments
 45 |             if args.trigger_type:
 46 |                 out.append([([args._label_to_id_t[x] for x in i[0]], [args._label_to_id_e[y] for y in i[1]]) \
 47 |                         for i in self.data[i]['sent_tri_arg_pairs_type']])
 48 |             else:
 49 |                 out.append([([args._label_to_id_t[x] for x in i[0]], [args._label_to_id_e[y] for y in i[1]]) \
 50 |                         for i in self.data[i]['tri_arg_pairs']])
 51 |             # case 0 : use permutation of gold trigger and gold argument
 52 |             # out.append([(x[0], x[1], x[2], x[3], args._label_to_id_r[x[4]])\
 53 |             #             for x in self.data[i]['all_tri_arg_pairs']])
 54 |             out.append([])  ##### TODO, now dont do the `all_tri_arg_pairs` items so this is an empty list
 55 |             # case 1 : use candidate augmented pairs
 56 |             #out.append([(x[0], x[1], x[2], x[3], args._label_to_id_r[x[4]])\
 57 |             #            for x in self.data[i]['all_pairs_by_cand']])
 58 | 
 59 |             out.append(self.data[i]['sent_id'])
 60 |             if args.use_glove:
 61 |                 if args.lower:
 62 |                     out.append([args.word2idx[x.lower()] for x in self.data[i]['tokens']])
 63 |                 else:
 64 |                     out.append([args.word2idx[x] for x in self.data[i]['tokens']])
 65 |             else:
 66 |                 out.append([])
 67 |             if args.finetune_bert:
 68 |                 out.append(orig_to_tok_map)
 69 |             else:
 70 |                 out.append([])
 71 | 
 72 |             out.append(self.data[i]['ent_to_arg'])
 73 | 
 74 |             new_data.append(out)
 75 |         self.data = new_data
 76 | 
 77 |     def __len__(self):
 78 |         'Denotes the total number of samples'
 79 |         return len(self.data)
 80 | 
 81 |     def __getitem__(self, idx):
 82 |         'Generates one sample of data'
 83 |         sample = self.data[idx]
 84 |         sent_token = sample[0]
 85 |         sent_pos = sample[1]
 86 |         sent_label_t = sample[2]
 87 |         sent_label_e = sample[3]
 88 |         sent_tri_arg_pairs = sample[4]  # each pair is (seq, seq)
 89 |         all_pairs = sample[5]  # each pair is(l_start, l_end, r_start, r_end, arg_role)
 90 |         sent_id = sample[6]
 91 |         glove_idx = sample[7]
 92 |         orig_to_tok_map = sample[8]
 93 |         ent_to_arg_dict = sample[9]
 94 |         return sent_token, sent_pos, sent_label_t, sent_label_e, sent_tri_arg_pairs, all_pairs, sent_id, glove_idx, orig_to_tok_map, ent_to_arg_dict
 95 | 
 96 | def pad_collate(batch):
 97 |     if len(batch) >= 1:
 98 |         # sort sents in each batch according to the sent len
 99 |         bs = list(zip(*[ex for ex in sorted(batch, key=lambda x: len(x[0]), reverse=True)]))
100 |         lengths = [len(x) for x in bs[0]]
101 |         sents = pad_sequence([torch.LongTensor(s) for s in bs[0]], batch_first=True, padding_value=TOKEN_PAD_ID)
102 |         poss = pad_sequence([torch.LongTensor(s) for s in bs[1]], batch_first=True, padding_value=POS_PAD_ID)
103 |         triggers = pad_sequence([torch.LongTensor(s) for s in bs[2]], batch_first=True, padding_value=TRI_PAD_ID)
104 |         arguments = pad_sequence([torch.LongTensor(s) for s in bs[3]], batch_first=True, padding_value=ARGU_PAD_ID)
105 |         seq_pairs = bs[4]
106 |         all_pairs = bs[5]
107 |         sent_ids = bs[6]
108 | 
109 |     return sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs, all_pairs
110 | 
def pad_collate_bert(batch):
    """Collate samples whose first field holds pre-computed float features.

    Samples are ordered by the length of their first field (longest first).
    Field 0 is padded as a ``FloatTensor`` with ``0.``; fields 1-3 and the
    GloVe indices (field 7) are padded as ``LongTensor``s.

    Args:
        batch: non-empty list of samples laid out like the tuples returned by
            ``EventDataset.__getitem__``; positions 0-7 are read.

    Returns:
        ``(sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs,
        all_pairs, glove_idx, orig_to_tok_map, bert_lengths)``. In this
        collate function ``orig_to_tok_map`` is always ``None`` and
        ``bert_lengths`` is always an empty list.

    Raises:
        ValueError: if ``batch`` is empty. The previous ``if len(batch) >= 1``
            guard fell through to a ``return`` that referenced unbound names.
    """
    if len(batch) < 1:
        raise ValueError('pad_collate_bert received an empty batch')

    # sort sents in each batch according to the sent len, then transpose so
    # that bs[k] is the tuple of field k across the batch
    ordered = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    bs = list(zip(*ordered))
    lengths = [len(x) for x in bs[0]]
    bert_lengths = []
    sents = pad_sequence([torch.FloatTensor(s) for s in bs[0]], batch_first=True, padding_value=0.)
    poss = pad_sequence([torch.LongTensor(s) for s in bs[1]], batch_first=True, padding_value=POS_PAD_ID)
    triggers = pad_sequence([torch.LongTensor(s) for s in bs[2]], batch_first=True, padding_value=TRI_PAD_ID)
    arguments = pad_sequence([torch.LongTensor(s) for s in bs[3]], batch_first=True, padding_value=ARGU_PAD_ID)
    seq_pairs = bs[4]
    all_pairs = bs[5]
    sent_ids = bs[6]
    # NOTE(review): bs[7] has one entry per sample, so this test is true for
    # every non-empty batch; when the per-sample GloVe lists are empty the
    # result is a (batch, 0) tensor rather than None. Kept as-is because
    # downstream expectations are not visible here - confirm before changing.
    if len(bs[7]) > 0:
        glove_idx = pad_sequence([torch.LongTensor(s) for s in bs[7]], batch_first=True, padding_value=TOKEN_PAD_ID)
    else:
        glove_idx = None
    orig_to_tok_map = None

    return sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs, all_pairs, glove_idx, orig_to_tok_map, bert_lengths
131 | 
def pad_collate_bert_finetune(batch):
    """Collate samples whose first field holds BERT word-piece ids.

    Because field 0 is in word-piece units, samples are ordered (longest
    first) and measured by the length of field 2 (trigger labels), which is
    in original-token units; word-piece lengths are reported separately.

    Args:
        batch: non-empty list of samples laid out like the tuples returned by
            ``EventDataset.__getitem__``; positions 0-9 are read.

    Returns:
        ``(sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs,
        all_pairs, glove_idx, orig_to_tok_map, bert_lengths,
        ent_to_arg_dict)``. The padded items are batch-first ``LongTensor``s;
        ``lengths`` are original-token lengths and ``bert_lengths`` are
        word-piece lengths, both in sorted order.

    Raises:
        ValueError: if ``batch`` is empty. The previous ``if len(batch) >= 1``
            guard fell through to a ``return`` that referenced unbound names.
    """
    if len(batch) < 1:
        raise ValueError('pad_collate_bert_finetune received an empty batch')

    # sort sents in each batch according to the (original-token) sent len,
    # then transpose so that bs[k] is the tuple of field k across the batch
    ordered = sorted(batch, key=lambda x: len(x[2]), reverse=True)
    bs = list(zip(*ordered))
    # NOTE, here have to use the triggers as original length, b/c the length
    # of tokens (bs[0]) has changed after word-piece tokenization
    lengths = [len(x) for x in bs[2]]
    bert_lengths = [len(x) for x in bs[0]]
    sents = pad_sequence([torch.LongTensor(s) for s in bs[0]], batch_first=True, padding_value=TOKEN_PAD_ID)
    poss = pad_sequence([torch.LongTensor(s) for s in bs[1]], batch_first=True, padding_value=POS_PAD_ID)
    triggers = pad_sequence([torch.LongTensor(s) for s in bs[2]], batch_first=True, padding_value=TRI_PAD_ID)
    arguments = pad_sequence([torch.LongTensor(s) for s in bs[3]], batch_first=True, padding_value=ARGU_PAD_ID)
    seq_pairs = bs[4]
    all_pairs = bs[5]
    sent_ids = bs[6]
    # GloVe indices are padded and returned in the fine-tune case as well
    # (an older trailing comment claimed they were dropped; the code never did).
    glove_idx = pad_sequence([torch.LongTensor(s) for s in bs[7]], batch_first=True, padding_value=TOKEN_PAD_ID)
    orig_to_tok_map = bs[8]
    ent_to_arg_dict = bs[9]

    return sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs, all_pairs, glove_idx, orig_to_tok_map, bert_lengths, ent_to_arg_dict
150 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/generate_data/all_uw.comb.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/generate_data/all_uw.comb.pkl


--------------------------------------------------------------------------------
/component/BETTER/joint/generate_data/contextualized_features.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import pickle
 4 | from util import *
 5 | from transformers import *
 6 | import tqdm
 7 | 
 8 | MODELS = [(XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-large')]
 9 | #MODELS = [(RobertaModel,    RobertaTokenizer,    'roberta-large')]
10 | 
11 | p = argparse.ArgumentParser()
12 | p.add_argument('input_file', type=str,
13 |                help="Input pkl file (converted from internal JSON)")
14 | p.add_argument('output_file', type=str,
15 |                help="Where to save the output features pkl file")
16 | args = p.parse_args()
17 | 
18 | for model_class, tokenizer_class, pretrained_weights in MODELS:
19 |     tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
20 |     model = model_class.from_pretrained(pretrained_weights)
21 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22 |     model.to(device)
23 |     data = pickle.load(open(args.input_file, 'rb'))
24 |     output = list()
25 |     cnt = 0
26 |     for d in tqdm.tqdm(data):
27 |         ori_sent = d['ori_sent']
28 |         tokens = d['tokens']
29 |         # Encode text
30 |         ori_sent = clean_ori_sent(ori_sent)
31 |         input_ids = torch.tensor([tokenizer.encode(ori_sent, add_special_tokens=True)], device=device)
32 |         input_tok_list = [tokenizer.decode([x]) for x in input_ids[0]]
33 |         assert input_ids.size(1) == len(input_tok_list)
34 |         try:
35 |             alignment = align_bpe_to_words(input_tok_list, tokens)
36 |         except:
37 |             # print('Align BPE failed. Skipped')
38 |             continue
39 |         with torch.no_grad():
40 |             last_hidden_states = model(input_ids)[0]
41 |         features = align_features_to_words((last_hidden_states[0]).cpu(), alignment)
42 |         try:
43 |             assert features.size(0) == len(tokens)
44 |         except:
45 |             print('Align contextualized features failed. Skipped')
46 |             continue
47 |         d['contextual_feature'] = features
48 |         output.append(d)
49 |         cnt += 1
50 |     print(cnt)
51 |     with open(args.output_file, 'wb') as of:
52 |         pickle.dump(output, of)
53 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/generate_data/contextualized_features_bert.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import pickle
 4 | from util import *
 5 | from transformers import *
 6 | import pdb
 7 | import tqdm
 8 | 
 9 | def bert_token(sent_orig_tokens, tokenizer):
10 | 
11 |     orig_to_tok_map = []
12 |     sent_bert_tokens = []
13 |     sent_bert_ids = []
14 |     sent_bert_tokens.append("[CLS]")
15 |     sent_bert_ids.extend(tokenizer.encode("[CLS]", add_special_tokens=False))
16 | 
17 |     for idx, orig_token in enumerate(sent_orig_tokens):
18 |         orig_to_tok_map.append(len(sent_bert_tokens))
19 |         # if orig_token != ' ':
20 |         #     sent_bert_tokens.extend(tokenizer.tokenize(orig_token))
21 |         #     sent_bert_ids.extend(tokenizer.encode(orig_token, add_special_tokens=False))
22 |         # else:
23 |         #     sent_bert_ids.extend(tokenizer.convert_tokens_to_ids([orig_token]))
24 |         #     sent_bert_tokens.extend(tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([orig_token])))
25 |         if len(tokenizer.tokenize(orig_token)) > 0:
26 |             sent_bert_tokens.extend(tokenizer.tokenize(orig_token))
27 |             sent_bert_ids.extend(tokenizer.encode(orig_token, add_special_tokens=False))
28 |         elif len(tokenizer.tokenize(orig_token)) == 0:
29 |             # case of some special chars that cause bert tokenizer return empty
30 |             sent_bert_ids.extend(tokenizer.convert_tokens_to_ids([orig_token]))
31 |             sent_bert_tokens.extend(tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([orig_token])))
32 |     sent_bert_tokens.append("[SEP]")
33 |     sent_bert_ids.extend(tokenizer.encode("[SEP]", add_special_tokens=False))
34 |     return sent_bert_tokens, sent_bert_ids, orig_to_tok_map
35 | 
def get_bert_embedding(last_hid_state, orig_to_tok_map):
    '''
    Average word-piece states back into one vector per original token.

    last_hid_state is a tensor of shape (batch_size, seq_len, hid_dim)
    orig_to_tok_map is a list, len(orig_to_tok_map) = len(sent_orig_tokens);
    entry k is the first word-piece position of original token k.

    Returns a tensor of shape (batch_size, len(orig_to_tok_map), hid_dim).
    '''
    # A token's pieces run up to the start of the next token; the last token
    # stops before the final position (do not use the [SEP] representation).
    final_pos = last_hid_state.size(1) - 1
    span_stops = list(orig_to_tok_map[1:]) + [final_pos]
    pooled = [
        torch.mean(last_hid_state[:, list(range(begin, stop)), :], dim=1, keepdim=True)
        for begin, stop in zip(orig_to_tok_map, span_stops)
    ]
    return torch.cat(pooled, dim=1)
53 | 
if __name__ == '__main__':
    # Script entry point: add BERT word-level features to every record of a
    # pickled dataset and write the augmented records to a new pickle.
    # MODELS = [(XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-large')]
    MODELS = [(BertModel, BertTokenizer, 'bert-large-uncased')]
    #MODELS = [(RobertaModel,    RobertaTokenizer,    'roberta-large')]

    p = argparse.ArgumentParser()
    p.add_argument('input_file', type=str,
                   help="Input pkl file (converted from internal JSON)")
    p.add_argument('output_file', type=str,
                   help="Where to save the output features pkl file")
    args = p.parse_args()

    for model_class, tokenizer_class, pretrained_weights in MODELS:
        tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        model = model_class.from_pretrained(pretrained_weights)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # SECURITY: pickle.load can execute arbitrary code; only feed trusted
        # files. The context manager closes the handle (the bare open() used
        # to leak it).
        with open(args.input_file, 'rb') as in_f:
            data = pickle.load(in_f)
        output = list()
        cnt = 0
        for d in tqdm.tqdm(data):
            tokens = d['tokens']
            sent_bert_tokens, sent_bert_ids, orig_to_tok_map = bert_token(tokens, tokenizer)
            assert len(sent_bert_tokens) == len(sent_bert_ids)
            assert len(tokens) == len(orig_to_tok_map)
            with torch.no_grad():
                bert_output = model(torch.tensor([sent_bert_ids]).to(device))
            # First element of the model output is used as the last hidden state.
            last_hid_state = bert_output[0].cpu()
            out_feats = get_bert_embedding(last_hid_state, orig_to_tok_map)
            assert out_feats.size(1) == len(tokens)  # orig seq_len
            # Drop the batch dimension (batch size is 1 here).
            d['contextual_feature'] = out_feats.squeeze(0)
            output.append(d)
            cnt += 1
        print(cnt)
        with open(args.output_file, 'wb') as of:
            pickle.dump(output, of)
91 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/generate_data/json_to_pkl_newformat.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | import json
  3 | import pdb
  4 | import argparse
  5 | import os
  6 | from events.better_core import BetterDocument
  7 | 
  8 | 
  9 | def get_seq_label_from_idxs(idxs, tokens, label_str='ANCHOR', types=None):
 10 | 
 11 |     seq_label = ['O'] * len(tokens)
 12 | 
 13 |     if label_str == 'ANCHOR':
 14 |         # for anchor case, only consider label to be among {'O', 'B-ANCHOR', 'I-ANCHOR'}
 15 |         for i in idxs:
 16 |             l_idx = i[0]
 17 |             r_idx = i[1]
 18 |             if r_idx - l_idx == 0:
 19 |                 # single-token trigger
 20 |                 seq_label[l_idx] = 'B-{}'.format(label_str)
 21 |             elif r_idx - l_idx > 0:
 22 |                 seq_label[l_idx] = 'B-{}'.format(label_str)
 23 |                 seq_label[l_idx + 1: r_idx + 1] = ['I-{}'.format(label_str)] * (r_idx - l_idx)
 24 |     elif label_str == 'TYPE':
 25 |         assert len(idxs) == len(types), pdb.set_trace()
 26 |         # for type case, only consider label to be among {'O', 'B-material--helpful', 'I-material--helpful', ...}
 27 |         for i, idx in enumerate(idxs):
 28 |             l_idx = idx[0]
 29 |             r_idx = idx[1]
 30 |             if r_idx - l_idx == 0:
 31 |                 # single-token trigger
 32 |                 seq_label[l_idx] = 'B-{}'.format(types[i])
 33 |             elif r_idx - l_idx > 0:
 34 |                 seq_label[l_idx] = 'B-{}'.format(types[i])
 35 |                 seq_label[l_idx + 1: r_idx + 1] = ['I-{}'.format(types[i])] * (r_idx - l_idx)
 36 | 
 37 |     elif label_str == 'ENT':
 38 |         # for argument case, consider label to be among {'O', 'B-agent', 'I-agent', 'B-patient', 'I-patient'}
 39 |         for i in idxs:
 40 |             l_idx = i[0]
 41 |             r_idx = i[1]
 42 |             if len(i) == 3:
 43 |                 # when arg_role is fed in, use this as label
 44 |                 arg_role = i[2].upper()
 45 |             else:
 46 |                 # else this is for sent-level arg label, only consider {'O', 'B-ENT', 'I-ENT'}
 47 |                 arg_role = 'ENT'
 48 |             if r_idx - l_idx == 0:
 49 |                 seq_label[l_idx] = 'B-{}'.format(arg_role)
 50 |             elif r_idx - l_idx > 0:
 51 |                 seq_label[l_idx] = 'B-{}'.format(arg_role)
 52 |                 seq_label[l_idx + 1: r_idx + 1] = ['I-{}'.format(arg_role)] * (r_idx - l_idx)
 53 | 
 54 |     return seq_label
 55 | 
 56 | 
 57 | def get_seq_label_fine_grained(idxs, tokens, label_str='AGENT'):
 58 |     assert label_str == 'AGENT' or label_str == 'PATIENT'
 59 |     seq_label = ['O'] * len(tokens)
 60 |     for i in idxs:
 61 |         if label_str == 'AGENT':
 62 |             if i[2] != 'agent':
 63 |                 continue
 64 |         elif label_str == 'PATIENT':
 65 |             if i[2] != 'patient':
 66 |                 continue
 67 |         l_idx = i[0]
 68 |         r_idx = i[1]
 69 |         if r_idx - l_idx == 0:
 70 |             # single-token trigger
 71 |             seq_label[l_idx] = 'B-{}'.format(label_str)
 72 |         elif r_idx - l_idx > 0:
 73 |             seq_label[l_idx] = 'B-{}'.format(label_str)
 74 |             seq_label[l_idx + 1: r_idx + 1] = ['I-{}'.format(label_str)] * (r_idx - l_idx)
 75 |     return seq_label
 76 | 
 77 | 
 78 | def check_span(gold_start, gold_end, c_start, c_end):
 79 |     if gold_start > c_start:
 80 |         if gold_end <= c_end:
 81 |             # candidate contains gold
 82 |             return True
 83 |         elif gold_end > c_end:
 84 |             return False
 85 |     elif gold_start == c_start:
 86 |         if gold_end >= c_end:
 87 |             # gold contains candidate
 88 |             return True
 89 |         elif gold_end < c_end:
 90 |             # candidate contains gold
 91 |             return True
 92 |     elif gold_start < c_start:
 93 |         if gold_end >= c_end:
 94 |             # gold contains candidate
 95 |             return True
 96 |         elif gold_end < c_end:
 97 |             return False
 98 | 
 99 | 
def check_duplicate(all_pairs, current_pair):
    """Tell whether ``current_pair`` is already covered by ``all_pairs``.

    Pairs are read as ``(tri_start, tri_end, arg_start, arg_end, ...)``.
    Returns True as soon as some existing pair matches ``current_pair`` on
    both the trigger span and the argument span according to ``check_span``;
    otherwise returns False.
    """
    def covers(existing):
        same_trigger = check_span(existing[0], existing[1], current_pair[0], current_pair[1])
        same_argument = check_span(existing[2], existing[3], current_pair[2], current_pair[3])
        return bool(same_trigger and same_argument)

    return any(covers(existing) for existing in all_pairs)
108 | 
109 | 
def generate_all_candidate_pairs(all_candidates, all_pairs):
    '''
    Augment the gold trigger-argument pairs with 'None'-labelled pairs.

    all_candidates is a list of tuple: (start_idx, end_idx)
    all_pairs is a list of tuple that contains gold trigger argument pairs:
        [(tri_start, tri_end, arg_start, arg_end, label), (....)]

    Every ordered pair of two different candidates (by position in
    all_candidates) that check_duplicate does not report as already present
    in all_pairs is emitted with the label 'None'.

    output: the new 'None' pairs followed by all_pairs
    '''
    negatives = list()
    for left_pos, left in enumerate(all_candidates):
        for right_pos, right in enumerate(all_candidates):
            if left_pos == right_pos:
                continue
            span_pair = (left[0], left[1], right[0], right[1])
            if check_duplicate(all_pairs, span_pair):
                continue
            negatives.append((left[0], left[1], right[0], right[1], 'None'))
    return negatives + all_pairs
126 | 
def get_data_from_json(json_file):
    """Convert one internal-format JSON file into per-sentence training dicts.

    Args:
        json_file: path to a JSON file whose top level maps document ids to
            documents accepted by ``BetterDocument.from_json``.

    Returns:
        A list with one dict per sentence, holding the stripped sentence text,
        a ``'<doc_id>_<sent_id>_0'`` id, tokens, POS tags, sentence-level BIO
        labels for triggers and arguments (untyped and typed), and one
        ``(trigger_labels, argument_labels)`` pair per event. The
        ``'tri_agent_pairs'`` and ``'tri_patient_pairs'`` entries are always
        empty lists.

    Raises:
        AssertionError: if an event has no anchor span or carries an
            unexpected ``material-verbal`` / ``helpful-harmful`` value.
            (These checks used to call ``pdb.set_trace()``, which blocks
            non-interactive runs.)
    """
    with open(json_file, 'rb') as f:
        data = json.load(f)
    data_outs = []
    for doc_id, doc in data.items():
        # Only the current document is needed, so no per-file dict is kept.
        document = BetterDocument.from_json(doc)
        for s in document.sentences:
            sent_id = s.sent_id
            sentence = s.text
            tokens = s.words
            pos_tags = s.pos_tags

            # gather all events in this sentence
            sent_events = document.abstract_events[sent_id]
            sent_tri_idxs, sent_arg_idxs = [], []
            sent_tri_arg_pairs = []
            sent_tri_agent_pairs = []
            sent_tri_patient_pairs = []
            sent_tri_arg_pairs_type = []
            sent_event_types = []
            for event in sent_events:
                # Spans are taken from the head span of each grounded span.
                tri_idxs = [(x.grounded_span.head_span.start_token, x.grounded_span.head_span.end_token) for x in event.anchors.spans]
                arg_idxs = [(x.grounded_span.head_span.start_token, x.grounded_span.head_span.end_token, y.role)
                             for y in event.arguments for x in y.span_set.spans]
                assert len(tri_idxs) > 0, \
                    'event without anchor span in sentence {} of document {}'.format(sent_id, doc_id)
                type1 = event.properties['material-verbal']
                type2 = event.properties['helpful-harmful']
                assert type1 in ['material', 'verbal', 'both', 'unk'], \
                    'unexpected material-verbal value {!r}'.format(type1)
                assert type2 in ['helpful', 'harmful', 'neutral'], \
                    'unexpected helpful-harmful value {!r}'.format(type2)
                event_type = '{}_{}'.format(type1, type2)
                # One type entry per anchor span, kept parallel to sent_tri_idxs.
                sent_event_types.extend([event_type] * len(tri_idxs))

                tri_label = get_seq_label_from_idxs(tri_idxs, tokens, 'ANCHOR')
                arg_label = get_seq_label_from_idxs(arg_idxs, tokens, 'ENT')
                sent_tri_arg_pairs.append((tri_label, arg_label))

                tri_label_type = get_seq_label_from_idxs(tri_idxs, tokens, 'TYPE', [event_type] * len(tri_idxs))
                sent_tri_arg_pairs_type.append((tri_label_type, arg_label))
                sent_tri_idxs.extend(tri_idxs)
                sent_arg_idxs.extend(arg_idxs)
            sent_tri_idxs_uniq = list(set(sent_tri_idxs))   # there are cases where the sent_tri_idxs has duplicated event idxs
            # Drop the role so the sentence-level argument labels are plain ENT spans.
            sent_arg_idxs = list(set([(i[0], i[1]) for i in sent_arg_idxs]))
            sent_tri_label = get_seq_label_from_idxs(sent_tri_idxs_uniq, tokens, 'ANCHOR')
            sent_arg_label = get_seq_label_from_idxs(sent_arg_idxs, tokens, 'ENT')
            # The typed labels use the non-deduplicated spans so that they stay
            # aligned with sent_event_types.
            sent_tri_label_type = get_seq_label_from_idxs(sent_tri_idxs, tokens, 'TYPE', sent_event_types)

            data_outs.append({
                'ori_sent': sentence.strip(),
                'sent_id': '{}_{}_0'.format(doc_id, sent_id),
                'tokens': tokens,
                'pos_tag': pos_tags,
                'trigger_label': sent_tri_label,
                'argu_label': sent_arg_label,
                'tri_arg_pairs': sent_tri_arg_pairs,
                'tri_agent_pairs': sent_tri_agent_pairs,
                'tri_patient_pairs': sent_tri_patient_pairs,
                'sent_tri_label_type': sent_tri_label_type,
                'sent_tri_arg_pairs_type': sent_tri_arg_pairs_type
            })
    return data_outs
193 | 
def save_pkl(data, out_file):
    """Pickle ``data`` to the path ``out_file`` and print a confirmation."""
    sink = open(out_file, 'wb')
    with sink:
        pickle.dump(data, sink)
    message = '{} saved.'.format(out_file)
    print(message)
198 | 
if __name__ == '__main__':
    # Command-line entry point. input_file and output_file must either both
    # be files or both be directories; in directory mode every entry of the
    # input directory is converted to <basename>.pkl in the output directory.
    p = argparse.ArgumentParser(
        description="""Convert internal JSON to pkl.""")
    p.add_argument('input_file', type=str, help="JSON file in internal format, or a dir holding the JSONs")
    p.add_argument('output_file', type=str, help="Where to save the pkl file, or a dir to hold the output Pkls")
    args = p.parse_args()

    if os.path.isdir(args.input_file):
        # assume the output_file is a dir
        if not os.path.exists(args.output_file):
            os.makedirs(args.output_file)
        assert os.path.isdir(args.output_file)

        print('Read JSON files from dir {}'.format(args.input_file))
        # sorted() only makes the processing order deterministic.
        for json_file in sorted(os.listdir(args.input_file)):
            data = get_data_from_json(os.path.join(args.input_file, json_file))
            base_name = os.path.splitext(json_file)[0]
            out_file = os.path.join(args.output_file, '{}.pkl'.format(base_name))
            save_pkl(data, out_file)

    elif os.path.isfile(args.input_file):
        # assume the output_file is a file; create its parent dir if needed
        directory = os.path.split(args.output_file)[0]
        if directory != '' and not os.path.exists(directory):
            os.makedirs(directory)

        print('Read JSON file from file {}'.format(args.input_file))
        data = get_data_from_json(args.input_file)
        save_pkl(data, args.output_file)

    else:
        # A missing or mistyped input path used to be a silent no-op that
        # exited with status 0; report it as a usage error instead.
        p.error('input_file {!r} is neither a file nor a directory'.format(args.input_file))
235 | 
236 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/generate_data/util.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from collections import Counter
  3 | import torch
  4 | import re
  5 | 
  6 | def clean_ori_sent(ori_sent):
  7 |     ori_sent = re.sub(r"\.\.\.\.", "...", ori_sent)
  8 |     ori_sent = re.sub(r"---", "--", ori_sent)
  9 |     ori_sent = re.sub(r"``", '"', ori_sent)
 10 |     ori_sent = re.sub(r"''", '"', ori_sent)
 11 |     ori_sent = re.sub(r"`", "'", ori_sent)
 12 |     ori_sent = re.sub(r"\.{3,}", "...", ori_sent)
 13 |     ori_sent = re.sub(r"etc\.$", "etc. .", ori_sent)
 14 |     ori_sent = re.sub(r"etc\.\)$", "etc. .)", ori_sent)
 15 |     return ori_sent
 16 | 
def align_bpe_to_words(bert_tokens: List[str], other_tokens: List[str]):
    """Align sub-word pieces to word-level tokens by matching their characters.

    Args:
        bert_tokens: decoded sub-word pieces of the sentence; the markers
            '<s>' and '</s>' are dropped before alignment.
        other_tokens: word-level tokens of the same sentence.

    Returns:
        A list with one entry per item of ``other_tokens``; each entry is the
        list of (1-based) piece indices that cover that token.

    Raises:
        AssertionError: if the concatenated pieces and the concatenated
            tokens do not spell the same string (after cleaning and the
            trailing-period fallback), or a token ends up with no pieces.
        Exception: if, during the walk, neither the current piece nor the
            current token is a prefix of the other.
    """
    def clean(text):
        # Strip surrounding whitespace and shorten a lone '---' piece to '--'.
        text = text.strip()
        if text=='---':
            return '--'
        else:
            return text
    def clean_stanford(text):
        # Strip whitespace, drop non-breaking spaces, and turn bracket
        # placeholders (-LRB-, -RRB-, ...) and backtick/apostrophe quotes
        # into their literal characters.
        text = text.strip()
        text = text.replace(u"\xa0", "")
        text = re.sub(r"-LRB-", '(', text)
        text = re.sub(r"-RRB-", ')', text)
        text = re.sub(r"-LSB-", '[', text)
        text = re.sub(r"-RSB-", ']', text)
        text = re.sub(r"-LCB-", '{', text)
        text = re.sub(r"-RCB-", '}', text)
        text = re.sub(r"``", '"', text)
        text = re.sub(r"''", '"', text)
        text = re.sub(r"`", "'", text)
        # NOTE(review): the dash-run rewrites below look like corpus-specific
        # patches to make dash lengths agree with the sub-word side - confirm
        # before touching them.
        text = re.sub(r"---------", "------", text)
        text = re.sub(r"---------------------", "-------------------", text)
        if text =='-------------------':
            return '--------------------'
        if text =='------------':
            return '-----------'
        return text

    # remove whitespaces to simplify alignment
    bpe_tokens = []
    for o in bert_tokens:
        if o not in {'<s>', '</s>'}:
            bpe_tokens.append(clean(str(o)))
    other_tokens = [clean_stanford(str(o)) for o in other_tokens]
    try:
        assert ''.join(bpe_tokens) == ''.join(other_tokens)
    except AssertionError:
        # Fallback: if the word side is exactly one character longer and ends
        # with a '.' token, append that period to the last piece and re-check.
        if (len(''.join(bpe_tokens))+1==len(''.join(other_tokens))) and (other_tokens[-1]=='.'):
            bpe_tokens[-1]+='.'
        assert ''.join(bpe_tokens) == ''.join(other_tokens)

    # create alignment
    alignment = []
    # Skip pieces that became empty after cleaning. Indices start at 1;
    # presumably index 0 is reserved for the leading '<s>' - confirm.
    bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1))
    j, bpe_tok = next(bpe_toks)
    for other_tok in other_tokens:
        bpe_indices = []
        while True:
            if other_tok.startswith(bpe_tok):
                # The whole piece fits inside the rest of this token: record
                # it, consume it from the token, and move to the next piece.
                bpe_indices.append(j)
                other_tok = other_tok[len(bpe_tok):]
                try:
                    j, bpe_tok = next(bpe_toks)
                except StopIteration:
                    # No pieces left. NOTE(review): if any token text remains
                    # after this point, startswith(None) raises TypeError.
                    j, bpe_tok = None, None
            elif bpe_tok.startswith(other_tok):
                # The piece extends past the rest of this token: record it,
                # keep the piece's remainder for the next token, and finish
                # this token.
                bpe_indices.append(j)
                bpe_tok = bpe_tok[len(other_tok):]
                other_tok = ''
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
            if other_tok == '':
                break
        assert len(bpe_indices) > 0
        alignment.append(bpe_indices)
    assert len(alignment) == len(other_tokens)
    return alignment
 84 | 
 85 | def align_features_to_words(features, alignment):
 86 |     assert features.dim() == 2
 87 |     bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices)
 88 |     assert bpe_counts[0] == 0  # <s> shouldn't be aligned
 89 |     denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))])
 90 |     weighted_features = features / denom.unsqueeze(-1)
 91 |     #output = [weighted_features[0]] # <s>
 92 |     output = []
 93 |     #largest_j = -1
 94 |     for bpe_indices in alignment:
 95 |         output.append(weighted_features[bpe_indices].sum(dim=0))
 96 |         #largest_j = max(largest_j, *bpe_indices)
 97 |     #for j in range(largest_j + 1, len(features)):
 98 |     #    output.append(weighted_features[j])
 99 |     output = torch.stack(output)
100 |     return output
101 | 
def spacy_nlp():
    """Return a shared ``spacy.lang.en.English()`` object, built on first use.

    The instance is cached on the function itself (``spacy_nlp._nlp``).

    Raises:
        ImportError: if spaCy cannot be imported.
    """
    cached = getattr(spacy_nlp, '_nlp', None)
    if cached is not None:
        return cached
    try:
        from spacy.lang.en import English
        spacy_nlp._nlp = English()
    except ImportError:
        raise ImportError('Please install spacy with: pip install spacy')
    return spacy_nlp._nlp
110 | 
def spacy_tokenizer():
    """Return a shared spaCy tokenizer, built on first use.

    The tokenizer is cached on the function itself
    (``spacy_tokenizer._tokenizer``).

    Raises:
        ImportError: if spaCy cannot be imported.
    """
    if getattr(spacy_tokenizer, '_tokenizer', None) is None:
        try:
            nlp = spacy_nlp()
        except ImportError:
            raise ImportError('Please install spacy with: pip install spacy')
        # spaCy 2.x builds the tokenizer through Defaults.create_tokenizer;
        # that factory no longer exists in spaCy 3.x, where the language
        # object already carries its tokenizer.
        factory = getattr(nlp.Defaults, 'create_tokenizer', None)
        if factory is not None:
            spacy_tokenizer._tokenizer = factory(nlp)
        else:
            spacy_tokenizer._tokenizer = nlp.tokenizer
    return spacy_tokenizer._tokenizer
119 | 
def correct_unmatch(tokens, new_tok, features):
    """Merge feature vectors so that there is one vector per entry of ``tokens``.

    ``new_tok`` is aligned against ``tokens`` with ``minEditMatching``.  Every
    position reported for ``new_tok`` must satisfy
    ``new_tok[p] + new_tok[p+1] == tokens[...]``, i.e. a token of ``tokens``
    that appears as two consecutive pieces in ``new_tok``; the two feature
    rows of such a pair are averaged, all other rows are copied through.

    Args:
        tokens: reference token list.
        new_tok: token list the features were computed for.
        features: sequence of feature vectors indexed like ``new_tok``
            (presumably one numpy row per token - confirm with the caller).

    Returns:
        List of feature vectors, ``len(tokens)`` long; each must have
        ``size == 1024``.

    Raises:
        AssertionError: if the alignment has any other kind of mismatch, or
            the output length / vector size checks fail.
    """
    _, _, split_starts, unmatched = minEditMatching(tokens, new_tok)
    assert len(unmatched)==0
    merged = list()
    # split_starts is collected back-to-front, so walk it reversed; `offset`
    # counts the splits seen so far and shifts back into `tokens` indexing
    for offset, start in enumerate(reversed(split_starts)):
        assert new_tok[start]+new_tok[start+1] == tokens[start-offset]
    cursor = 0
    total = len(features)
    while cursor < total:
        if cursor in split_starts:
            # two pieces of one original token: average their vectors
            merged.append(np.mean(features[cursor:cursor+2], axis=0))
            cursor += 2
        else:
            merged.append(features[cursor])
            cursor += 1
    assert len(merged)==len(tokens)
    for vec in merged:
        assert vec.size==1024
    return merged
139 | 
def minEditMatching(target, source):
    ''' Return a pair of aligned target and source.

    Computes a unit-cost edit-distance alignment (match 0; substitution,
    insertion and deletion 1 - the same costs as substCostSen) and traces it
    back.

    Args:
        target: sequence of tokens.
        source: sequence of tokens.

    Returns:
        (target_aln, source_aln, target_id, source_id)
        target_aln / source_aln: the two sequences in order, padded with
            "___" where the other side has an extra token.
        target_id: indices into ``source`` of tokens that have no partner in
            ``target`` (collected from the end of the sequence backwards).
        source_id: indices into ``target`` of tokens that have no partner in
            ``source`` (collected from the end of the sequence backwards).

    Raises:
        RuntimeError: if the trace-back cannot make progress (would mean the
            distance table is inconsistent).
    '''
    def cost(a, b):
        # unit substitution cost, identical to substCostSen
        return 0 if a == b else 1

    n = len(target)
    m = len(source)
    distance = [[0 for i in range(m+1)] for j in range(n+1)]
    for i in range(1,n+1):
        distance[i][0] = distance[i-1][0] + 1

    for j in range(1,m+1):
        distance[0][j] = distance[0][j-1] + 1

    for i in range(1,n+1):
        for j in range(1,m+1):
            distance[i][j] = min(distance[i-1][j-1]+cost(source[j-1],target[i-1]),
                                 distance[i-1][j]+1,
                                 distance[i][j-1]+1)
    ii = n
    jj = m

    target_aln = []
    source_aln = []
    target_id = []
    source_id = []
    while (ii > 0) or (jj > 0):
        # Each move is only legal while the index it decrements is still
        # positive; without these guards the [-1] look-ups wrap around to the
        # other end of the table and produce bogus matches.
        if ii > 0 and jj > 0 and \
                distance[ii][jj]-cost(source[jj-1],target[ii-1]) == distance[ii-1][jj-1]:
            target_aln.append(target[ii-1])
            source_aln.append(source[jj-1])
            ii -= 1
            jj -= 1
        elif jj > 0 and distance[ii][jj] - 1 == distance[ii][jj-1]:
            # extra token on the source side
            source_aln.append(source[jj-1])
            target_aln.append("___")
            jj -= 1
            target_id.append(jj)
        elif ii > 0 and distance[ii][jj] - 1 == distance[ii-1][jj]:
            # extra token on the target side
            source_aln.append("___")
            target_aln.append(target[ii-1])
            ii -= 1
            source_id.append(ii)
        else:
            # previously this printed "error!" and looped forever
            raise RuntimeError('minEditMatching: inconsistent distance table at ({}, {})'.format(ii, jj))

    target_aln = target_aln[::-1]
    source_aln = source_aln[::-1]
    return (target_aln,source_aln,target_id, source_id)
187 | 
def substCostSen(x,y):
    """Substitution cost for the edit distance: 0 for equal items, 1 otherwise."""
    return 0 if x==y else 1
193 | 
194 | 
195 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/requirements.txt:
--------------------------------------------------------------------------------
 1 | allennlp==0.9.0
 2 | pytorch-crf==0.7.2
 3 | pytorch-nlp==0.5.0
 4 | seqeval==0.0.12
 5 | sklearn==0.0
 6 | tensorboardX==2.0
 7 | torch==1.4.0
 8 | transformers==2.4.1
 9 | git+ssh://git@gitlab.com/isi-better/better-events.git@master
10 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/saved_args.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "batch": 2,
 3 |   "iter_size": 2,
 4 |   "epochs": 40,
 5 |   "pipe_epochs": 1000,
 6 |   "tri_start_epochs": 50,
 7 |   "lr": 0.001,
 8 |   "lr_other_ner": 0.001,
 9 |   "lr_other_t": 0.001,
10 |   "lr_other_a": 0.001,
11 |   "num_warmup_steps": 300,
12 |   "opt": "adam",
13 |   "momentum": 0.9,
14 |   "cuda": false,
15 |   "multigpu": false,
16 |   "params": {},
17 |   "patience": 10000,
18 |   "do_train": true,
19 |   "do_test": false,
20 |   "write_pkl": false,
21 |   "eval_on_gold_tri": true,
22 |   "e2e_eval": true,
23 |   "train_on_e2e_data": true,
24 |   "tri_partial_match": true,
25 |   "use_single_token_tri": true,
26 |   "gold_ent": false,
27 |   "hid": 150,
28 |   "hid_lastmlp": 600,
29 |   "num_layers": 1,
30 |   "dropout": 0.4,
31 |   "activation": "relu",
32 |   "ner_weight": 1.0,
33 |   "argument_weight": 5.0,
34 |   "trigger_weight": 1.0,
35 |   "finetune_bert": true,
36 |   "bert_model_type": "bert-large-uncased",
37 |   "bert_encode_mthd": "head",
38 |   "use_bert": false,
39 |   "use_glove": false,
40 |   "bert_dim": 1024,
41 |   "use_pos": false,
42 |   "regen_vocfile": false,
43 |   "trainable_emb": false,
44 |   "trainable_pos_emb": false,
45 |   "random_seed": 123,
46 |   "lower": false,
47 |   "use_crf_ner": true,
48 |   "use_crf_t": true,
49 |   "use_crf_a": true,
50 |   "use_att": true,
51 |   "att_func": "general",
52 |   "att_dropout": 0.0,
53 |   "use_att_linear_out": true,
54 |   "norm": true,
55 |   "att_pool": "max",
56 |   "att_mthd": "cat",
57 |   "k_ner": 1,
58 |   "k_tri": 1,
59 |   "k_arg": 1,
60 |   "bias_t": 1.0,
61 |   "bias_a": 1.0,
62 |   "decode_w_ents_mask": true,
63 |   "decode_w_arg_role_mask_by_tri": true,
64 |   "decode_w_trigger_mask": true,
65 |   "decode_w_arg_role_mask_by_ent": false,
66 |   "load_model": true,
67 |   "load_model_path": "worked_model_ace/baseline_repro.pt",
68 |   "load_model_path_t": "worked_model_ace/singletrigger_bertlarge2.pt",
69 |   "load_model_path_ner": "worked_model_ace/ner_bertlarge.pt",
70 |   "load_model_single": true
71 | }


--------------------------------------------------------------------------------
/component/BETTER/joint/split_event.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | def split_tri_output(trigger_seq, B2I):
 4 |     '''
 5 |     given trigger_sequence, we will generate trigger word indexes for argument module
 6 |     Args:
 7 |         trigger_seq: a list of int that represent the trigger sequence.
 8 |         label_to_id_tri: a map that mapping from BIO labels ro index
 9 |     Return:
10 |         A list of lists of trigger word idx. E.g. [[1,2], [5,6], [10]]
11 |     '''
12 |     tri_idx = []
13 |     tri_type = []
14 |     in_chunk = False
15 |     curr_idx = []
16 |     curr_I = None
17 |     for i in range(len(trigger_seq)):
18 |         # end of chunk
19 |         if in_chunk:
20 |             if trigger_seq[i] != curr_I:
21 |                 tri_idx.append(curr_idx)
22 |                 tri_type.append(curr_I - 1)   # -1 accounts for finding the id of B-xxx
23 |                 curr_idx = []
24 |                 curr_I = None
25 |                 in_chunk = False
26 |             elif trigger_seq[i] == curr_I:
27 |                 curr_idx.append(i)
28 |                 if i == len(trigger_seq) - 1:
29 |                     # the last token is a I token
30 |                     tri_idx.append(curr_idx)
31 |                     tri_type.append(curr_I - 1)   # -1 accounts for finding the id of B-xxx
32 | 
33 |         # beginning of chunk
34 |         if trigger_seq[i] in B2I:
35 |             curr_idx = [i]
36 |             in_chunk = True
37 |             curr_I = B2I[trigger_seq[i]]
38 |             if i == len(trigger_seq) - 1:
39 |                 # the last token is a B token
40 |                 tri_idx.append(curr_idx)
41 |                 tri_type.append(curr_I - 1)   # -1 accounts for finding the id of B-xxx
42 | 
43 |     assert len(tri_idx) == len(tri_type)
44 |     return tri_idx, tri_type
45 | 
if __name__ == '__main__':
    # Small smoke test: tag a fake sentence and print the recovered spans.
    label_to_id_t = OrderedDict([('O', 1), ('B-ANCHOR', 2), ('I-ANCHOR', 3), ('<PAD>', 0)])
    fake_data = ['O', 'B-ANCHOR', 'I-ANCHOR', 'I-ANCHOR', 'B-ANCHOR', 'O', 'O', 'B-ANCHOR', 'I-ANCHOR', 'O', 'B-ANCHOR', 'I-ANCHOR']
    print(fake_data)
    fake_data = [label_to_id_t[x] for x in fake_data]
    # split_tri_output looks the integer tags up in B2I, so it needs an
    # id -> id map (B tag id -> I tag id); passing the label -> id map made
    # every lookup miss and the demo print empty lists.
    B2I = {label_to_id_t['B-ANCHOR']: label_to_id_t['I-ANCHOR']}
    tri_idx = split_tri_output(fake_data, B2I)
    print(tri_idx)
53 | 


--------------------------------------------------------------------------------
/component/BETTER/joint/util.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from collections import OrderedDict
  3 | from tensorboardX import SummaryWriter
  4 | from seqeval.metrics import f1_score, accuracy_score, classification_report
  5 | from datetime import datetime
  6 | import os
  7 | import pickle
  8 | from eval import eval_e2e_event_level_arg_id_cls
  9 | import torch
 10 | 
 11 | def read_glove_dict(glove_dir):
 12 |     glove_emb = open(glove_dir, 'r+', encoding="utf-8")
 13 |     emb_dict = OrderedDict([(x.strip().split(' ')[0], [float(xx) for xx in x.strip().split(' ')[1:]]) for x in glove_emb])
 14 |     return emb_dict
 15 | 
 16 | def read_glove_emb(word2idx, glove_dict):
 17 |     word_emb = []
 18 |     for word in word2idx:
 19 |         if word in glove_dict:
 20 |             word_emb.append(glove_dict[word])
 21 |         elif word == '<PAD>':
 22 |             word_emb.append(np.zeros(300))
 23 |         else:
 24 |             word_emb.append(glove_dict['unk'])
 25 | 
 26 |     return np.array(word_emb)
 27 | 
 28 | class Logger(object):
 29 |     def __init__(self, logdir='./log'):
 30 |         self.writer = SummaryWriter(logdir)
 31 |     
 32 |     def scalar_summary(self, tag, value, step):
 33 |         self.writer.add_scalar(tag, value, step)
 34 | 
 35 | def write_result(filename, args, scores):
 36 |     if os.path.exists(filename):
 37 |         result = pickle.load(open(filename, 'rb'))
 38 |     else:
 39 |         result = list()
 40 |     result.append({
 41 |         'score_time': datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
 42 |         'save_dir': args.save_dir,
 43 |         'task': args.task,
 44 |         'use_bert': args.use_bert,
 45 |         'batch_size': args.batch,
 46 |         'epoch': args.epochs,
 47 |         'lr': args.lr,
 48 |         'opt': args.opt,
 49 |         'hid': args.hid,
 50 |         'n_layers': args.num_layers,
 51 |         'dp': args.dropout,
 52 |         'act': args.activation,
 53 |         'use_crf': args.use_crf,
 54 |         'use_att': args.use_att,
 55 |         'att_mthd': args.att_mthd,
 56 |         'trigger_weight': args.trigger_weight,
 57 |         'argument_weight': args.argument_weight,
 58 |         'dev_f1': scores['dev_f1'],
 59 |         'test_f1_tri': scores['test_f1_t'],
 60 |         'test_f1_argu': scores['test_f1_e']
 61 |     })
 62 |     with open(filename, 'wb') as f:
 63 |         pickle.dump(result, f)
 64 | 
 65 | def write_result_struct(filename, args, scores):
 66 |     if os.path.exists(filename):
 67 |         result = pickle.load(open(filename, 'rb'))
 68 |     else:
 69 |         result = list()
 70 |     result.append({
 71 |         'score_time': datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
 72 |         'save_dir': args.save_dir,
 73 |         'method': args.method,
 74 |         'use_bert': args.use_bert,
 75 |         'batch_size': args.batch,
 76 |         'epoch': args.epochs,
 77 |         'lr': args.lr,
 78 |         'opt': args.opt,
 79 |         'hid': args.hid,
 80 |         'n_layers': args.num_layers,
 81 |         'dp': args.dropout,
 82 |         'act': args.activation,
 83 |         'use_crf': args.use_crf,
 84 |         'eval_on_gold_tri': args.eval_on_gold_tri,
 85 |         'trigger_weight': args.trigger_weight,
 86 |         'argument_weight': args.argument_weight,
 87 |         'soft_attn': args.soft_attn,
 88 |         'query_mthd': args.query_mthd,
 89 |         'attn_mthd': args.attn_mthd,
 90 |         'att_heads': args.att_heads,
 91 |         'att_dropout': args.att_dropout,
 92 |         'att_func': args.att_func,
 93 |         'use_att_linear_out': args.use_att_linear_out,
 94 |         'dev_f1': scores['dev_f1'],
 95 |         'test_f1': scores['test_f1'],
 96 |     })
 97 |     with open(filename, 'wb') as f:
 98 |         pickle.dump(result, f)
 99 | 
def get_loss_mlp(lengths, label, pred_logit, criterion):
    """Compute ``criterion`` over the un-padded positions of a batch.

    For every batch row ``i`` only the first ``lengths[i]`` positions of
    ``pred_logit[i]`` and ``label[i]`` are kept; the kept slices of all rows
    are concatenated along dim 0 and handed to ``criterion``.

    Returns:
        Whatever ``criterion(flat_predictions, flat_labels)`` returns.
    """
    kept = [(pred_logit[row, :n_valid], label[row, :n_valid])
            for row, n_valid in enumerate(lengths)]
    flat_pred = torch.cat([pred_part for pred_part, _ in kept], 0)
    flat_label = torch.cat([label_part for _, label_part in kept], 0)
    assert flat_pred.size(0) == flat_label.size(0)
    return criterion(flat_pred, flat_label)
112 | 
def get_output_rel(pred_logit, input_ref):
    '''
    Split flat per-event predictions back into one list per batch item.

    input_ref: a list of integer, each integer says how many rows of
               pred_logit belong to that batch item
    pred_logit: a tensor (# of total events in a batch, num_class)

    output: a list of list of predicted class ids (argmax over dim 1);
            an item with 0 rows gets an empty list
    '''
    grouped = []
    start = 0
    for count in input_ref:
        stop = start + count
        if count == 0:
            grouped.append([])
        else:
            rows = pred_logit[start:stop]
            grouped.append(torch.argmax(rows, dim=1, keepdim=False).tolist())
        start = stop
    return grouped
129 | 
def get_loss_rel(gold_rel, pred_logit, criterion):
    '''
    Compute ``criterion`` between the flat predictions and the flattened gold labels.

    gold_rel: a list of list of gold labels (integer)
    pred_logit: a tensor (# of total events in a batch, num_class)

    The gold labels are concatenated in order into a long tensor created via
    ``pred_logit.new_tensor`` and passed with ``pred_logit`` to ``criterion``.
    '''
    flat_gold = []
    for per_item in gold_rel:
        flat_gold.extend(per_item)
    target = pred_logit.new_tensor(flat_gold, dtype=torch.long)
    return criterion(pred_logit, target)
138 | 
def eval_struct_score(y_trues_t, y_preds_t, y_trues_e, y_preds_e, y_pred_paired, sent_ids, test=True):
    """Score trigger tagging, argument tagging and end-to-end event extraction.

    Args:
        y_trues_t / y_preds_t: gold and predicted trigger label sequences.
        y_trues_e / y_preds_e: gold and predicted argument label sequences.
        y_pred_paired: per sentence, an iterable of events whose item 0 is the
            predicted trigger and item 1 the predicted arguments.
        sent_ids: sentence ids, parallel to ``y_pred_paired``.
        test: when True the predictions are compared with the test gold file,
            otherwise with the dev gold file.

    Returns:
        dict with the keys 'f1_tri', 'acc_tri', 'f1_arg', 'acc_arg',
        'precision_e2e', 'recall_e2e' and 'f1_e2e'.

    Side effects:
        Writes the predicted events to 'temp/end2end_event_level_arg_cls.pkl'
        (the directory is created when missing) and prints a progress line.
    """
    # trigger id score:
    assert len(y_trues_t) == len(y_preds_t)
    f1_tri = f1_score(y_trues_t, y_preds_t)
    acc_tri = accuracy_score(y_trues_t, y_preds_t)

    # sent-level argument id score:
    assert len(y_trues_e) == len(y_preds_e)
    f1_arg = f1_score(y_trues_e, y_preds_e)
    acc_arg = accuracy_score(y_trues_e, y_preds_e)
    # (the classification reports that used to be built here were never used)

    # end2end eval
    output_event = []
    for i in range(len(sent_ids)):
        for event in y_pred_paired[i]:
            output_event.append({'sent_id': sent_ids[i], 'pred_trigger': event[0], 'pred_arg': event[1]})
    pred_path = 'temp/end2end_event_level_arg_cls.pkl'
    # the dump used to fail on a fresh checkout where temp/ does not exist yet
    os.makedirs(os.path.dirname(pred_path), exist_ok=True)
    with open(pred_path, 'wb') as f:
        pickle.dump(output_event, f)
    print('temp pkl saved, start evaluation...')
    B2I_trigger = {'B-ANCHOR': 'I-ANCHOR'}
    B2I_arg = {'B-AGENT': 'I-AGENT', 'B-PATIENT': 'I-PATIENT'}
    if test:
        gold_path = 'out_pkl/gold_event_level_tri_arg_cls.pkl'
    else:
        gold_path = 'out_pkl/dev_gold_event_level_tri_arg_cls.pkl'
    prec, recall, f1 = eval_e2e_event_level_arg_id_cls(gold_path, pred_path,
                                                       B2I_trigger, B2I_arg)

    scores = {
        'f1_tri': f1_tri,
        'acc_tri': acc_tri,
        'f1_arg': f1_arg,
        'acc_arg': acc_arg,
        'precision_e2e': prec,
        'recall_e2e': recall,
        'f1_e2e': f1
    }
    return scores
181 | 


--------------------------------------------------------------------------------
/component/Duration/.gitignore:
--------------------------------------------------------------------------------
 1 | *stanford-corenlp*
 2 | dataset
 3 | # scripts/src/factslab
 4 | scripts/.ipynb_checkpoints
 5 | *__pycache__*
 6 | *.hdf5
 7 | # *.pth
 8 | .idea
 9 | *df.csv
10 | logs/


--------------------------------------------------------------------------------
/component/Duration/README.md:
--------------------------------------------------------------------------------
 1 | # UDS-T
 2 | 
 3 | Event Duration Baselines on UDS-T
 4 | 
 5 | > This repo provides baseline models and evaluations for 
  6 | time-duration classification on the UDS-T dataset.
 7 | 
 8 | ---
 9 | 
10 | ## Environment Setup
11 | ```
12 | conda create -n event_dur
13 | conda install pip
14 | pip install git+https://github.com/hltcoe/PredPatt.git
15 | pip install -r requirements.txt
16 | ```
17 | 
18 | 
19 | 
20 | ## Inference API
21 | 
22 | See requirements.txt for required dependencies. <br><br>
23 | Example for performing duration model inference.
24 | ```python
 25 | from inference_api import DurationAPI
 26 | 
 27 | # events_json = EventsModel(...)
 28 | # events = json.loads(events_json)  # Parse JSON string
 29 | results = DurationAPI().pred(events)  # list of dicts, one per event
30 | ```
31 | 
32 | Output JSON structure:
33 | ```json5
34 | [
35 |   {
36 |     'duration': 'days',
37 |     'pred_text': 'meeting',
38 |     'pred_idx': 13,
39 |     'sentence': 'There was a ...' 
40 |   },
41 | ]
42 | ```
43 | 
44 | 
45 | ---
46 | ## Models
47 | 
48 | 
49 | - ELMo-MLP Baseline 
50 |     <br>
51 |     - Under eval mode (torch.no_grad), consumes ~ 1.3GB GPU-RAM, for batch-size=1
52 | 
53 | <br>
54 | 
55 | - BERT/RoBERTa Baseline
56 | 
57 | 
58 | <br>
59 | 
60 | <br>
61 | 
62 | ---
63 | 
64 | ## Training
65 | 
66 | Run the following script for training:
67 | 
68 | 
69 | 
70 | ```bash
71 | $ python3 main.py \
72 | --mode train
73 | ```
74 | 
75 | 
76 | 
77 | <br>
78 | ---
79 | 
80 | *TO-DOs*
81 | 
82 | - [ ] Add BERT baseline
83 | 
84 | 
85 | 
86 | ## References
87 | [1]  [Fine-Grained Temporal Relation Extraction](https://www.aclweb.org/anthology/P19-1280/) <br>
88 | [2]  []() <br>
89 | 


--------------------------------------------------------------------------------
/component/Duration/UDS_T_data/first10.tsv:
--------------------------------------------------------------------------------
 1 | Split	Annotator.ID	Sentence1.ID	Pred1.Span	Pred1.Token	Event1.ID	Sentence2.ID	Pred2.Span	Pred2.Token	Event2.ID	Pred1.Text	Pred1.Lemma	Pred2.Text	Pred2.Lemma	Pred1.Duration	Pred2.Duration	Pred1.Beg	Pred1.End	Pred2.Beg	Pred2.End	Pred1.Duration.Confidence	Pred2.Duration.Confidence	Relation.Confidence	Document.ID
 2 | train	209	en-ud-train.conllu 418	17	17	en-ud-train.conllu 418_17	en-ud-train.conllu 418	18_19	19	en-ud-train.conllu 418_19	think	think	is intentional	intentional	2	0	35 	 41	64 	 65	4.0	4.0	4.0	10
 3 | train	209	en-ud-train.conllu 9490	51	51	en-ud-train.conllu 9490_51	en-ud-train.conllu 9490	53	53	en-ud-train.conllu 9490_53	grey	grey	dark	dark	6	6	0 	 65	0 	 65	4.0	4.0	4.0	189
 4 | train	209	en-ud-train.conllu 7143	8	8	en-ud-train.conllu 7143_8	en-ud-train.conllu 7144	11_12_13_14_15_18_19	14	en-ud-train.conllu 7144_14	employed	employ	be the greatest threats to...	threat	7	7	0 	 81	0 	 81	4.0	4.0	4.0	83
 5 | train	209	en-ud-train.conllu 7786	17	17	en-ud-train.conllu 7786_17	en-ud-train.conllu 7787	4_5_6_7_8_9	9	en-ud-train.conllu 7787_9	getting	get	is a good nano protein...	skimmer	2	10	35 	 44	0 	 100	4.0	4.0	4.0	126
 6 | train	209	en-ud-train.conllu 5729	4	4	en-ud-train.conllu 5729_4	en-ud-train.conllu 5729	36	36	en-ud-train.conllu 5729_36	solicit	solicit	served	serve	2	7	36 	 42	0 	 26	4.0	4.0	4.0	65
 7 | train	508	en-ud-train.conllu 6416	8	8	en-ud-train.conllu 6416_8	en-ud-train.conllu 6416	32_33_34_35_36	35	en-ud-train.conllu 6416_35	switched	switch	being a net source of	source	7	7	21 	 79	40 	 67	2.0	2.0	2.0	77
 8 | train	508	en-ud-train.conllu 11326	32	32	en-ud-train.conllu 11326_32	en-ud-train.conllu 11326	40	40	en-ud-train.conllu 11326_40	go	go	came	come	4	3	28 	 74	68 	 88	2.0	2.0	2.0	439
 9 | train	508	en-ud-train.conllu 11550	1_2_3_4_5_6	5	en-ud-train.conllu 11550_5	en-ud-train.conllu 11551	14	14	en-ud-train.conllu 11551_14	was a very trying time...	time	made	make	4	4	21 	 71	42 	 54	2.0	2.0	2.0	465
10 | train	508	en-ud-train.conllu 8062	5	5	en-ud-train.conllu 8062_5	en-ud-train.conllu 8062	7	7	en-ud-train.conllu 8062_7	get	get	warm	warm	3	3	15 	 78	43 	 63	2.0	2.0	2.0	141
11 | train	508	en-ud-train.conllu 12252	25	25	en-ud-train.conllu 12252_25	en-ud-train.conllu 12252	30	30	en-ud-train.conllu 12252_30	show	show	charge	charge	3	2	26 	 58	40 	 51	2.0	2.0	2.0	527
12 | 


--------------------------------------------------------------------------------
/component/Duration/inference_api.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import sys
  3 | import os
  4 | import json
  5 | import pandas as pd
  6 | from torch.utils.data import DataLoader
  7 | from os.path import dirname, abspath
  8 | parent_dir = dirname(dirname(abspath(__file__)))
  9 | sys.path.insert(0, parent_dir)
 10 | from utils_duration import str2bool, compute_predictions, idx2label
 11 | from preprocess import TempEveDataset
 12 | # from allennlp.commands.elmo import ElmoEmbedder
 13 | from allennlp.modules.elmo import Elmo
 14 | from .scripts.src.factslab.factslab.pytorch.temporalmodule import TemporalModel
 15 | 
 16 | class DurationAPI:
 17 |     def __init__(self, base_dir = '.', gpu_id=-1):
 18 |         """
 19 |         :param int gpu_id: cuda device id (optional); default - cpu
 20 |         """
 21 |         self.base_dir = base_dir
 22 |         device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() and gpu_id != -1 else 'cpu')
 23 | 
 24 |         # Model Configs
 25 |         options_file = os.path.join(base_dir, "./scripts/elmo_files/elmo_2x4096_512_2048cnn_2xhighway_options.json")
 26 |         weight_file = os.path.join(base_dir, "./scripts/elmo_files/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5")
 27 | 
 28 |         model_ckpt = os.path.join(base_dir, "./model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth")
 29 |         file_name = model_ckpt.split('/')[-1]
 30 | 
 31 |         tokens = file_name.split("_")
 32 |         eventatt = tokens[1]
 33 |         duratt = tokens[2]
 34 |         relatt = tokens[3]
 35 |         concat_fine_to_dur = str2bool(tokens[-8])
 36 |         concat_dur_to_fine = str2bool(tokens[-7])
 37 |         fine_2_dur = str2bool(tokens[-6])
 38 |         dur_2_fine = str2bool(tokens[-5])
 39 |         weight = float(tokens[-4])
 40 |         drop = float(tokens[-3])
 41 |         activ = tokens[-2]
 42 |         bino_bool = str2bool(tokens[-1].split(".")[0])
 43 | 
 44 |         # coarse_size = int(tokens[-1].split(".")[0])
 45 | 
 46 |         print("Eventatt: {}, Duratt: {}, Relatt: {}, Dropout: {}, Activation: {}, Binomial: {}, "
 47 |             "concat_fine2dur: {}, concat_dur2fine:{}, fine_to_dur: {}, dur_to_fine: {} \n"
 48 |             .format(eventatt, duratt, relatt, drop, activ, bino_bool,
 49 |                     concat_fine_to_dur, concat_dur_to_fine, fine_2_dur, dur_2_fine))
 50 | 
 51 |         self.batch_size = 1
 52 |         self.num_workers = 1
 53 | 
 54 |         # Model
 55 |         self.model = TemporalModel(embedding_size=1024, duration_distr=bino_bool,
 56 |                             # elmo_class=
 57 |                             mlp_dropout=drop, mlp_activation=activ, tune_embed_size=256, event_attention=eventatt,
 58 |                             dur_attention=duratt, rel_attention=relatt, concat_fine_to_dur=concat_fine_to_dur,
 59 |                             concat_dur_to_fine=concat_dur_to_fine, fine_to_dur=fine_2_dur, dur_to_fine=dur_2_fine,
 60 |                             fine_squash=True, baseline=False, dur_MLP_sizes=[128], fine_MLP_sizes=[128],
 61 |                             dur_output_size=11, fine_output_size=4, device=device)
 62 |         
 63 |         self.model.to(device)
 64 | 
 65 |         # Load model weights
 66 |         checkpoint = torch.load(model_ckpt, map_location=device)
 67 |         self.model.load_state_dict(checkpoint)
 68 |         self.model.elmo_class = Elmo(options_file, weight_file, num_output_representations=3)
 69 |     
 70 |     def pred(self, events):
 71 |         """
 72 |         Model inference for ELMo baseline, given Events JSON
 73 | 
 74 |         :param list[dict] events: list of sentences and extracted event-triggers (within dict)
 75 |         :return: json containing event-duration as list of dict
 76 |         :rtype: str
 77 |         """
 78 |         # Dataloader
 79 |         test_dataset = TempEveDataset(events)
 80 | 
 81 |         test_loader = DataLoader(test_dataset, self.batch_size, num_workers=self.num_workers, drop_last=False)
 82 | 
 83 |         # Inference
 84 |         outputs = compute_predictions(self.model, test_loader)
 85 | 
 86 |         # DataFrame
 87 |         df_out = pd.DataFrame(outputs)
 88 |         df_out = df_out[['p1_dur', 'root_text', 'root_idx', 'sentence']]
 89 | 
 90 |         df_out.rename(columns={'p1_dur': 'duration',
 91 |                             'root_text': 'pred_text',
 92 |                             'root_idx': 'pred_idx'}, inplace=True)
 93 | 
 94 |         # Map duration index to label
 95 |         df_out['duration'] = df_out['duration'].apply(lambda idx: idx2label[idx])
 96 | 
 97 |         json_str = df_out.to_json(orient='records')
 98 | 
 99 |         # Parse json string to List[dict]
100 |         json_obj = json.loads(json_str)
101 |         return json_obj
102 | 
103 | 
if __name__ == '__main__':
    # Demo: run the duration model on the first two records of a sample file.
    # Input
    json_file = './Mu_test_data/dev_tbd.pred.json'

    # Read json file (the context manager closes the handle that used to leak)
    with open(json_file) as json_fh:
        events_input = json.load(json_fh)

    # For demo, input is obtained from Mu's Event model,
    # thus first decode the json string as follows:
    # events_input = json.loads(events_json_str)   # str --> List[dict]
    # result = predict_duration_elmo(events_input, gpu_id=0)
    events_input = events_input[:2]
    print(events_input)

    api = DurationAPI()
    result = api.pred(events_input)

    print(result)
122 | 


--------------------------------------------------------------------------------
/component/Duration/input_data/sample_document.txt:
--------------------------------------------------------------------------------
 1 | Before the arrival of Keep, which Google launched this week, there was no default note-taking app for Android.
 2 | It was a glaring hole, considering that Apple's iPhone has built-in Notes and Reminders apps that can be powered by Siri.
 3 | Instead of settling for a bare bones app to fill the void, the search giant took things one step further.
 4 | Keep isn't simply just a place to bank whatever random half-thoughts come to mind: Users can construct to-do lists, stash photos, and color code your notes -- all in one well-designed and easy-to-use interface.
 5 | The second you log anything into your phone, it is also accessible from a PC Web browser via Google Drive.
 6 | Alternatively, you can save things while working on your computer, and it will instantly appear on your phone, ready for use while on the go.
 7 | The design may not be as progressive as the to-do app Clear, but Keep makes up for that in its simplicity and efficiency.
 8 | Everything in Keep is presented like a Microsoft (MSFT, Fortune 500) Windows Phone-esque stream of tiles.
 9 | Swiping left or right will archive those notes you no longer need (but don't want to erase entirely).
10 | At the top of the app is a text entry field that serves as your main point of entry for all new notes.
11 | And when viewing any specific note, tapping any part of that note (title, body, etc.)
12 | will allow you to edit it.
13 | The entire experience is frictionless.
14 | That said, it's not going to conquer the world quite yet.
15 | Organization options are limited -- color coding is your only choice, and you can't re-order your notes.
16 | Sharing with others is mostly limited to email and Google+, and the desktop features are pretty bare bones.
17 | But that's more a function of it being new, rather than poorly thought out.
18 | Like most things Google, expect the company to flesh out Keep over time and really turn it into our personal internet junk drawer.
19 | It's easy to foresee the day the when users will be able to send anything from their Web browser or Maps directly to Keep.
20 | The prospect of Keep incorporating features of services such as Pinterest or Pocket, or even making it easy to catalog streaming media, could turn it into something big.
21 | That should scare Evernote.
22 | Keep is not the reinvention of the wheel in any aspect -- there are a plethora of third-party apps already available for Android.
23 | But it is a well-exectuted refinement.
24 | In filling a minor, but important gap in its mobile ecosystem, Google gives the competition one less claim of superiority over Android.


--------------------------------------------------------------------------------
/component/Duration/input_data/sample_fig2.txt:
--------------------------------------------------------------------------------
1 | What to feed my dog after gastroenteritis? My dog has been sick for about 3 days now.


--------------------------------------------------------------------------------
/component/Duration/input_data_conllu/sample_fig2.txt.output:
--------------------------------------------------------------------------------
 1 | 1	What	what	_	WP	_	3	dep	_	_
 2 | 2	to	to	_	TO	_	3	mark	_	_
 3 | 3	feed	feed	_	VB	_	0	root	_	_
 4 | 4	my	my	_	PRP$	_	5	nmod:poss	_	_
 5 | 5	dog	dog	_	NN	_	3	dobj	_	_
 6 | 6	after	after	_	IN	_	7	case	_	_
 7 | 7	gastroenteritis	gastroenteritis	_	NN	_	3	nmod	_	_
 8 | 8	?	?	_	.	_	3	punct	_	_
 9 | 
10 | 1	My	my	_	PRP$	_	2	nmod:poss	_	_
11 | 2	dog	dog	_	NN	_	5	nsubj	_	_
12 | 3	has	have	_	VBZ	_	5	aux	_	_
13 | 4	been	be	_	VBN	_	5	cop	_	_
14 | 5	sick	sick	_	JJ	_	0	root	_	_
15 | 6	for	for	_	IN	_	9	case	_	_
16 | 7	about	about	_	IN	_	9	case	_	_
17 | 8	3	3	_	CD	_	9	nummod	_	_
18 | 9	days	day	_	NNS	_	5	nmod	_	_
19 | 10	now	now	_	RB	_	5	advmod	_	_
20 | 11	.	.	_	.	_	5	punct	_	_
21 | 
22 | 


--------------------------------------------------------------------------------
/component/Duration/model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/Duration/model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth


--------------------------------------------------------------------------------
/component/Duration/predictions/.~lock.sample_document.txt.output_timeline.csv#:
--------------------------------------------------------------------------------
1 | ,sidvash,lambda-quad,25.03.2019 21:49,file:///home/sidvash/.config/libreoffice/4;


--------------------------------------------------------------------------------
/component/Duration/predictions/README_predictions.txt:
--------------------------------------------------------------------------------
 1 | ## Data dictionary for sample_document.txt.output_predictions.csv:
 2 | 
 3 | 1. Each row corresponds to an event-pair in a sentence.
 4 | 
 5 | 2. sent_pred_id1: filename sent_id pred_position
 6 | 	For eg (row 1): sample_document.txt.output 1 4 denotes predicate at the 4th position (index starting at 0) in the 1st sentence of 'sample_document.txt.output' file.
 7 | 
 8 | 3. sent_pred_id2: same as above
 9 | 	For eg (row 1): sample_document.txt.output 1 8 denotes predicate at the 8th position (index starting at 0) in the 1st sentence of 'sample_document.txt.output' file.
10 | 
11 | Note that each row carries two sentence ids because the model input is the concatenation of the sentence named in sent_pred_id1 and the sentence that immediately follows it in the document, so the second predicate may belong to either of those two sentences.
12 | 
13 | Examples:
14 | The 1st row in the csv file has:
15 | sent_pred_id1: sample_document.txt.output 1 4
16 | sent_pred_id2: sample_document.txt.output 1 8
17 | 
18 | which denotes that both the predicates in the predicate-pair are being considered from the 1st sentence and are at 4th and 8th position.
19 | 
20 | The 4th row in the csv file has:
21 | sent_pred_id1: sample_document.txt.output 1 13
22 | sent_pred_id2: sample_document.txt.output 2 4
23 | 
24 | which denotes that the first predicate is at the 13th position in the 1st sentence and the second predicate is at the 4th position in the 2nd sentence in the document.
25 | 
26 | 4. B1: beginning point of the first predicate
27 | 
28 | 5. E1: end point of the first predicate
29 | 
30 | 6. B2: beginning point of the second predicate
31 | 
32 | 7. E2: end point of the second predicate
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/component/Duration/predictions/sample_document.txt.output_timeline.csv:
--------------------------------------------------------------------------------
 1 | start_pt,duration,sent_pred_id,pred_text
 2 | 0.22953895,0.23559411,sample_document.txt.output 1 13,was
 3 | 0.1616388,0.90549994,sample_document.txt.output 1 4,Keep
 4 | 0.287634,0.7549082,sample_document.txt.output 1 8,launched
 5 | 1.072766,0.103116445,sample_document.txt.output 10 12,serves
 6 | 1.0222178,0.11418431,sample_document.txt.output 10 6,is
 7 | 1.06561,0.074647896,sample_document.txt.output 11 2,viewing
 8 | 1.1178255,0.07104552,sample_document.txt.output 11 21,allow
 9 | 1.1504356,0.050872874,sample_document.txt.output 11 24,edit
10 | 1.0895916,0.07437399,sample_document.txt.output 11 7,tapping
11 | 0.0,1.3020271,sample_document.txt.output 12 4,is frictionless
12 | 0.49332798,0.033563256,sample_document.txt.output 13 1,said
13 | 0.48924226,0.23515902,sample_document.txt.output 13 6,going
14 | 0.32785034,1.1066597,sample_document.txt.output 13 8,conquer
15 | 0.49388248,0.10346455,sample_document.txt.output 14 10,is only choice
16 | 0.57775855,0.0680006,sample_document.txt.output 14 16,re-order
17 | 0.29515114,0.36924908,sample_document.txt.output 14 3,limited
18 | 0.31241438,0.39484847,sample_document.txt.output 15 10,+
19 | 0.41692692,0.27291024,sample_document.txt.output 15 19,are pretty bare bones
20 | 0.12191195,1.6530606,sample_document.txt.output 15 5,limited
21 | 0.3450333,0.27553135,sample_document.txt.output 15 7,email
22 | 0.4190132,0.33701715,sample_document.txt.output 16 14,thought
23 | 0.33446252,0.4187518,sample_document.txt.output 16 5,'s more a function of
24 | 0.33700866,0.42752492,sample_document.txt.output 16 9,being new
25 | 0.68224037,0.13986059,sample_document.txt.output 17 16,turn
26 | 0.47907346,0.23535627,sample_document.txt.output 17 5,expect
27 | 0.6549459,0.18746355,sample_document.txt.output 18 12,be able
28 | 0.7119201,0.13109462,sample_document.txt.output 18 14,send
29 | 0.5940464,0.18792449,sample_document.txt.output 18 2,'s easy
30 | 0.72883064,0.2629165,sample_document.txt.output 18 24,Keep
31 | 0.5958486,0.21551259,sample_document.txt.output 18 4,foresee
32 | 0.6627697,0.3056006,sample_document.txt.output 19 16,making
33 | 0.67700034,0.33953798,sample_document.txt.output 19 18,easy
34 | 0.8993714,0.21139722,sample_document.txt.output 19 25,turn
35 | 0.60749745,0.30814624,sample_document.txt.output 19 3,Keep
36 | 0.61776954,0.30698663,sample_document.txt.output 19 4,incorporating
37 | 0.23025304,0.4327411,sample_document.txt.output 2 11,has
38 | 0.2912727,0.39179614,sample_document.txt.output 2 20,powered
39 | 0.20970988,0.35870302,sample_document.txt.output 2 4,was a glaring hole
40 | 0.33967128,0.18600482,sample_document.txt.output 2 6,considering
41 | 0.9585166,0.3046474,sample_document.txt.output 20 2,scare
42 | 1.0286418,1.1654844,sample_document.txt.output 21 13,are
43 | 1.0180484,0.9873926,sample_document.txt.output 21 4,is not the reinvention of
44 | 1.5583814,0.70114505,sample_document.txt.output 22 5,is a well-exectuted refinement
45 | 1.7318684,0.52950096,sample_document.txt.output 23 1,filling
46 | 1.9559892,0.44548014,sample_document.txt.output 23 14,gives
47 | 0.5044588,0.21667121,sample_document.txt.output 3 16,took
48 | 0.35501373,0.18694507,sample_document.txt.output 3 2,settling
49 | 0.42702043,0.20121588,sample_document.txt.output 3 9,fill
50 | 0.54812557,0.27332816,sample_document.txt.output 4 18,construct
51 | 0.6833602,0.16512632,sample_document.txt.output 4 27,code
52 | 0.5855998,0.3608075,sample_document.txt.output 4 34,well-designed
53 | 0.73807853,0.27333945,sample_document.txt.output 4 37,interface
54 | 0.43750316,0.43180317,sample_document.txt.output 4 6,is n't simply just a...
55 | 0.6779917,0.23474711,sample_document.txt.output 5 12,is also accessible from
56 | 0.75836116,0.044174373,sample_document.txt.output 5 3,log
57 | 0.84418917,0.14313164,sample_document.txt.output 6 16,appear
58 | 0.8906975,0.15871912,sample_document.txt.output 6 27,go
59 | 0.6935278,0.20407178,sample_document.txt.output 6 4,save
60 | 0.71397096,0.23956096,sample_document.txt.output 6 7,working
61 | 0.81361145,0.19291249,sample_document.txt.output 7 14,Keep
62 | 0.9003202,0.19139186,sample_document.txt.output 7 15,makes
63 | 0.5575306,0.497628,sample_document.txt.output 7 6,be as progressive as
64 | 0.70687485,0.3683506,sample_document.txt.output 8 2,Keep
65 | 0.7990323,0.26095173,sample_document.txt.output 8 4,presented
66 | 0.96460164,0.116692804,sample_document.txt.output 9 0,Swiping
67 | 


--------------------------------------------------------------------------------
/component/Duration/predictions_new/sample_document.txt.output_timeline.csv:
--------------------------------------------------------------------------------
 1 | start_pt,duration,sent_pred_id,pred_text
 2 | 0.1659592,0.3867714,sample_document.txt.output 1 13,was
 3 | 0.057858784,0.38209978,sample_document.txt.output 1 4,Keep
 4 | 0.11140874,0.22814499,sample_document.txt.output 1 8,launched
 5 | 0.58783954,0.24757639,sample_document.txt.output 10 12,serves
 6 | 0.4584901,0.28038725,sample_document.txt.output 10 6,is
 7 | 0.56658405,0.18215424,sample_document.txt.output 11 2,viewing
 8 | 0.692741,0.1787399,sample_document.txt.output 11 21,allow
 9 | 0.75968033,0.14175409,sample_document.txt.output 11 24,edit
10 | 0.61380315,0.18479265,sample_document.txt.output 11 7,tapping
11 | 0.5091141,0.57216734,sample_document.txt.output 12 4,is frictionless
12 | 0.7279427,0.09699013,sample_document.txt.output 13 1,said
13 | 0.71798456,0.3629794,sample_document.txt.output 13 6,going
14 | 0.6839229,0.6700831,sample_document.txt.output 13 8,conquer
15 | 0.7431355,0.25156957,sample_document.txt.output 14 10,is only choice
16 | 0.94288635,0.16647394,sample_document.txt.output 14 16,re-order
17 | 0.40678498,0.6119435,sample_document.txt.output 14 3,limited
18 | 0.6178697,0.13728026,sample_document.txt.output 15 10,+
19 | 0.47185704,0.5922663,sample_document.txt.output 15 19,are pretty bare bones
20 | 0.38864532,0.66203797,sample_document.txt.output 15 5,limited
21 | 0.49253073,0.33022714,sample_document.txt.output 15 7,email
22 | 0.608893,0.28437492,sample_document.txt.output 16 14,thought
23 | 0.44145197,0.5678541,sample_document.txt.output 16 5,'s more a function of
24 | 0.5642646,0.3093289,sample_document.txt.output 16 9,being new
25 | 0.89284486,0.2879107,sample_document.txt.output 17 16,turn
26 | 0.63449514,0.3744282,sample_document.txt.output 17 5,expect
27 | 0.91353405,0.3282676,sample_document.txt.output 18 12,be able
28 | 1.0137923,0.22846185,sample_document.txt.output 18 14,send
29 | 0.82106173,0.3076777,sample_document.txt.output 18 2,'s easy
30 | 1.0391245,0.4751686,sample_document.txt.output 18 24,Keep
31 | 0.8270937,0.35527292,sample_document.txt.output 18 4,foresee
32 | 0.93579674,0.48665285,sample_document.txt.output 19 16,making
33 | 0.97048026,0.5357981,sample_document.txt.output 19 18,easy
34 | 1.2939894,0.32960615,sample_document.txt.output 19 25,turn
35 | 0.84815556,0.50060415,sample_document.txt.output 19 3,Keep
36 | 0.86849916,0.49890476,sample_document.txt.output 19 4,incorporating
37 | 0.1884663,0.6893078,sample_document.txt.output 2 11,has
38 | 0.34253854,0.5686484,sample_document.txt.output 2 20,powered
39 | 0.15306628,0.5725143,sample_document.txt.output 2 4,was a glaring hole
40 | 0.35936135,0.28860942,sample_document.txt.output 2 6,considering
41 | 1.3718944,0.5206325,sample_document.txt.output 20 2,scare
42 | 1.3775618,0.86497575,sample_document.txt.output 21 13,are
43 | 1.3708651,0.76067257,sample_document.txt.output 21 4,is not the reinvention of
44 | 1.6471236,0.64427495,sample_document.txt.output 22 5,is a well-exectuted refinement
45 | 1.806507,0.48694935,sample_document.txt.output 23 1,filling
46 | 2.0096083,0.41171712,sample_document.txt.output 23 14,gives
47 | 0.65977967,0.36332253,sample_document.txt.output 3 16,took
48 | 0.40279114,0.32262275,sample_document.txt.output 3 2,settling
49 | 0.5145695,0.34815133,sample_document.txt.output 3 9,fill
50 | 0.72906995,0.5002725,sample_document.txt.output 4 18,construct
51 | 0.9556468,0.3318769,sample_document.txt.output 4 27,code
52 | 0.8607931,0.5956468,sample_document.txt.output 4 34,well-designed
53 | 1.0808307,0.48955455,sample_document.txt.output 4 37,interface
54 | 0.6597389,0.611886,sample_document.txt.output 4 6,is n't simply just a...
55 | 0.18994676,0.42184663,sample_document.txt.output 5 12,is also accessible from
56 | 1.1183183,0.22149895,sample_document.txt.output 5 3,log
57 | 0.47176263,0.2754843,sample_document.txt.output 6 16,appear
58 | 0.5010931,0.37027907,sample_document.txt.output 6 27,go
59 | 0.21596201,0.36138824,sample_document.txt.output 6 4,save
60 | 0.2529694,0.42897764,sample_document.txt.output 6 7,working
61 | 0.42659408,0.32614288,sample_document.txt.output 7 14,Keep
62 | 0.56993645,0.32825184,sample_document.txt.output 7 15,makes
63 | 0.0,0.83541936,sample_document.txt.output 7 6,be as progressive as
64 | 0.18418065,0.37936413,sample_document.txt.output 8 2,Keep
65 | 0.2485228,0.35295543,sample_document.txt.output 8 4,presented
66 | 0.3916087,0.23731099,sample_document.txt.output 9 0,Swiping
67 | 


--------------------------------------------------------------------------------
/component/Duration/predictions_new/sample_fig2.txt.output_predictions.csv:
--------------------------------------------------------------------------------
1 | sent_pred_id1,sent_pred_id2,b1,e1,b2,e2,pred1_duration,pred2_duration,pred1_text,pred2_text,pred1_dict_idx,pred2_dict_idx
2 | sample_fig2.txt.output 1 2,sample_fig2.txt.output 2 4,0.0,0.0,0.0,0.0,0,0,feed,been sick for about now,0,1
3 | 


--------------------------------------------------------------------------------
/component/Duration/predictions_new/sample_fig2.txt.output_timeline.csv:
--------------------------------------------------------------------------------
1 | start_pt,duration,sent_pred_id,pred_text
2 | 0.0,1.3665516e-05,sample_fig2.txt.output 1 2,feed
3 | 0.0776875,0.0031946814,sample_fig2.txt.output 2 4,been sick for about now
4 | 


--------------------------------------------------------------------------------
/component/Duration/preprocess.py:
--------------------------------------------------------------------------------
  1 | from predpatt import PredPatt
  2 | import json
  3 | from torch.utils.data import Dataset, DataLoader
  4 | ''' Json input format:
  5 |     [
  6 |         {
  7 |             "tokens": ["word_0", "word_1", ...],
  8 |             "events": [
  9 |                 {
 10 |                     "event_type": "Movement:Transport",
 11 |                     "triggers": [{
 12 |                         "event_type": "Movement:Transport", 
 13 |                         "text": "deploy", 
 14 |                         "start_token": 5, 
 15 |                         "end_token": 5
 16 |                         }],
 17 |                     ...
 18 |                 },
 19 |                 ...
 20 |             ],
 21 |             "ner": [[]]
 22 |         },
 23 |         ...
 24 |     ]
 25 | '''
 26 | 
 27 | 
 28 | def predicate_info(predicate):
 29 |     '''
 30 |     Author: sidvash <sidsvash26@gmail.com>
 31 | 
 32 |     Input: predicate object
 33 |     Output: pred_text, token, root_token
 34 |     
 35 |     Note: If predicate is copular: pred_text is only upto first 5 words
 36 |     '''      
 37 |     copula_bool = False
 38 |     
 39 |     #Extend predicate to start from the copula
 40 |     if predicate.root.tag not in ["VERB", "AUX"]:
 41 |         all_pred = predicate.tokens
 42 |         gov_rels = [tok.gov_rel for tok in all_pred]
 43 |         if 'cop' in gov_rels:
 44 |             copula_bool = True
 45 |             cop_pos = gov_rels.index('cop')
 46 |             pred = [x.text for x in all_pred[cop_pos:]]
 47 |             pred_token = [x.position for x in all_pred[cop_pos:]]
 48 |             def_pred_token = predicate.root.position  #needed for it_happen set
 49 |             cop_bool = True  
 50 |             #print(predicate, idx)
 51 |             
 52 |         elif predicate.root.tag == "ADJ":
 53 |             pred_token = [predicate.root.position]
 54 |             pred = [predicate.root.text]
 55 |             def_pred_token = predicate.root.position
 56 |         else: ## Different from protocol as we are considering all predicates
 57 |             pred_token = [predicate.root.position]
 58 |             pred = [predicate.root.text]
 59 |             def_pred_token = predicate.root.position
 60 |             
 61 |     #Else keep the root        
 62 |     else:
 63 |         pred_token = [predicate.root.position]
 64 |         pred = [predicate.root.text]
 65 |         def_pred_token = predicate.root.position 
 66 | 
 67 |     #Stringify pred and pred_tokens:
 68 |     #pred_token = "_".join(map(str, pred_token))
 69 | 
 70 |     if len(pred)>5:
 71 |         pred = pred[:5]
 72 |         pred = " ".join(pred) + "..."
 73 |     else:
 74 |         pred = " ".join(pred)
 75 |     
 76 |     return pred, pred_token, def_pred_token
 77 | 
 78 | 
 79 | def extract_pp_obj_instance(pp_obj_instance, pp_obj):
 80 |     _, span_idx_list, root_idx = predicate_info(pp_obj_instance)
 81 |     word_tokens = [token.text for token in pp_obj.tokens]
 82 |     span_text   = ' '.join([pp_obj.tokens[i].text for i in span_idx_list])
 83 |     root_text   = pp_obj.tokens[root_idx].text
 84 |     return word_tokens, span_text, span_idx_list, root_text, root_idx
 85 | 
 86 | 
 87 | class TempEveDataset(Dataset):
 88 |     def __init__(self, json_filename, from_UDST_dataset = False, from_pipeline = True):
 89 |         
 90 |         self.sentences_wordlist = [] # list of string list
 91 |         self.spans = []              # list of string
 92 |         self.spans_idx = []          # list of int list
 93 |         self.roots = []              # list of string
 94 |         self.roots_idx = []          # list of int
 95 |         
 96 |         if from_pipeline:
 97 |             if type(json_filename) == str:
 98 |                 json_objs = json.load(open(json_filename))
 99 |             else:
100 |                 json_objs = json_filename
101 | 
102 |             print("json file size:", len(json_objs))
103 | 
104 |             for obj in json_objs:
105 |                 
106 |                 if len(obj['events']) > 0: # detected events
107 |                     for event in obj['events']:
108 |                         for trigger in event['triggers']:
109 |                             self.sentences_wordlist.append(obj['tokens'])
110 |                             self.spans.append(trigger['text'])
111 |                             self.spans_idx.append(list(range(int(trigger['start_token']), int(trigger['end_token']) + 1))) # seems like one-word span only but just to make sure
112 |                             self.roots.append(trigger['text'].split()[0]) # seems like one-word only but just to make sure
113 |                             self.roots_idx.append(int(trigger['start_token']))
114 |                 # else: # no events detected
115 |                 #     pass
116 |                 #     # some odd error with "\""", see main below for examples
117 |                 #     if len(obj['tokens']) > 3: # if truly a sentence but without trigger/event detected from event extraction
118 |                 #         # try with predpatt
119 |                 #         sentence = " ".join(obj['tokens'])
120 |                 #         print(sentence)
121 |                 #         pp_obj = PredPatt.from_sentence(sentence) # https://github.com/hltcoe/PredPatt/blob/5ce4b88c4678dcf7c99a6b0377e0f641701b8390/predpatt/patt.py#L376
122 |                 #         if len(pp_obj.instances) > 0:
123 |                 #             for pp_obj_instance in pp_obj.instances:
124 |                 #                 word_tokens, span_text, span_idx_list, root_text, root_idx = extract_pp_obj_instance(pp_obj_instance, pp_obj)
125 |                 #                 self.sentences_wordlist.append(word_tokens)
126 |                 #                 self.spans.append(span_text)
127 |                 #                 self.spans_idx.append(span_idx_list)
128 |                 #                 self.roots.append(root_text)
129 |                 #                 self.roots_idx.append(root_idx)
130 |                     #     else:
131 |                     #         # do nothing, filter it outs
132 |                     #         pass
133 |                     # else:
134 |                     #     # append entire sentence? no, do nothing (filter it out)
135 |                     #     pass
136 | 
137 |         elif from_UDST_dataset:
138 |             pass
139 | 
140 |     def __len__(self):
141 |         return len(self.sentences_wordlist)
142 | 
143 |     def __getitem__(self, index):
144 |         return {
145 |             "words_list": self.sentences_wordlist[index],
146 |             "span_text": self.spans[index],
147 |             "root_text": self.roots[index],
148 |             "span_idx_list": self.spans_idx[index],
149 |             "root_idx": self.roots_idx[index]
150 |         }
151 | 
152 | 
if __name__ == "__main__":
    # Manual smoke test: load the pipeline output, show its size and first
    # item, then pull one batch of four through a DataLoader.
    #
    # Debugging notes kept from earlier PredPatt.from_sentence experiments
    # (the sentence was built with " ".join(word_list)):
    #   orig: ["New", "Questions", "About", "Attacking", "Iraq", ";", "Is", "Torturing", "Terrorists", "Necessary", "?"]
    #   good: ["New", "Questions", "About", "Attacking", "\"", "Iraq", ";", "Is", "Torturing", "Terrorists", "Necessary", "?"]
    #   bad:  ["New", "Questions", "About", "Attacking", "Iraq", ";", "Is", "Torturing", "Terrorists", "Necessary", "\"", "?"]
    #   also tried: ["Why", "do", "we", "have", "to", "learn", "it", "from", "\"", "Newsweek", "\"", "?"]
    #   Recorded symptom -- odd "\n"", error: "KeyError: 1"; inside the
    #   JPyoeBackend indices_to_words had length 0 while index was 1.
    #   Each predicate was inspected with:
    #       span, span_idx_list, root_idx = predicate_info(predicates)
    #       print(' '.join([pp_obj.tokens[i].text for i in span_idx_list]))
    #       print(pp_obj.tokens[root_idx].text)

    demo_dataset = TempEveDataset("mu_dev_out.json", False, True)
    print("dataset size:", len(demo_dataset))
    print("data sample:", demo_dataset[0])

    batch_iterator = iter(DataLoader(demo_dataset, batch_size=4))
    first_batch = next(batch_iterator)

    print(first_batch)
194 | 


--------------------------------------------------------------------------------
/component/Duration/readme_eval.txt:
--------------------------------------------------------------------------------
 1 | #### Steps to create a document timeline for an input document ###
 2 | 
 3 | 1. Put all the input document files into the "input_data" folder. Note that each document file should have sentences separated by a "\n". A "sample_document.txt" file is already present as a reference for an input document file.
 4 | 
 5 | 2. From the terminal, change the current directory to the "scripts" folder and run the following command:
 6 | 	bash run_input_data.sh
 7 | 
 8 | 3. The predictions of all the input document files will be written to the predictions folder:
 9 |     - [input_doc_filename]_timeline.csv  (contains the document timeline)
10 |     - [input_doc_filename]_predictions.csv (contains the relative timelines and predicate durations)
11 | 
12 | 
13 | The mappings for durations are as follows:
14 | 0-inst
15 | 1-secs
16 | 2-mins
17 | 3-hrs
18 | 4-days
19 | 5-weeks
20 | 6-mnths
21 | 7-yrs
22 | 8-decs
23 | 9-cents
24 | 10-forever
25 | 
26 | 
27 | For a detailed description of the protocols, datasets, as well as models of these data, please see the following paper:
28 | Vashishtha, S., B. Van Durme, & A.S. White. 2019. Fine-Grained Temporal Relation Extraction. arXiv:1902.01390 [cs.CL].  (https://arxiv.org/abs/1902.01390)


--------------------------------------------------------------------------------
/component/Duration/requirements.txt:
--------------------------------------------------------------------------------
 1 | allennlp==1.0.0
 2 | matplotlib==3.1.1
 3 | nltk==3.4.5
 4 | torch==1.5.0
 5 | tqdm==4.45.0
 6 | # predpatt==1.0
 7 | numpy==1.17.1
 8 | pandas==0.25.1
 9 | scikit_learn==0.23.1
10 | 


--------------------------------------------------------------------------------
/component/Duration/run_jupyter.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | jupyter notebook --no-browser --allow-root --port=7745 --NotebookApp.token='temporal'


--------------------------------------------------------------------------------
/component/Duration/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/Duration/scripts/__init__.py


--------------------------------------------------------------------------------
/component/Duration/scripts/elmo_files/elmo_2x4096_512_2048cnn_2xhighway_options.json:
--------------------------------------------------------------------------------
1 | {"lstm": {"use_skip_connections": true, "projection_dim": 512, "cell_clip": 3, "proj_clip": 3, "dim": 4096, "n_layers": 2}, "char_cnn": {"activation": "relu", "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], "n_highway": 2, "embedding": {"dim": 16}, "n_characters": 262, "max_characters_per_token": 50}}
2 | 


--------------------------------------------------------------------------------
/component/Duration/scripts/run_document_timeline.bash:
--------------------------------------------------------------------------------
 1 | #!/bin/bash  
 2 | #base_dir=$(pwd)
 3 | base_dir=$(cd ../ && pwd)
 4 | cd ../stanford-corenlp-full-2018-10-05
 5 | 
 6 | #docname="sample_document.txt"
 7 | java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP [ -props sampleProps.properties ] -file ../input_data/$1 -outputFormat conllu
 8 | mv *.output $base_dir/input_data_conllu/
 9 | 
10 | cd ../scripts
11 | python run_model.py -doc ../input_data_conllu/$1.output -gpu 0 -out ../predictions
12 | 
13 | 


--------------------------------------------------------------------------------
/component/Duration/scripts/run_input_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the document-timeline pipeline on every file in ../input_data.
# Must be started from the "scripts" directory.
base_dir=$(cd ../ && pwd)

# Iterate with a glob instead of parsing `ls`, so file names containing
# whitespace reach run_document_timeline.bash as a single argument.
for filepath in "$base_dir"/input_data/*; do
    # With no matches the unexpanded pattern is returned; skip it.
    [ -e "$filepath" ] || continue
    filename=$(basename "$filepath")
    bash run_document_timeline.bash "$filename"
done


--------------------------------------------------------------------------------
/component/Duration/scripts/run_model.py:
--------------------------------------------------------------------------------
  1 | from scripts.utils import *
  2 | from scripts.timelinemodule import TimelineModel
  3 | import argparse
  4 | import warnings
  5 | 
  6 | 
  7 | warnings.filterwarnings('ignore')
  8 | 
  9 | 
 10 | def main():
 11 |     parser = argparse.ArgumentParser()
 12 |     parser.add_argument("-doc", "--docpath",
 13 |                         help="Path of the document file",
 14 |                         type=str,
 15 |                         default="")
 16 | 
 17 |     parser.add_argument("-gpu", "--gpunumber",
 18 |                         help="Which gpu to use",
 19 |                         type=int,
 20 |                         default=1)
 21 | 
 22 |     parser.add_argument("-out", "--outpath",
 23 |                         help="Path of the output folder",
 24 |                         type=str,
 25 |                         default="")
 26 | 
 27 |     args = parser.parse_args()
 28 | 
 29 |     ## Dependency Graph object
 30 |     filename = args.docpath.split("/")[-1]
 31 |     structures = get_structs(args.docpath)
 32 |     print("\n###########   Parsing Conllu through PredPatt    ###########")
 33 | 
 34 |     ## Sentences
 35 |     struct_dict = extract_struct_dicts(structures)
 36 | 
 37 |     ## A dataframe after processing the file through PredPatt and extracting
 38 |     ## roots and spans of each predicate.
 39 |     df = extract_dataframe(args.docpath, structures)
 40 | 
 41 |     ## Correct pred2_tokens as per the concatenated sentence
 42 |     df['pred2_token_mod'] = df.apply(lambda row: correct_pred2_tokens(row, struct_dict), axis=1)
 43 |     df['pred2_root_token_mod'] = df.apply(lambda row: correct_pred2_root(row, struct_dict), axis=1)
 44 |     # Convert tokens into list of numbers
 45 |     df['pred1_token_span'] = df['pred1_token'].map(lambda x: [int(y) for y in x.split("_")])
 46 |     df['pred2_token_span'] = df['pred2_token_mod'].map(lambda x: [int(y) for y in x.split("_")])
 47 | 
 48 |     ## Extract X for model predictions
 49 |     X = extract_X(df)
 50 | 
 51 |     ## Load the best model
 52 |     squashed = True
 53 |     baseline = False
 54 |     loss_confidence = True
 55 |     cuda_device_num = args.gpunumber
 56 |     cuda_device_str = "cuda:" + str(cuda_device_num)
 57 |     model_path = "../model/"
 58 |     file_path = "model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth"
 59 | 
 60 |     tokens = file_path.split("_")
 61 |     eventatt = tokens[1]
 62 |     duratt = tokens[2]
 63 |     relatt = tokens[3]
 64 |     concat_fine_to_dur = str2bool(tokens[-8])
 65 |     concat_dur_to_fine = str2bool(tokens[-7])
 66 |     fine_2_dur = str2bool(tokens[-6])
 67 |     dur_2_fine = str2bool(tokens[-5])
 68 |     weight = float(tokens[-4])
 69 |     drop = float(tokens[-3])
 70 |     activ = tokens[-2]
 71 |     bino_bool = str2bool(tokens[-1].split(".")[0])
 72 |     # coarse_size = int(tokens[-1].split(".")[0])
 73 |     print("\n###########   Predicting Relative Timelines    ###########")
 74 |     print("\nRelative Temporal Model configurations:")
 75 |     print(
 76 |         "Eventatt: {}, Duratt: {}, Relatt: {}, Dropout: {}, Activation: {}, Binomial: {}, concat_fine2dur: {}, concat_dur2fine:{}, fine_to_dur: {}, dur_to_fine: {} \n".format(
 77 |             eventatt,
 78 |             duratt,
 79 |             relatt,
 80 |             drop,
 81 |             activ,
 82 |             bino_bool,
 83 |             concat_fine_to_dur,
 84 |             concat_dur_to_fine,
 85 |             fine_2_dur,
 86 |             dur_2_fine))
 87 |     device = torch.device(cuda_device_str if torch.cuda.is_available() else "cpu")
 88 | 
 89 |     best_model = TemporalModel(
 90 |         embedding_size=1024,
 91 |         duration_distr=bino_bool,
 92 |         elmo_class=ElmoEmbedder(options_file, weight_file, cuda_device=cuda_device_num),
 93 |         mlp_dropout=drop,
 94 |         mlp_activation=activ,
 95 |         tune_embed_size=256,
 96 |         event_attention=eventatt,
 97 |         dur_attention=duratt,
 98 |         rel_attention=relatt,
 99 |         concat_fine_to_dur=concat_fine_to_dur,
100 |         concat_dur_to_fine=concat_dur_to_fine,
101 |         fine_to_dur=fine_2_dur,
102 |         dur_to_fine=dur_2_fine,
103 |         fine_squash=True,
104 |         baseline=False,
105 |         dur_MLP_sizes=[128], fine_MLP_sizes=[128],
106 |         dur_output_size=11, fine_output_size=4,
107 |         device=device)
108 | 
109 |     best_model.load_state_dict(torch.load(model_path + file_path, map_location=cuda_device_str))
110 |     best_model.to(device)
111 | 
112 |     p1_dur_yhat, p2_dur_yhat, fine_yhat, rel_yhat = predict_fine_dur_only(X, best_model)
113 |     print("Relative timelines completed!!\n")
114 |     ## Store predictions in the dataset
115 |     df['pred1_duration'] = p1_dur_yhat.cpu().numpy()
116 |     df['pred2_duration'] = p2_dur_yhat.cpu().numpy()
117 |     df['b1'] = [b1 for b1, d1, b2, d2 in fine_yhat.cpu().numpy()]
118 |     df['d1'] = [d1 for b1, d1, b2, d2 in fine_yhat.cpu().numpy()]
119 |     df['e1'] = df['b1'] + df['d1']
120 |     df['b2'] = [b2 for b1, d1, b2, d2 in fine_yhat.cpu().numpy()]
121 |     df['d2'] = [d2 for b1, d1, b2, d2 in fine_yhat.cpu().numpy()]
122 |     df['e2'] = df['b2'] + df['d2']
123 |     df = df.drop(['d1', 'd2'], axis=1)
124 |     df['sent_pred_id1'] = df['sentence_id_1'] + " " + df['pred1_root_token'].map(lambda x: str(x))
125 |     df['sent_pred_id2'] = df['sentence_id_2'] + " " + df['pred2_root_token'].map(lambda x: str(x))
126 | 
127 |     ## Document Timelines
128 |     pred_dict, num_preds, local_data = extract_preds(df)
129 | 
130 |     ## Run Timeline Model on current docid's data
131 |     model = TimelineModel(data=local_data,
132 |                           num_preds=num_preds,
133 |                           device=torch.device(type="cpu"))
134 | 
135 |     print("###########   Creating document timelines    ###########")
136 |     pred_b1, pred_e1, pred_b2, pred_e2, pred_timeline = model.fit(local_data, epochs=5000)
137 | 
138 |     preds_arr = local_data[['sent_pred_id1', 'sent_pred_id2']].values
139 |     uniq_preds = np.unique(preds_arr.flatten())
140 |     # print(uniq_preds)
141 | 
142 |     preds_text = extract_pred_text(uniq_preds, local_data)
143 | 
144 |     ans_df = pd.DataFrame(data=pred_timeline,
145 |                           columns=['start_pt', 'duration'])
146 |     ans_df['sent_pred_id'] = uniq_preds
147 |     ans_df['pred_text'] = preds_text
148 | 
149 |     ## Save prediction files
150 |     ans_df.to_csv(args.outpath + "/" + filename + "_timeline.csv", index=False)
151 |     local_data.to_csv(args.outpath + "/" + filename + "_predictions.csv", index=False)
152 | 
153 |     print("\nOutput written to the predictions folder.")
154 | 
155 | 
# Run the prediction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
158 | 


--------------------------------------------------------------------------------
/component/Duration/scripts/src/factslab/factslab/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/Duration/scripts/src/factslab/factslab/__init__.py


--------------------------------------------------------------------------------
/component/Duration/scripts/src/factslab/factslab/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/Duration/scripts/src/factslab/factslab/pytorch/__init__.py


--------------------------------------------------------------------------------
/component/Duration/scripts/src/factslab/factslab/pytorch/roberta_extract.py:
--------------------------------------------------------------------------------
  1 | # A fair portion of this code ('align_bpe_to_words') is taken from: https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/alignment_utils.py
  2 | # Author: sidvash
  3 | # Created: 10/28/2019
  4 | # Last modified: 11/19/2019
  5 | 
  6 | '''
  7 | The purpose of this code is to extract RoBERTa embeddings for a sentence whose gold tokens are known.
  8 | 
  9 | Usage:
 10 | from roberta_extract import aligned_roberta
 11 | embeddings = aligned_roberta(sentence, tokens, roberta='large')
 12 | 
 13 | where sentence is a string, and tokens are the tokens of the sentence.
 14 | '''
 15 | from collections import Counter
 16 | from typing import List
 17 | 
 18 | import torch
 19 | import fairseq #not importing this causes line 94 assertion to fail -- why?
 20 | 
##### Load Roberta model
# NOTE: both checkpoints are fetched through torch.hub and instantiated at
# import time, so merely importing this module loads the large AND the base
# model. `aligned_roberta` below picks one of these two module-level objects.
roberta_large = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta_large.eval()  # switch to evaluation mode
print("Large Model loaded")

roberta_base = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta_base.eval()  # switch to evaluation mode
print("Base Model loaded")
 29 | 
 30 | def aligned_roberta(sentence: str, 
 31 |                             tokens: List[str], 
 32 |                             roberta='large',
 33 |                             return_all_hiddens=False,
 34 |                             border_tokens=False):
 35 |     '''
 36 |     Code inspired from: https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
 37 |     
 38 |     Aligns roberta embeddings for an input tokenization of words for a sentence
 39 |     
 40 |     Inputs:
 41 |     1. sentence: sentence in string
 42 |     2. tokens: tokens of the sentence in which the alignment is to be done
 43 |     3. roberta: 'large' or 'base'
 44 |     4. border_tokens: Boolean for whether to include special token embeddings <s> and </s>
 45 | 
 46 |     Outputs:    
 47 |     Roberta embeddings aligned with the input tokens 
 48 |     '''
 49 | 
 50 |     # tokenize both with GPT-2 BPE and get alignment with given tokens
 51 |     if roberta=='large':
 52 |         roberta_model = roberta_large
 53 |     else:
 54 |         roberta_model = roberta_base
 55 | 
 56 |     bpe_toks = roberta_model.encode(sentence)
 57 |     alignment = align_bpe_to_words(roberta_model, bpe_toks, tokens)
 58 |     
 59 |     
 60 |     # extract features and align them
 61 |     features = roberta_model.extract_features(bpe_toks, return_all_hiddens=return_all_hiddens)
 62 |     features = features.squeeze(0)   #Batch-size = 1
 63 |     aligned_feats = align_features_to_words(roberta_model, features, alignment)
 64 |     
 65 |     if border_tokens:
 66 |         return aligned_feats
 67 |     else:
 68 |         return aligned_feats[1:-1]  #exclude <s> and </s> tokens
 69 | 
 70 | 
 71 | def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
 72 |     """
 73 |     Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
 74 | 
 75 |     Args:
 76 |         roberta (RobertaHubInterface): RoBERTa instance
 77 |         bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
 78 |         other_tokens (List[str]): other tokens of shape `(T_words)`
 79 | 
 80 |     Returns:
 81 |         List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*.
 82 |     """
 83 |     assert bpe_tokens.dim() == 1
 84 |     assert bpe_tokens[0] == 0. ##added after revision in alignment utils from fairseq (Feb11, 2020)
 85 | 
 86 |     def clean(text):
 87 |         return text.strip()
 88 | 
 89 |     # remove whitespaces to simplify alignment
 90 |     bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
 91 |     bpe_tokens = [clean(roberta.bpe.decode(x) if x not in {'<s>', ''} else x) for x in bpe_tokens]
 92 |     other_tokens = [clean(str(o)) for o in other_tokens]
 93 | 
 94 |     # strip leading <s>
 95 |     
 96 |     bpe_tokens = bpe_tokens[1:]
 97 |     assert ''.join(bpe_tokens) == ''.join(other_tokens)
 98 | 
 99 |     # create alignment from every word to a list of BPE tokens
100 |     alignment = []
101 |     bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1))
102 |     j, bpe_tok = next(bpe_toks)
103 |     for other_tok in other_tokens:
104 |         bpe_indices = []
105 |         while True:
106 |             if other_tok.startswith(bpe_tok):
107 |                 bpe_indices.append(j)
108 |                 other_tok = other_tok[len(bpe_tok):]
109 |                 try:
110 |                     j, bpe_tok = next(bpe_toks)
111 |                 except StopIteration:
112 |                     j, bpe_tok = None, None
113 |             elif bpe_tok.startswith(other_tok):
114 |                 # other_tok spans multiple BPE tokens
115 |                 bpe_indices.append(j)
116 |                 bpe_tok = bpe_tok[len(other_tok):]
117 |                 other_tok = ''
118 |             else:
119 |                 raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
120 |             if other_tok == '':
121 |                 break
122 |         assert len(bpe_indices) > 0
123 |         alignment.append(bpe_indices)
124 |     assert len(alignment) == len(other_tokens)
125 | 
126 |     return alignment
127 | 
128 | 
def align_features_to_words(roberta, features, alignment):
    """
    Pool per-BPE features into per-word features.

    Each BPE row is divided by the number of words it is aligned to, and a
    word's feature is the sum of its (weighted) BPE rows.

    Args:
        roberta (RobertaHubInterface): RoBERTa instance (not used here)
        features (torch.Tensor): features to align of shape `(T_bpe x C)`
        alignment: alignment between BPE tokens and words returned by
            func:`align_bpe_to_words`.

    Returns:
        torch.Tensor: row 0 of *features*, then one row per word, then every
        row located after the largest aligned BPE index.
    """
    assert features.dim() == 2
    num_bpe = len(features)

    # how many words each BPE position contributes to
    usage = Counter()
    for word_indices in alignment:
        usage.update(word_indices)
    assert usage[0] == 0  # <s> shouldn't be aligned

    denom = features.new([usage.get(j, 1) for j in range(num_bpe)])
    weighted_features = features / denom.unsqueeze(-1)

    rows = [weighted_features[0]]
    last_aligned = -1
    for word_indices in alignment:
        rows.append(weighted_features[word_indices].sum(dim=0))
        last_aligned = max(last_aligned, *word_indices)
    # keep whatever follows the last aligned piece unchanged
    rows.extend(weighted_features[j] for j in range(last_aligned + 1, num_bpe))

    #assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4)
    return torch.stack(rows)
156 | 
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------
/component/Duration/scripts/src/factslab/factslab/pytorch/transformer_regression.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | from torch.nn import CrossEntropyLoss, MSELoss, Linear, Dropout
 4 | from transformers import AutoModel
 5 | 
 6 | 
 7 | class TransformerRegressionModel(torch.nn.Module):
 8 | 
 9 |     def __init__(self, transformer, dropout, num_labels,cls_token=0,
10 |                  activation='relu'):
11 |         '''
12 |         Setup the modules in the model - a transformer, followed by a GRU for
13 |         the CLS hidden states/taking the mean of all tokens, followed by Linear
14 |         layers that outputs one number, followed by softmax
15 |         '''
16 |         super(TransformerRegressionModel, self).__init__()
17 | 
18 |         # Setup the transformer model
19 |         self.transformer = AutoModel.from_pretrained(transformer)
20 | 
21 | 
22 |         # For now CLS pooling is the only pooling supported
23 |         self.tr_output_size = self.transformer.config.hidden_size
24 |         self.num_labels = num_labels
25 |         self.cls_token = cls_token
26 | 
27 |         # Setup the linear layers on top of the transformer
28 |         self.dense = Linear(self.tr_output_size, self.tr_output_size)
29 |         self.dropout = Dropout(p=dropout)
30 |         self.classifier = Linear(self.tr_output_size, self.num_labels)
31 |         self._activation = activation
32 | 
33 |         self._loss_fn = MSELoss()
34 | 
35 | 
36 |     def _nonlinearity(self, x):
37 |         '''Applies relu or tanh activation on tensor.'''
38 | 
39 |         if self._activation == 'relu':
40 |             return torch.nn.functional.relu(x)
41 |         elif self._activation == 'tanh':
42 |             return torch.tanh(x)
43 | 
44 | 
45 |     def forward(self, input_ids, input_mask, tokens, labels=None):
46 |         '''
47 |         Runs forward pass on neural network
48 | 
49 |         Arguments:
50 |         ---------
51 |         input_ids: the tokenized, bert wordpiece IDs. (batch_size, MAX_LEN)
52 |         input_masks: the masking to be done on input_ids due to padding.
53 |         (batch_size, MAX_LEN)
54 |         labels: target against which to computer the loss. DEFAULT: None
55 |         max_seq_len: The length to which to pad the output of the rnn
56 | 
57 |         Returns:
58 |         -------
59 | 
60 |         Object of type Tuple of form (loss, logits)
61 | 
62 |         loss: Cross Entropy loss calculated in loss_fn which implements masking
63 |         logits: logsoftmaxed probabilities of classifier output
64 | 
65 |         '''
66 | 
67 | 
68 |         # Forward pass through transformer
69 |         # other values returned are pooler_output, hidden_states, and attentions
70 |         outputs = self.transformer(input_ids,
71 |                                    token_type_ids=None,
72 |                                    attention_mask=input_mask)
73 | 
74 |         last_hidden_states = outputs[0]
75 | 
76 |         # Get the hidden states based on token indices
77 |         token_hidden_states = torch.cat([h.index_select(0,tok) for \
78 |                                     h, tok in zip(last_hidden_states, tokens)])
79 | 
80 | 
81 |         # Then run it through linear layers
82 |         x = self.dropout(token_hidden_states)
83 |         x = self.dense(x)
84 |         x = self._nonlinearity(x)
85 |         x = self.dropout(x)
86 |         logits = self.classifier(x)
87 | 
88 |         outputs  = (logits,)
89 | 
90 |         if labels is not None:
91 |             loss = self._loss_fn(logits, labels)
92 |             outputs = (loss,) + outputs
93 | 
94 |         return outputs


--------------------------------------------------------------------------------
/component/Duration/scripts/timelinemodule.py:
--------------------------------------------------------------------------------
  1 | import allennlp
  2 | import torch
  3 | import torch.nn.functional as F
  4 | #import matplotlib.pyplot as plt
  5 | import pickle
  6 | from torch.distributions.binomial import Binomial
  7 | from torch.nn import MSELoss, L1Loss, SmoothL1Loss, CrossEntropyLoss
  8 | 
  9 | import torch
 10 | from torch import nn
 11 | #from torchviz import make_dot, make_dot_from_trace
 12 | import numpy as np
 13 | import pandas as pd
 14 | from tqdm import tqdm
 15 | from tqdm import tqdm_notebook as tqdm_n
 16 | 
 17 | 
 18 | class TimelineModel(torch.nn.Module):
 19 |     '''
 20 |      A class to extract a simple timeline model from a
 21 |      given document's predicate-pair data
 22 |     '''
 23 |     def __init__(self,
 24 |                  data = None,
 25 |                  num_preds = None, 
 26 |                  mlp_activation='relu',
 27 |                  mlp_dropout=0.0,
 28 |                  optimizer_class = torch.optim.Adam,
 29 |                   dur_output_size = 11, fine_output_size = 4,
 30 |                 device=torch.device(type="cpu"),
 31 |                 **kwargs):
 32 |         super().__init__()
 33 | 
 34 |         self.device = device
 35 |         self.linear_maps = nn.ModuleDict()
 36 |         self.mlp_activation = mlp_activation
 37 |         self.mlp_dropout =  nn.Dropout(mlp_dropout) 
 38 |         self.dur_output_size = dur_output_size
 39 |         
 40 |         ## Parameters
 41 |             # Hidden predicate representations
 42 |         self.pred_tensor = torch.nn.Parameter(torch.randn(num_preds,2).to(self.device), requires_grad=True)
 43 |             # Binomial parameter
 44 |         self.k = torch.nn.Parameter(torch.randn(1).to(self.device), requires_grad=True)
 45 |         
 46 |         self.params = nn.ParameterList()
 47 |         self.params.extend([self.pred_tensor, self.k])
 48 |         
 49 |         self._optimizer_class = optimizer_class
 50 |         
 51 |         ## Losses Initialization
 52 |         self.fine_loss = L1Loss().to(self.device)
 53 |         self.duration_loss = CrossEntropyLoss().to(self.device)
 54 | 
 55 |         
 56 |     def _init_MLP(self, input_size, hidden_sizes, output_size, param=None):
 57 |         '''
 58 |         Initialise MLP or regression parameters
 59 |         '''
 60 |         self.linear_maps[param] = nn.ModuleList()
 61 | 
 62 |         for h in hidden_sizes:
 63 |             linmap = torch.nn.Linear(input_size, h)
 64 |             linmap = linmap.to(self.device)
 65 |             self.linear_maps[param].append(linmap)
 66 |             input_size = h
 67 | 
 68 |         linmap = torch.nn.Linear(input_size, output_size)
 69 |         linmap = linmap.to(self.device)
 70 |         self.linear_maps[param].append(linmap)
 71 |         
 72 |     def forward(self, local_data, **kwargs):
 73 |         '''
 74 |         INput: dataframe with cols:
 75 |                 b1, e1, b2, e2, pred1_dict_idx, pred2_dict_idx
 76 |                 
 77 |         Output: 
 78 |         '''
 79 |         t_sq = self.pred_tensor**2 
 80 |         num_preds= t_sq.size()[0]
 81 |         anchored_tensor = torch.zeros(num_preds,2).to(self.device)
 82 |         
 83 |         anchored_tensor[:,0] = t_sq[:,0] - t_sq[:,0].min()
 84 |         anchored_tensor[:,1] = t_sq[:,1]
 85 |         
 86 |         #Predicted fine-grained values for the given document
 87 |         b1 = anchored_tensor[local_data.pred1_dict_idx.values][:,0]
 88 |         dur1 = anchored_tensor[local_data.pred1_dict_idx.values][:,1]
 89 |         b2 = anchored_tensor[local_data.pred2_dict_idx.values][:,0]
 90 |         dur2 = anchored_tensor[local_data.pred2_dict_idx.values][:,1]
 91 |         
 92 |         batch_size = b1.size()[0]
 93 |         #print(batch_size)
 94 |                 
 95 |         pred1_dur = self._binomial_dist(dur1)
 96 |         pred2_dur = self._binomial_dist(dur2)
 97 |         
 98 |         yhat = (b1, dur1, b2, dur2, pred1_dur, pred2_dur,
 99 |                 anchored_tensor)
100 |         
101 |         return yhat
102 |     
103 |     def fit(self, local_data, epochs=5000, **kwargs):
104 |         losses = [10000]
105 |         
106 |         # print("#### Model Parameters ####")
107 |         # for name,param in self.named_parameters():     
108 |         #     if param.requires_grad:
109 |         #         print(name, param.shape) 
110 |         # print("##########################") 
111 |         parameters = [p for p in self.parameters() if p.requires_grad]
112 |         optimizer = self._optimizer_class(parameters)
113 |         
114 |         #Actual ground truth values
115 |         b1_lst = local_data.b1.values
116 |         e1_lst = local_data.e1.values
117 |         b2_lst = local_data.b2.values
118 |         e2_lst = local_data.e2.values
119 |         durations = [local_data.pred1_duration.values,
120 |                      local_data.pred2_duration.values]
121 | 
122 |         
123 |         # pbar = tqdm(total = total_obs//self.train_batch_size)
124 |         
125 |         for epoch in tqdm(range(epochs)):
126 |             preds = self(local_data)
127 |             #zero_grad
128 |             optimizer.zero_grad()
129 |             curr_loss = self._custom_loss(preds,
130 |                                          b1_lst,
131 |                                          e1_lst,
132 |                                          b2_lst,
133 |                                          e2_lst,
134 |                                          durations)
135 |             
136 |             curr_loss.backward()
137 |             optimizer.step()
138 |             
139 |             if epoch==0:
140 |                 tqdm.write("Epoch: {}, Loss: {}".format(epoch+1, curr_loss))
141 |             
142 |             #print("Epoch: {}, Loss: {}".format(epoch+1, curr_loss))
143 |                
144 |             ## Stop training when loss converges
145 |             if abs(curr_loss.detach() - losses[-1]) < 0.00001:
146 |                 #print("Epoch: {}, Converging-Loss: {}".format(epoch+1, curr_loss))
147 |                 break
148 |                 
149 |             #pbar.update(1)
150 |                 
151 |             losses.append(curr_loss.detach())
152 |         #pbar.close()
153 |         tqdm.write("Epoch: {}, Converging-Loss: {}".format(epoch+1, curr_loss))
154 |                 
155 |         return self.predict(preds)
156 |         
157 |     def _custom_loss(self, preds, b1_lst, e1_lst, b2_lst,
158 |                             e2_lst,durations):
159 |         ## Predictions
160 |         b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
161 |         out_p1_d, out_p2_d, anchored_tensor = preds[4], preds[5], preds[6]
162 | #         out_coarse, out_coarser = preds[7], preds[8]
163 |         
164 |         ## Ground truth values:
165 |         b1_act, e1_act, b2_act, e2_act = self._lsts_to_tensors(b1_lst, e1_lst, b2_lst, e2_lst,
166 |                                         param="float")
167 |         ## Store actual_y into tensors
168 |         pred1_durs, pred2_durs = durations
169 | 
170 |         pred1_durs, pred2_durs = self._lsts_to_tensors(pred1_durs,pred2_durs)
171 |         
172 |         ## Duration Losses
173 |         L5_p1 = self.duration_loss(out_p1_d, pred1_durs)
174 |         L5_p2 = self.duration_loss(out_p2_d, pred2_durs)
175 |         #print("L5_p1 {},  L5_p2: {}".format(L5_p1, L5_p2))
176 |             
177 |         ## Normalize predicted fine-grained values:
178 |         num_pairs = b1_pred.size()[0]
179 |         t = torch.zeros(num_pairs,4).to(self.device)
180 |         t[:,0] = b1_pred
181 |         t[:,1] = b1_pred + dur1_pred
182 |         t[:,2] = b2_pred
183 |         t[:,3] = b2_pred + dur2_pred
184 |         
185 |     
186 |         t_min, _ = torch.min(t,dim=1)
187 |         t_min = t_min.unsqueeze(1).repeat(1,4)  #add extra dimension
188 |         t_adj = t - t_min
189 |         t_adj_max, _ = torch.max(t_adj,dim=1)
190 |         t_adj_max = t_adj_max.unsqueeze(1).repeat(1,4)
191 |         t_normalized = t_adj/t_adj_max
192 |         
193 |         ## Fine-grained Losses
194 |         l1 = self.fine_loss(t_normalized[:,0]-t_normalized[:,2], b1_act-b2_act)
195 |         l2 = self.fine_loss(t_normalized[:,1]-t_normalized[:,2], e1_act-b2_act)
196 |         l3 = self.fine_loss(t_normalized[:,3]-t_normalized[:,0], e2_act-b1_act)
197 |         l4 = self.fine_loss(t_normalized[:,1]-t_normalized[:,3], e1_act-e2_act)
198 |         
199 |         L1to4 = sum([l1, l2, l3, l4])/4 
200 |            
201 |         #L5_p1, L5_p2 = 0,0 
202 |         
203 |         #print("L1to4: {}".format(L1to4))
204 |         
205 |         dur = (L5_p1+L5_p2)/2
206 |         fine = L1to4
207 |         beta=2.0
208 |         
209 |         total_loss = (sum([dur, beta*fine])/2)
210 |         
211 |         return total_loss
212 |             
213 |     def _lsts_to_tensors(self, *args, param=None):
214 |         '''
215 |         Input: list1, list2,......
216 | 
217 |         Output: [Tensor(list1), tensor(list2),....]
218 | 
219 |         '''
220 |         if param=="float":
221 |             return [torch.from_numpy(np.array(arg)).float().to(self.device) for arg in args]
222 |         else:
223 |             return [torch.from_numpy(np.array(arg, dtype="int64")).to(self.device) for arg in args]
224 |         
225 |     def predict(self, preds):
226 |         b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
227 |         pred_timeline =  preds[6]
228 |         
229 |         ## Normalize predicted values:
230 |         num_pairs = b1_pred.size()[0]
231 |         t = torch.zeros(num_pairs,4).to(self.device)
232 |         t[:,0] = b1_pred
233 |         t[:,1] = b1_pred + dur1_pred
234 |         t[:,2] = b2_pred
235 |         t[:,3] = b2_pred + dur2_pred
236 |         
237 |         t_min, _ = torch.min(t,dim=1)
238 |         t_min = t_min.unsqueeze(1).repeat(1,4)  #add extra dimension
239 |         t_adj = t - t_min
240 |         t_adj_max, _ = torch.max(t_adj,dim=1)
241 |         t_adj_max = t_adj_max.unsqueeze(1).repeat(1,4)
242 |         t_normalized = t_adj/t_adj_max
243 |         t_normalized = t_normalized.detach().cpu().numpy()
244 |         
245 |         return t_normalized[:,0],t_normalized[:,1], t_normalized[:,2], t_normalized[:,3], pred_timeline.detach().cpu().numpy()
246 |     
247 |     def _binomial_dist(self, pred_dur):
248 |         '''
249 |         *** Vectorized implementation ***
250 |         Input: A tensor with dimension: batch_size x 1
251 |         Output: A tensor with dimension: batch_size x 11 
252 |         Binomial Prob distribution for a given duration value 
253 |         '''
254 |         pred_dur = torch.sigmoid((self.k)*(torch.log(pred_dur)))
255 |     
256 |         bin_class = Binomial(total_count=self.dur_output_size-1, probs=pred_dur)
257 |         durations = torch.tensor(range(self.dur_output_size), dtype=torch.float).to(self.device)
258 |         
259 |         return self._log_prob_vectorized(bin_class, durations)
260 |         
261 |     def _log_prob_vectorized(self, bin_class, value):
262 |         '''
263 |         1. bin_class: Pytorch Binomial distribution class 
264 |         2. Value is a tensor with size: [total_count+1]
265 |         '''
266 |         batch_size = bin_class.total_count.size()[0]
267 | 
268 |         value = value.repeat(batch_size,1)
269 |         #print(value.size())
270 | 
271 |         bin_class.logits = bin_class.logits.repeat(11,1).permute(1,0)
272 |         #print(bin_class.logits.size())
273 | 
274 |         bin_class.total_count = bin_class.total_count.repeat(11,1).permute(1,0)
275 |         #print(bin_class.total_count.size())
276 | 
277 |         log_factorial_n = torch.lgamma(bin_class.total_count + 1)
278 |         log_factorial_k = torch.lgamma(value + 1)
279 |         log_factorial_nmk = torch.lgamma(bin_class.total_count - value + 1)
280 |         max_val = (-bin_class.logits).clamp(min=0.0)
281 |         # Note that: torch.log1p(-bin_class.probs)) = max_val - torch.log1p((bin_class.logits + 2 * max_val).exp()))
282 | 
283 |         return (log_factorial_n - log_factorial_k - log_factorial_nmk +
284 |                 value * bin_class.logits + bin_class.total_count * max_val -
285 |                 bin_class.total_count * torch.log1p((bin_class.logits + 2 * max_val).exp()))


--------------------------------------------------------------------------------
/component/Duration/utils_duration.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import torch
  4 | import numpy as np
  5 | import torch.nn.functional as F
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | 
  9 | idx2label = ['inst', 'secs', 'mins', 'hours', 'days', 'weeks', 'months', 'years', 'decades', 'cents', 'forever']
 10 | 
 11 | 
 12 | @torch.no_grad()
 13 | def compute_predictions(model, dataloader):
 14 |     """
 15 |     Computes model outputs.
 16 | 
 17 |     :param model: model to evaluate
 18 |     :param dataloader: validation/test dataset loader
 19 |     :return: outputs
 20 |     :rtype: dict
 21 |     """
 22 |     model.eval()
 23 |     outputs = {'sentence': [], 'root_text': [], 'root_idx': [],
 24 |                'p1_dur': [], 'p2_dur': [], 'fine': [], 'rel': []}
 25 | 
 26 |     # Evaluate on mini-batches & then average over the total
 27 |     for batch in dataloader:
 28 |         # Load to device, for the list of batch tensors
 29 |         words = batch['words_list']         # .to(device)
 30 |         root = batch['root_idx']            # .to(device)
 31 | 
 32 |         # Add dummy event (2)
 33 |         span = [[[x], [x]] for x in root.tolist()]
 34 |         root = [[x, x] for x in root.tolist()]
 35 | 
 36 |         # Convert words to batch-first: [L, B] --> [B, L]
 37 |         words = list(map(list, zip(*words)))
 38 | 
 39 |         # Forward Pass
 40 |         p1_dur, p2_dur, fine, rel = model(words, span, root)
 41 | 
 42 |         _, p1_dur = p1_dur.max(1)
 43 |         _, p2_dur = p2_dur.max(1)
 44 | 
 45 |         outputs['sentence'] += [' '.join(w_lst) for w_lst in words]
 46 |         outputs['root_text'] += batch['root_text']
 47 |         outputs['root_idx'] += [idx.item() for idx in batch['root_idx']]
 48 | 
 49 |         outputs['p1_dur'] += p1_dur.detach().cpu().tolist()
 50 |         outputs['p2_dur'] += p2_dur.detach().cpu().tolist()
 51 |         outputs['fine'] += fine.detach().cpu().tolist()
 52 |         outputs['rel'] += rel.detach().cpu().tolist()
 53 | 
 54 |     return outputs
 55 | 
 56 | 
 57 | @torch.no_grad()
 58 | def compute_eval_metrics(model, dataloader, device, size):
 59 |     """
 60 |     For the given model, computes accuracy & loss on validation/test set.
 61 | 
 62 |     :param model: model to evaluate
 63 |     :param dataloader: validation/test set dataloader
 64 |     :param device: cuda/cpu device where the model resides
 65 |     :param size: no. of samples (subset) to use
 66 |     :return: metrics {'accuracy', 'loss'}
 67 |     :rtype: dict
 68 |     """
 69 |     model.eval()
 70 | 
 71 |     loss = 0.0
 72 |     num_correct = 0
 73 |     total_samples = 0
 74 | 
 75 |     # Evaluate on mini-batches & then average over the total
 76 |     for n_iter, batch in enumerate(dataloader):
 77 |         # Load to device, for the list of batch tensors
 78 |         image = batch['image'].to(device)
 79 |         label = batch['label'].to(device)
 80 | 
 81 |         # Forward Pass
 82 |         label_logits = model(image)
 83 | 
 84 |         # Compute Accuracy
 85 |         label_predicted = torch.argmax(label_logits, dim=1)
 86 |         correct = (label == label_predicted)
 87 |         num_correct += correct.sum().item()
 88 | 
 89 |         # Compute Loss
 90 |         loss += F.cross_entropy(label_logits, label, reduction='mean')
 91 | 
 92 |         batch_size = label_logits.shape[0]
 93 |         total_samples += batch_size
 94 | 
 95 |         if total_samples > size:
 96 |             break
 97 | 
 98 |     # Final Accuracy
 99 |     accuracy = 100.0 * (num_correct / total_samples)
100 | 
101 |     # Final Loss (averaged over mini-batches - n_iter)
102 |     loss = loss / n_iter
103 | 
104 |     metrics = {'accuracy': accuracy, 'loss': loss}
105 | 
106 |     return metrics
107 | 
108 | 
109 | # ---------------------------------------------------------------------------
def setup_logger(parser, log_dir, file_name='train_log.txt'):
    """
    Generates log file and writes the executed python flags for the current run,
    along with the training log (printed to console). \n

    This is helpful in maintaining experiment logs (with arguments). \n

    While resuming training, the new output log is simply appended to the previously created train log file.

    The log directory is created when it does not exist yet. The command
    line is parsed before the file is opened, so a parsing failure neither
    leaves an open file handle behind nor writes a partial log entry.

    :param parser: argument parser object
    :param log_dir: directory of the log file (created if missing)
    :param file_name: log file name
    :return: train log file (opened in append mode; the caller closes it)
    """
    # Collect all the arguments (key value) first
    args = parser.parse_args()

    # an empty log_dir means "current directory": nothing to create
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    log_file_path = os.path.join(log_dir, file_name)

    log_file = open(log_file_path, 'a+')

    # python3 file_name.py
    log_file.write('python3 ' + sys.argv[0] + '\n')

    for key, value in vars(args).items():
        # write to train log file
        log_file.write('--' + key + ' ' + str(value) + '\n')

    log_file.write('\n\n')
    log_file.flush()

    return log_file
142 | 
143 | 
def print_log(msg, log_file):
    """
    Record a message in the log file and echo it on the console.

    :param str msg: Message to be printed & logged
    :param file log_file: log file
    """
    line = msg + '\n'

    # Persist first (and flush right away), then mirror the message on stdout.
    log_file.write(line)
    log_file.flush()
    print(msg)
153 | 
154 | 
def str2bool(v):
    """
    Convert a textual flag to a boolean (case-insensitive).

    :param str v: one of "true", "false", "1" or "0"
    :return: True for "true"/"1", False for "false"/"0"
    """
    truthy = ['true', '1']
    falsy = ['false', '0']
    lowered = v.lower()
    assert lowered in truthy + falsy, 'Option requires: "true" or "false"'
    return lowered in truthy
159 | 


--------------------------------------------------------------------------------
/component/NegationDetection/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | models
3 | bert_base_uncased_model
4 | bert_tokenizer
5 | xlnet_tokenizer
6 | xlnet-base-cased-model
7 | SFU_Review_Corpus_Negation_Speculation*


--------------------------------------------------------------------------------
/component/NegationDetection/README.md:
--------------------------------------------------------------------------------
 1 | # Negation Cue Detection and Scope Resolution
 2 | 
 3 | The training, evaluation and inference code for NegBERT.
 4 | 
  5 | For cue detection, the label for each word follows this annotation schema:
 6 | * 0: Affix
 7 | * 1: Normal cue
 8 | * 2: Part of a multiword cue
 9 | * 3: Not a cue
10 | 
11 | ## Performance
12 | 
 13 | Negation cue detection, evaluated on the SFU review dataset:
14 | 
15 | ```
16 | Validation loss: 0.14660959019822234
17 | Validation Accuracy: 0.9972840671132076
18 | Validation Accuracy for Positive Cues: 0.9394110275689225
19 |        1     2        3
20 | 1  733.0   1.0     35.0
21 | 2    1.0  30.0     11.0
22 | 3   75.0  14.0  48267.0
23 |               precision    recall  f1-score   support
24 | 
25 |            1       0.91      0.95      0.93       769
26 |            2       0.67      0.71      0.69        42
27 |            3       1.00      1.00      1.00     48356
28 | 
29 |     accuracy                           1.00     49167
30 |    macro avg       0.86      0.89      0.87     49167
31 | weighted avg       1.00      1.00      1.00     49167
32 | 
33 | F1-Score: 0.9972513069839883
34 | Precision: 0.8955399061032864
35 | Recall: 0.9431396786155748
36 | F1 Score: 0.9187236604455147
37 | F1-Score Cue_No Cue: 0.9972891007811321
38 | ```
39 | 
 40 | Negation scope resolution, evaluated on the SFU review dataset:
41 | 
42 | ```
43 | Validation loss: 0.21461165494672807
44 | Validation Accuracy: 0.9522214842258335
45 | Validation Accuracy Scope Level: 0.7831683168316831
46 | Precision: 1
47 | Recall: 0.7838509316770186
48 | F1 Score: 0.8788300835654596
49 |               precision    recall  f1-score   support
50 | 
51 |            0       0.97      0.97      0.97     16358
52 |            1       0.90      0.92      0.91      5279
53 | 
54 |     accuracy                           0.95     21637
55 |    macro avg       0.94      0.94      0.94     21637
 56 | weighted avg       0.96      0.95      0.95     21637
57 | ```
58 | 
59 | ## Training
60 | 
 61 | To train the negation cue detection model, set `SUBTASK = 'cue_detection'`; to train the negation scope resolution model, set `SUBTASK = 'scope_resolution'`. Then run:
62 | 
63 | ```
64 | python train.py
65 | ```
66 | 
67 | ## Acknowledgement
68 | 
69 | ```
70 | @article{Khandelwal2020NegBERTAT,
71 |   title={NegBERT: A Transfer Learning Approach for Negation Detection and Scope Resolution},
72 |   author={Aditya Khandelwal and Suraj Sawant},
73 |   journal={ArXiv},
74 |   year={2020},
75 |   volume={abs/1911.04211}
76 | }
77 | ```
78 | 
79 | Adapted from the codebase: https://github.com/adityak6798/Transformers-For-Negation-and-Speculation
80 | 
81 | The SFU review dataset can be downloaded from [this link](https://www.sfu.ca/~mtaboada/SFU_Review_Corpus.html)


--------------------------------------------------------------------------------
/component/REST_service/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import sys
 4 | from collections import defaultdict
 5 | from flask import Flask, jsonify
 6 | import json
 7 | from flask import Response
 8 | from flask import request
 9 | import requests
10 | import urllib.parse
11 | import ast
12 | import sys
13 | sys.path.append("..")
14 | sys.path.append("../Duration")
15 | from Duration.inference_api import DurationAPI
16 | 
if __name__ == "__main__":
    '''
    This program starts a web service that exposes the Duration component.
    Mode "server": the machine acts as a server and answers REST calls
    (POST /duration with a JSON body of the form {"events": ...}).
    Mode "client": accepted on the command line but not implemented in this script;
    any mode other than "server" only prints a notice and exits.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", "--mode", help="run as server, client [server, client]", type=str, default="server")
    parser.add_argument("-port", "--port", help="port to run this REST service", type=int, default=17000)
    args = parser.parse_args()

    # Run as a server to provide API service
    if args.mode == "server":
        print ('-----component/REST_service: HTTP SERVER MODE-----')

        # Load component class once at start-up; every request reuses the same instance.
        durationAPI = DurationAPI(base_dir = '../Duration')

        app = Flask(__name__)
        @app.route('/duration', methods=['POST'])
        def response_pred():
            """Run the Duration component on the posted events and return its output as JSON."""
            print('============REST_service')
            # silent=True yields None for a missing or malformed JSON body instead of raising,
            # so every bad request is answered with the same explicit 400 below.
            # The local is deliberately not called `json`, to avoid shadowing the imported module.
            payload = request.get_json(silent=True)
            print (payload)
            if not isinstance(payload, dict) or 'events' not in payload:
                error_json = {'error': 'request body must be a JSON object with an "events" field'}
                return jsonify(error_json), 400
            events = payload['events']
            print (events)
            json_list = durationAPI.pred(events)
            print (json_list)
            response_json = {'json_list': json_list}
            return jsonify(response_json)
        app.run(port=args.port)
    else:
        print ('-> MODE NOT CHOSEN')


--------------------------------------------------------------------------------
/component/TempRel/.gitignore:
--------------------------------------------------------------------------------
  1 | # Project specific
  2 | models
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # IDE
 13 | .idea
 14 | 
 15 | # Distribution / packaging
 16 | .Python
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | pip-wheel-metadata/
 30 | share/python-wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | MANIFEST
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | 
 60 | # Translations
 61 | *.mo
 62 | *.pot
 63 | 
 64 | # Django stuff:
 65 | *.log
 66 | local_settings.py
 67 | db.sqlite3
 68 | db.sqlite3-journal
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | 


--------------------------------------------------------------------------------
/component/TempRel/code/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Demo: run the joint model on a single example sentence.
input_text="Orders went out today to deploy 17,000 U.S. Army soldiers in the Persian Gulf region."
python joint_model.py -input_text "$input_text"
3 | 


--------------------------------------------------------------------------------
/component/TempRel/other/pos_tags.txt:
--------------------------------------------------------------------------------
 1 | CC
 2 | CD
 3 | DT
 4 | EX
 5 | FW
 6 | IN
 7 | JJ
 8 | JJR
 9 | JJS
10 | LS
11 | MD
12 | NN
13 | NNS
14 | NNP
15 | NNPS
16 | PDT
17 | POS
18 | PRP
19 | PRP$
20 | RB
21 | RBR
22 | RBS
23 | RP
24 | SYM
25 | TO
26 | UH
27 | VB
28 | VBD
29 | VBG
30 | VBN
31 | VBP
32 | VBZ
33 | WDT
34 | WP
35 | WP$
36 | WRB


--------------------------------------------------------------------------------
/component/component_envs/env_temprel.yml:
--------------------------------------------------------------------------------
  1 | name: event-pipeline
  2 | channels:
  3 |   - pytorch
  4 |   - gurobi
  5 |   - serge-sans-paille
  6 |   - conda-forge
  7 |   - defaults
  8 | dependencies:
  9 |   - _ipyw_jlab_nb_ext_conf=0.1.0=py37_0
 10 |   - _libgcc_mutex=0.1=main
 11 |   - _pytorch_select=0.2=gpu_0
 12 |   - alabaster=0.7.12=py37_0
 13 |   - anaconda=2019.10=py37_0
 14 |   - anaconda-client=1.7.2=py37_0
 15 |   - anaconda-navigator=1.9.7=py37_0
 16 |   - anaconda-project=0.8.3=py_0
 17 |   - asn1crypto=1.0.1=py37_0
 18 |   - astroid=2.3.1=py37_0
 19 |   - astropy=3.2.2=py37h7b6447c_0
 20 |   - atomicwrites=1.3.0=py37_1
 21 |   - attrs=19.2.0=py_0
 22 |   - babel=2.7.0=py_0
 23 |   - backcall=0.1.0=py37_0
 24 |   - backports=1.0=py_2
 25 |   - backports.functools_lru_cache=1.6.1=py_0
 26 |   - backports.os=0.1.1=py37_0
 27 |   - backports.shutil_get_terminal_size=1.0.0=py37_2
 28 |   - backports.tempfile=1.0=py_1
 29 |   - backports.weakref=1.0.post1=py_1
 30 |   - beautifulsoup4=4.8.0=py37_0
 31 |   - bitarray=1.0.1=py37h7b6447c_0
 32 |   - bkcharts=0.2=py37_0
 33 |   - blas=1.0=mkl
 34 |   - bleach=3.1.0=py37_0
 35 |   - blosc=1.16.3=hd408876_0
 36 |   - bokeh=1.3.4=py37_0
 37 |   - boto=2.49.0=py37_0
 38 |   - bottleneck=1.2.1=py37h035aef0_1
 39 |   - bzip2=1.0.8=h7b6447c_0
 40 |   - ca-certificates=2019.8.28=0
 41 |   - cairo=1.14.12=h8948797_3
 42 |   - catalogue=0.0.8=py_0
 43 |   - certifi=2019.9.11=py37_0
 44 |   - cffi=1.12.3=py37h2e261b9_0
 45 |   - chardet=3.0.4=py37_1003
 46 |   - click=7.0=py37_0
 47 |   - cloog=0.18.1=1
 48 |   - cloudpickle=1.2.2=py_0
 49 |   - clyent=1.2.2=py37_1
 50 |   - colorama=0.4.1=py37_0
 51 |   - conda=4.8.3=py37hc8dfbb8_1
 52 |   - conda-build=3.18.9=py37_3
 53 |   - conda-env=2.6.0=1
 54 |   - conda-package-handling=1.6.0=py37h7b6447c_0
 55 |   - conda-verify=3.4.2=py_1
 56 |   - contextlib2=0.6.0=py_0
 57 |   - cryptography=2.7=py37h1ba5d50_0
 58 |   - cudatoolkit=10.1.243=h6bb024c_0
 59 |   - curl=7.65.3=hbc83047_0
 60 |   - cycler=0.10.0=py37_0
 61 |   - cymem=2.0.3=py37he1b5a44_0
 62 |   - cython=0.29.13=py37he6710b0_0
 63 |   - cython-blis=0.4.1=py37h516909a_0
 64 |   - cytoolz=0.10.0=py37h7b6447c_0
 65 |   - dask=2.5.2=py_0
 66 |   - dask-core=2.5.2=py_0
 67 |   - dataclasses=0.7=py37_0
 68 |   - dbus=1.13.6=h746ee38_0
 69 |   - decorator=4.4.0=py37_1
 70 |   - defusedxml=0.6.0=py_0
 71 |   - distributed=2.5.2=py_0
 72 |   - docutils=0.15.2=py37_0
 73 |   - entrypoints=0.3=py37_0
 74 |   - et_xmlfile=1.0.1=py37_0
 75 |   - expat=2.2.6=he6710b0_0
 76 |   - fastcache=1.1.0=py37h7b6447c_0
 77 |   - filelock=3.0.12=py_0
 78 |   - flask=1.1.1=py_0
 79 |   - fontconfig=2.13.0=h9420a91_0
 80 |   - freetype=2.9.1=h8a8886c_1
 81 |   - fribidi=1.0.5=h7b6447c_0
 82 |   - fsspec=0.5.2=py_0
 83 |   - future=0.18.2=py37_0
 84 |   - gcc_49=4.9.1=6
 85 |   - get_terminal_size=1.0.0=haa9412d_0
 86 |   - gevent=1.4.0=py37h7b6447c_0
 87 |   - glib=2.56.2=hd408876_0
 88 |   - glob2=0.7=py_0
 89 |   - gmp=6.1.2=h6c8ec71_1
 90 |   - gmpy2=2.0.8=py37h10f8cd9_2
 91 |   - graphite2=1.3.13=h23475e2_0
 92 |   - greenlet=0.4.15=py37h7b6447c_0
 93 |   - gst-plugins-base=1.14.0=hbbd80ab_1
 94 |   - gstreamer=1.14.0=hb453b48_1
 95 |   - gurobi=9.0.1=py37_0
 96 |   - h5py=2.9.0=py37h7918eee_0
 97 |   - harfbuzz=1.8.8=hffaf4a1_0
 98 |   - hdf5=1.10.4=hb1b8bf9_0
 99 |   - heapdict=1.0.1=py_0
100 |   - html5lib=1.0.1=py37_0
101 |   - icu=58.2=h9c2bf20_1
102 |   - idna=2.8=py37_0
103 |   - imageio=2.6.0=py37_0
104 |   - imagesize=1.1.0=py37_0
105 |   - importlib_metadata=0.23=py37_0
106 |   - intel-openmp=2019.4=243
107 |   - ipykernel=5.1.2=py37h39e3cac_0
108 |   - ipython=7.8.0=py37h39e3cac_0
109 |   - ipython_genutils=0.2.0=py37_0
110 |   - ipywidgets=7.5.1=py_0
111 |   - isl=0.12.2=0
112 |   - isort=4.3.21=py37_0
113 |   - itsdangerous=1.1.0=py37_0
114 |   - jbig=2.1=hdba287a_0
115 |   - jdcal=1.4.1=py_0
116 |   - jedi=0.15.1=py37_0
117 |   - jeepney=0.4.1=py_0
118 |   - jinja2=2.10.3=py_0
119 |   - joblib=0.13.2=py37_0
120 |   - jpeg=9b=h024ee3a_2
121 |   - json5=0.8.5=py_0
122 |   - jsonschema=3.0.2=py37_0
123 |   - jupyter=1.0.0=py37_7
124 |   - jupyter_client=5.3.3=py37_1
125 |   - jupyter_console=6.0.0=py37_0
126 |   - jupyter_core=4.5.0=py_0
127 |   - jupyterlab=1.1.4=pyhf63ae98_0
128 |   - jupyterlab_server=1.0.6=py_0
129 |   - keyring=18.0.0=py37_0
130 |   - kiwisolver=1.1.0=py37he6710b0_0
131 |   - krb5=1.16.1=h173b8e3_7
132 |   - lazy-object-proxy=1.4.2=py37h7b6447c_0
133 |   - libarchive=3.3.3=h5d8350f_5
134 |   - libcurl=7.65.3=h20c2e04_0
135 |   - libedit=3.1.20181209=hc058e9b_0
136 |   - libffi=3.2.1=hd88cf55_4
137 |   - libgcc-ng=9.1.0=hdf63c60_0
138 |   - libgfortran-ng=7.3.0=hdf63c60_0
139 |   - liblief=0.9.0=h7725739_2
140 |   - libpng=1.6.37=hbc83047_0
141 |   - libsodium=1.0.16=h1bed415_0
142 |   - libssh2=1.8.2=h1ba5d50_0
143 |   - libstdcxx-ng=9.1.0=hdf63c60_0
144 |   - libtiff=4.0.10=h2733197_2
145 |   - libtool=2.4.6=h7b6447c_5
146 |   - libuuid=1.0.3=h1bed415_2
147 |   - libxcb=1.13=h1bed415_1
148 |   - libxml2=2.9.9=hea5a465_1
149 |   - libxslt=1.1.33=h7d1a2b0_0
150 |   - llvmlite=0.29.0=py37hd408876_0
151 |   - locket=0.2.0=py37_1
152 |   - lxml=4.4.1=py37hefd8a0e_0
153 |   - lz4-c=1.8.1.2=h14c3975_0
154 |   - lzo=2.10=h49e0be7_2
155 |   - markupsafe=1.1.1=py37h7b6447c_0
156 |   - matplotlib=3.1.1=py37h5429711_0
157 |   - mccabe=0.6.1=py37_1
158 |   - mistune=0.8.4=py37h7b6447c_0
159 |   - mkl=2019.4=243
160 |   - mkl-service=2.3.0=py37he904b0f_0
161 |   - mkl_fft=1.0.14=py37ha843d7b_0
162 |   - mkl_random=1.1.0=py37hd6b4f25_0
163 |   - mock=3.0.5=py37_0
164 |   - more-itertools=7.2.0=py37_0
165 |   - mpc=1.1.0=h10f8cd9_1
166 |   - mpfr=4.0.1=hdf1c602_3
167 |   - mpmath=1.1.0=py37_0
168 |   - msgpack-python=0.6.1=py37hfd86e86_1
169 |   - multipledispatch=0.6.0=py37_0
170 |   - murmurhash=1.0.0=py37he1b5a44_0
171 |   - navigator-updater=0.2.1=py37_0
172 |   - nbconvert=5.6.0=py37_1
173 |   - nbformat=4.4.0=py37_0
174 |   - ncurses=6.1=he6710b0_1
175 |   - networkx=2.3=py_0
176 |   - ninja=1.9.0=py37hfd86e86_0
177 |   - nltk=3.4.5=py37_0
178 |   - nose=1.3.7=py37_2
179 |   - notebook=6.0.1=py37_0
180 |   - numba=0.45.1=py37h962f231_0
181 |   - numexpr=2.7.0=py37h9e4a6bb_0
182 |   - numpy=1.17.2=py37haad9e8e_0
183 |   - numpy-base=1.17.2=py37hde5b4d6_0
184 |   - numpydoc=0.9.1=py_0
185 |   - olefile=0.46=py37_0
186 |   - openpyxl=3.0.0=py_0
187 |   - openssl=1.1.1d=h7b6447c_2
188 |   - packaging=19.2=py_0
189 |   - pandas=0.25.1=py37he6710b0_0
190 |   - pandoc=2.2.3.2=0
191 |   - pandocfilters=1.4.2=py37_1
192 |   - pango=1.42.4=h049681c_0
193 |   - parso=0.5.1=py_0
194 |   - partd=1.0.0=py_0
195 |   - patchelf=0.9=he6710b0_3
196 |   - path.py=12.0.1=py_0
197 |   - pathlib2=2.3.5=py37_0
198 |   - patsy=0.5.1=py37_0
199 |   - pcre=8.43=he6710b0_0
200 |   - pep8=1.7.1=py37_0
201 |   - pexpect=4.7.0=py37_0
202 |   - pickleshare=0.7.5=py37_0
203 |   - pillow=6.2.0=py37h34e0f95_0
204 |   - pip=19.2.3=py37_0
205 |   - pixman=0.38.0=h7b6447c_0
206 |   - pkginfo=1.5.0.1=py37_0
207 |   - plac=0.9.6=py37_0
208 |   - pluggy=0.13.0=py37_0
209 |   - ply=3.11=py37_0
210 |   - preshed=3.0.2=py37he1b5a44_1
211 |   - prometheus_client=0.7.1=py_0
212 |   - prompt_toolkit=2.0.10=py_0
213 |   - psutil=5.6.3=py37h7b6447c_0
214 |   - ptyprocess=0.6.0=py37_0
215 |   - py=1.8.0=py37_0
216 |   - py-lief=0.9.0=py37h7725739_2
217 |   - pycodestyle=2.5.0=py37_0
218 |   - pycosat=0.6.3=py37h14c3975_0
219 |   - pycparser=2.19=py37_0
220 |   - pycrypto=2.6.1=py37h14c3975_9
221 |   - pycurl=7.43.0.3=py37h1ba5d50_0
222 |   - pyflakes=2.1.1=py37_0
223 |   - pygments=2.4.2=py_0
224 |   - pylint=2.4.2=py37_0
225 |   - pyodbc=4.0.27=py37he6710b0_0
226 |   - pyopenssl=19.0.0=py37_0
227 |   - pyparsing=2.4.2=py_0
228 |   - pyqt=5.9.2=py37h05f1152_2
229 |   - pyrsistent=0.15.4=py37h7b6447c_0
230 |   - pysocks=1.7.1=py37_0
231 |   - pytables=3.5.2=py37h71ec239_1
232 |   - pytest=5.2.1=py37_0
233 |   - pytest-arraydiff=0.3=py37h39e3cac_0
234 |   - pytest-astropy=0.5.0=py37_0
235 |   - pytest-doctestplus=0.4.0=py_0
236 |   - pytest-openfiles=0.4.0=py_0
237 |   - pytest-remotedata=0.3.2=py37_0
238 |   - python=3.7.4=h265db76_1
239 |   - python-dateutil=2.8.0=py37_0
240 |   - python-libarchive-c=2.8=py37_13
241 |   - python_abi=3.7=1_cp37m
242 |   - pytorch=1.3.1=py3.7_cuda10.1.243_cudnn7.6.3_0
243 |   - pytz=2019.3=py_0
244 |   - pywavelets=1.0.3=py37hdd07704_1
245 |   - pyyaml=5.1.2=py37h7b6447c_0
246 |   - pyzmq=18.1.0=py37he6710b0_0
247 |   - qt=5.9.7=h5867ecd_1
248 |   - qtawesome=0.6.0=py_0
249 |   - qtconsole=4.5.5=py_0
250 |   - qtpy=1.9.0=py_0
251 |   - readline=7.0=h7b6447c_5
252 |   - requests=2.22.0=py37_0
253 |   - ripgrep=0.10.0=hc07d326_0
254 |   - rope=0.14.0=py_0
255 |   - ruamel_yaml=0.15.46=py37h14c3975_0
256 |   - scikit-image=0.15.0=py37he6710b0_0
257 |   - scikit-learn=0.21.3=py37hd81dba3_0
258 |   - scipy=1.3.1=py37h7c811a0_0
259 |   - seaborn=0.9.0=py37_0
260 |   - secretstorage=3.1.1=py37_0
261 |   - send2trash=1.5.0=py37_0
262 |   - setuptools=41.4.0=py37_0
263 |   - simplegeneric=0.8.1=py37_2
264 |   - singledispatch=3.4.0.3=py37_0
265 |   - sip=4.19.8=py37hf484d3e_0
266 |   - six=1.12.0=py37_0
267 |   - snappy=1.1.7=hbae5bb6_3
268 |   - snowballstemmer=2.0.0=py_0
269 |   - sortedcollections=1.1.2=py37_0
270 |   - sortedcontainers=2.1.0=py37_0
271 |   - soupsieve=1.9.3=py37_0
272 |   - spacy=2.2.3=py37hc9558a2_0
273 |   - sphinx=2.2.0=py_0
274 |   - sphinxcontrib=1.0=py37_1
275 |   - sphinxcontrib-applehelp=1.0.1=py_0
276 |   - sphinxcontrib-devhelp=1.0.1=py_0
277 |   - sphinxcontrib-htmlhelp=1.0.2=py_0
278 |   - sphinxcontrib-jsmath=1.0.1=py_0
279 |   - sphinxcontrib-qthelp=1.0.2=py_0
280 |   - sphinxcontrib-serializinghtml=1.1.3=py_0
281 |   - sphinxcontrib-websupport=1.1.2=py_0
282 |   - spyder=3.3.6=py37_0
283 |   - spyder-kernels=0.5.2=py37_0
284 |   - sqlalchemy=1.3.9=py37h7b6447c_0
285 |   - sqlite=3.30.0=h7b6447c_0
286 |   - srsly=0.2.0=py37he1b5a44_0
287 |   - statsmodels=0.10.1=py37hdd07704_0
288 |   - sympy=1.4=py37_0
289 |   - tbb=2019.4=hfd86e86_0
290 |   - tblib=1.4.0=py_0
291 |   - terminado=0.8.2=py37_0
292 |   - testpath=0.4.2=py37_0
293 |   - thinc=7.3.0=py37hc9558a2_0
294 |   - tk=8.6.8=hbc83047_0
295 |   - toolz=0.10.0=py_0
296 |   - tornado=6.0.3=py37h7b6447c_0
297 |   - traitlets=4.3.3=py37_0
298 |   - unicodecsv=0.14.1=py37_0
299 |   - unixodbc=2.3.7=h14c3975_0
300 |   - urllib3=1.24.2=py37_0
301 |   - wasabi=0.4.0=py_0
302 |   - wcwidth=0.1.7=py37_0
303 |   - webencodings=0.5.1=py37_1
304 |   - werkzeug=0.16.0=py_0
305 |   - wheel=0.33.6=py37_0
306 |   - widgetsnbextension=3.5.1=py37_0
307 |   - wrapt=1.11.2=py37h7b6447c_0
308 |   - wurlitzer=1.0.3=py37_0
309 |   - xlrd=1.2.0=py37_0
310 |   - xlsxwriter=1.2.1=py_0
311 |   - xlwt=1.3.0=py37_0
312 |   - xz=5.2.4=h14c3975_4
313 |   - yaml=0.1.7=had09818_2
314 |   - zeromq=4.3.1=he6710b0_3
315 |   - zict=1.0.0=py_0
316 |   - zipp=0.6.0=py_0
317 |   - zlib=1.2.11=h7b6447c_3
318 |   - zstd=1.3.7=h0b5b093_0
319 |   - pip:
320 |     - en-core-web-sm-mirror==2.2.5
321 |     - frozendict==1.2
322 |     - immutablecollections==0.9.0
323 |     - sacremoses==0.0.38
324 |     - sentencepiece==0.1.85
325 |     - tokenizers==0.0.11
326 |     - torchvision==0.2.2
327 |     - tqdm==4.19.9
328 |     - transformers==2.3.0
329 |     - typing-extensions==3.7.4.1
330 |     - vistautils==0.17.0
331 | prefix: /nas/home/mingyuma/miniconda3/envs/event-pipeline
332 | 
333 | 


--------------------------------------------------------------------------------
/component/component_envs/req_better.txt:
--------------------------------------------------------------------------------
1 | allennlp==0.9.0
2 | pytorch-crf==0.7.2
3 | pytorch-nlp==0.5.0
4 | seqeval==0.0.12
5 | sklearn==0.0
6 | tensorboardX==2.0
7 | torch==1.4.0
8 | transformers==2.4.1


--------------------------------------------------------------------------------
/component/component_envs/req_biomed.txt:
--------------------------------------------------------------------------------
  1 | attrs==19.3.0
  2 | awscli==1.18.84
  3 | backcall==0.2.0
  4 | bleach==3.1.5
  5 | blis==0.4.1
  6 | boto3==1.14.7
  7 | botocore==1.17.7
  8 | catalogue==1.0.0
  9 | certifi==2020.4.5.2
 10 | chardet==3.0.4
 11 | click==7.1.2
 12 | colorama==0.4.3
 13 | conllu==3.0
 14 | cycler==0.10.0
 15 | cymem==2.0.3
 16 | decorator==4.4.2
 17 | defusedxml==0.6.0
 18 | docutils==0.15.2
 19 | en-ner-jnlpba-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz
 20 | entrypoints==0.3
 21 | filelock==3.0.12
 22 | googledrivedownloader==0.4
 23 | h5py==2.10.0
 24 | idna==2.9
 25 | imageio==2.8.0
 26 | importlib-metadata==1.6.1
 27 | ipykernel==5.3.0
 28 | ipython==7.15.0
 29 | ipython-genutils==0.2.0
 30 | ipywidgets==7.5.1
 31 | isodate==0.6.0
 32 | jedi==0.17.1
 33 | Jinja2==2.11.2
 34 | jmespath==0.10.0
 35 | joblib==0.14.1
 36 | jsonschema==3.2.0
 37 | jupyter==1.0.0
 38 | jupyter-client==6.1.3
 39 | jupyter-console==6.1.0
 40 | jupyter-core==4.6.3
 41 | kiwisolver==1.2.0
 42 | llvmlite==0.33.0
 43 | MarkupSafe==1.1.1
 44 | matplotlib==3.2.2
 45 | mistune==0.8.4
 46 | murmurhash==1.0.2
 47 | nbconvert==5.6.1
 48 | nbformat==5.0.7
 49 | networkx==2.4
 50 | nmslib==2.0.6
 51 | notebook==6.0.3
 52 | numba==0.50.0
 53 | numpy==1.16.0
 54 | packaging==20.4
 55 | pandas==0.24.2
 56 | pandocfilters==1.4.2
 57 | parso==0.7.0
 58 | pexpect==4.8.0
 59 | pickleshare==0.7.5
 60 | Pillow==7.1.2
 61 | plac==1.1.3
 62 | plyfile==0.7.2
 63 | preshed==3.0.2
 64 | prometheus-client==0.8.0
 65 | prompt-toolkit==3.0.5
 66 | psutil==5.7.0
 67 | ptyprocess==0.6.0
 68 | pyasn1==0.4.8
 69 | pybind11==2.5.0
 70 | Pygments==2.6.1
 71 | pyparsing==2.4.7
 72 | pyrsistent==0.16.0
 73 | pysbd==0.2.3
 74 | python-dateutil==2.8.1
 75 | pytz==2020.1
 76 | PyWavelets==1.1.1
 77 | PyYAML==5.3.1
 78 | pyzmq==19.0.1
 79 | qtconsole==4.7.4
 80 | QtPy==1.9.0
 81 | rdflib==5.0.0
 82 | regex==2020.6.8
 83 | requests==2.24.0
 84 | rsa==3.4.2
 85 | s3transfer==0.3.3
 86 | sacremoses==0.0.43
 87 | scikit-image==0.17.2
 88 | scikit-learn==0.20.3
 89 | scipy==1.4.1
 90 | scispacy==0.2.4
 91 | Send2Trash==1.5.0
 92 | sentencepiece==0.1.91
 93 | six==1.15.0
 94 | spacy==2.2.4
 95 | srsly==1.0.2
 96 | terminado==0.8.3
 97 | testpath==0.4.4
 98 | thinc==7.4.0
 99 | threadpoolctl==2.1.0
100 | tifffile==2020.6.3
101 | tokenizers==0.5.0
102 | torch==1.4.0
103 | torch-geometric==1.4.3
104 | tornado==6.0.4
105 | tqdm==4.46.1
106 | traitlets==4.3.3
107 | transformers==2.5.0
108 | urllib3==1.25.9
109 | wasabi==0.6.0
110 | wcwidth==0.2.4
111 | webencodings==0.5.1
112 | widgetsnbextension==3.5.1
113 | zipp==3.1.0


--------------------------------------------------------------------------------
/project/APIs/coref.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import argparse
  3 | import numpy as np
  4 | from allennlp.predictors.predictor import Predictor
  5 | 
  6 | class NumpyEncoder(json.JSONEncoder):
  7 |     """ Custom encoder for numpy data types """
  8 |     def default(self, obj):
  9 |         if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
 10 |                             np.int16, np.int32, np.int64, np.uint8,
 11 |                             np.uint16, np.uint32, np.uint64)):
 12 | 
 13 |             return int(obj)
 14 | 
 15 |         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
 16 |             return float(obj)
 17 | 
 18 |         elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
 19 |             return {'real': obj.real, 'imag': obj.imag}
 20 | 
 21 |         elif isinstance(obj, (np.ndarray,)):
 22 |             return obj.tolist()
 23 | 
 24 |         elif isinstance(obj, (np.bool_)):
 25 |             return bool(obj)
 26 | 
 27 |         elif isinstance(obj, (np.void)): 
 28 |             return None
 29 | 
 30 |         return json.JSONEncoder.default(self, obj)
 31 | 
 32 | def get_coference(doc):
 33 |     pred = predictor.predict(document = doc)
 34 |     clusters = pred['clusters']
 35 |     document = pred['document'] 
 36 |     top_spans = pred['top_spans']
 37 | 
 38 |     # find the main span for each cluster
 39 |     clusters_top_span = []
 40 |     for i in range(0, len(clusters)):
 41 |         one_cl = clusters[i]
 42 |         span_rank = [top_spans.index(span) for span in one_cl]
 43 |         top_span = np.argmin(span_rank)
 44 |         clusters_top_span.append(one_cl[top_span])
 45 |     pred['clusters_top_span'] = clusters_top_span
 46 | 
 47 |     # convert top span for each cluster to text
 48 |     clusters_top_span_text = []
 49 |     for each_top_span in clusters_top_span:
 50 |         span_text = document[each_top_span[0]:(each_top_span[1]+1)]
 51 |         clusters_top_span_text.append(span_text)
 52 |     pred['clusters_top_span_text'] = clusters_top_span_text
 53 |     
 54 |     return pred
 55 | 
 56 | def save(args, result_json):
 57 |     # result_json = {
 58 |     #     'error_list': not_done_list,
 59 |     #     'result_list': result_list
 60 |     # }
 61 |     with open(args.save_path_json, 'w', encoding='utf-8') as f:
 62 |         # Use NumpyEncoder to convert numpy data to list
 63 |         # Previous error: Object of type int64 is not JSON serializable
 64 |         json.dump(result_json, f, indent=4, ensure_ascii=False,
 65 |                     cls=NumpyEncoder)
 66 |     print ('Saved')
 67 | 
 68 | if __name__ == '__main__':
 69 |     p = argparse.ArgumentParser()
 70 |     p.add_argument('-data', type=str, default='../../raw_text/test.0622_pipelined.json')
 71 |     p.add_argument('-save_path_json', type=str, default='../../raw_text/test.0622_pipelined_coref.json')
 72 |     args = p.parse_args()
 73 | 
 74 |     data = json.load(open(args.data))
 75 |     # load AllenNLP predictor
 76 |     predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
 77 | 
 78 |     docs = data['result_list']
 79 | 
 80 |     for doc_num, doc in enumerate(docs):
 81 |         # ensemble the document from sentences
 82 |         doc_text_list = []
 83 |         doc_text_len = [0]
 84 |         for sen in doc:
 85 |             print(sen['tokens'])
 86 |             doc_text_list.append(sen['sentence'])
 87 |             doc_text_len.append(len(sen['tokens']))
 88 |         doc_text = ' '.join(doc_text_list)
 89 |         for i, num in enumerate(doc_text_len):
 90 |             if i >= 1:
 91 |                 doc_text_len[i] += doc_text_len[i - 1]
 92 |         sens_idx_beg = doc_text_len[:-1]
 93 |         sens_idx_end = doc_text_len[1:]
 94 |         # sens_idx_beg saves the beginning token idx of each sentence in the doc
 95 |         # sens_idx_end saves the ending token idx of each sentence in the doc
 96 |         print(doc_text)
 97 |         print(sens_idx_beg)
 98 |         print(sens_idx_end)
 99 |         
100 |         # get coreference result for the document
101 |         coref_pred = get_coference(doc_text)
102 |         print(coref_pred)
103 |         
104 |         # save coref result to json
105 |         for i_cluster, cluster in enumerate(coref_pred['clusters']):
106 |             print('------')
107 |             print(cluster)
108 |             for mention in cluster:
109 |                 print(mention)
110 |                 # identify which sentence that this mention belongs to
111 |                 sen_nums = [i for i, beg in enumerate(sens_idx_beg) if mention[0] >= beg and mention[0] < sens_idx_end[i]]
112 |                 for sen_num in sen_nums:
113 |                     mention_idx_in_this_sen = [i - sens_idx_beg[sen_num] for i in mention]
114 |                     events_of_this_sen = data['result_list'][doc_num][sen_num]['events']
115 |                     for i_e, e in enumerate(events_of_this_sen):
116 |                         for i_arg, arg_obj in enumerate(e['arguments']):
117 |                             if arg_obj['start_token'] == mention_idx_in_this_sen[0] and arg_obj['end_token'] == mention_idx_in_this_sen[1]:
118 |                                 # this argument is exactly the one need to update its text to co-referenced span
119 |                                 data['result_list'][doc_num][sen_num]['events'][i_e]['arguments'][i_arg]["text"] = " ".join(coref_pred['clusters_top_span_text'][i_cluster])
120 | 
121 |     save(args, data)


--------------------------------------------------------------------------------
/project/APIs/coref_pre.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import argparse
  3 | import numpy as np
  4 | from allennlp.predictors.predictor import Predictor
  5 | 
  6 | class NumpyEncoder(json.JSONEncoder):
  7 |     """ Custom encoder for numpy data types """
  8 |     def default(self, obj):
  9 |         if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
 10 |                             np.int16, np.int32, np.int64, np.uint8,
 11 |                             np.uint16, np.uint32, np.uint64)):
 12 | 
 13 |             return int(obj)
 14 | 
 15 |         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
 16 |             return float(obj)
 17 | 
 18 |         elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
 19 |             return {'real': obj.real, 'imag': obj.imag}
 20 | 
 21 |         elif isinstance(obj, (np.ndarray,)):
 22 |             return obj.tolist()
 23 | 
 24 |         elif isinstance(obj, (np.bool_)):
 25 |             return bool(obj)
 26 | 
 27 |         elif isinstance(obj, (np.void)): 
 28 |             return None
 29 | 
 30 |         return json.JSONEncoder.default(self, obj)
 31 | 
 32 | def get_coference(doc):
 33 |     pred = predictor.predict(document = doc)
 34 |     clusters = pred['clusters']
 35 |     document = pred['document'] 
 36 |     top_spans = pred['top_spans']
 37 | 
 38 |     # find the main span for each cluster
 39 |     clusters_top_span = []
 40 |     for i in range(0, len(clusters)):
 41 |         one_cl = clusters[i]
 42 |         span_rank = [top_spans.index(span) for span in one_cl]
 43 |         top_span = np.argmin(span_rank)
 44 |         clusters_top_span.append(one_cl[top_span])
 45 |     pred['clusters_top_span'] = clusters_top_span
 46 | 
 47 |     # convert top span for each cluster to text
 48 |     clusters_top_span_text = []
 49 |     for each_top_span in clusters_top_span:
 50 |         span_text = document[each_top_span[0]:(each_top_span[1]+1)]
 51 |         clusters_top_span_text.append(span_text)
 52 |     pred['clusters_top_span_text'] = clusters_top_span_text
 53 |     
 54 |     return pred
 55 | 
 56 | def save(args, result_json):
 57 |     # result_json = {
 58 |     #     'error_list': not_done_list,
 59 |     #     'result_list': result_list
 60 |     # }
 61 |     with open(args.save_path_json, 'w', encoding='utf-8') as f:
 62 |         # Use NumpyEncoder to convert numpy data to list
 63 |         # Previous error: Object of type int64 is not JSON serializable
 64 |         json.dump(result_json, f, indent=4, ensure_ascii=False,
 65 |                     cls=NumpyEncoder)
 66 |     print ('Saved')
 67 | 
 68 | def save_txt(args, docs_list):
 69 |     with open(args.save_path, 'w', encoding='utf-8') as f:
 70 |         f.write('\n'.join(docs_list))
 71 | 
 72 | if __name__ == '__main__':
 73 |     p = argparse.ArgumentParser()
 74 |     p.add_argument('-data', type=str, default='../../raw_text/test.0622.txt')
 75 |     p.add_argument('-save_path', type=str, default='../../raw_text/test.0622_coref_replaced.txt')
 76 |     args = p.parse_args()
 77 | 
 78 |     with (open(args.data, "r")) as f:
 79 |         docs = [line.rstrip() for line in f]
 80 |     # load AllenNLP predictor
 81 |     predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
 82 | 
 83 |     for doc_num, doc_text in enumerate(docs):
 84 |         # get coreference result for the document
 85 |         coref_pred = get_coference(doc_text)
 86 |         print('*' * 20)
 87 |         print(coref_pred)
 88 |         doc_tokens = coref_pred['document']
 89 |         
 90 |         # replace coref mention to main mention
 91 |         for i_cluster, cluster in enumerate(coref_pred['clusters']):
 92 |             print('------')
 93 |             print(cluster)
 94 |             for mention in cluster:
 95 |                 print("--> mention: %s" % mention)
 96 |                 # replace each token in the mention range to empty
 97 |                 for i in range(mention[0], mention[1]+1):
 98 |                     doc_tokens[i] = ''
 99 |                 # replace the first token with the main mention
100 |                 doc_tokens[mention[0]] = ' '.join(coref_pred['clusters_top_span_text'][i_cluster])
101 |                 print(doc_tokens)
102 |         docs[doc_num] = ' '.join([i for i in doc_tokens if i])
103 |         print("Replaced docs: %s" % docs[doc_num])
104 | 
105 |     # save(args, data)
106 |     save_txt(args, docs)


--------------------------------------------------------------------------------
/project/APIs/test_on_ace_data.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Run this script to run the API on the entire ACE dev and test data
 3 | 
 4 | # Under project
 5 | python APIs/test_on_ace_data.py
 6 | """
 7 | 
 8 | import pickle
 9 | import sys
10 | import os
11 | import argparse
12 | import json
13 | from main import EventAPIs
14 | 
def _json_default(obj):
    """``json.dump`` fallback: convert NumPy-like scalars/arrays to built-ins."""
    # NumPy scalars and arrays both expose tolist(); duck typing avoids adding
    # a numpy import to this script.
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError('Object of type %s is not JSON serializable' % type(obj).__name__)


def save(args, result_list, not_done_list):
    """Persist the results as a pickle and as a JSON report.

    ``result_list`` is pickled to ``args.save_path``. A dict with keys
    ``'error_list'`` (``not_done_list``) and ``'result_list'`` is written to
    ``args.save_path_json``. Values exposing ``tolist()`` (e.g. NumPy int64)
    are converted instead of aborting the dump with
    "Object of type int64 is not JSON serializable".
    """
    with open(args.save_path, 'wb') as f:
        pickle.dump(result_list, f)

    result_json = {
        'error_list': not_done_list,
        'result_list': result_list
    }
    with open(args.save_path_json, 'w', encoding='utf-8') as f:
        json.dump(result_json, f, indent=4, default=_json_default)
    print ('Saved')
26 | 
27 | 
if __name__ == '__main__':
    # Script entry point: run EventAPIs.analyze on every text in the pickled
    # input and checkpoint the collected results every 20 items.
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', type=str, default='../../ace_data/ace_rawtext_dev.pkl')
    # parser.add_argument('-data', type=str, default='../../ace_data/ace_rawtext_test.pkl')
    parser.add_argument('-save_path', type=str, default='../../ace_data/ace_rawtext_dev_pipelined.pkl')
    parser.add_argument('-save_path_json', type=str, default='../../ace_data/ace_rawtext_dev_pipelined.json')
    args = parser.parse_args()
    eventAPIs = EventAPIs()
    print ('Loaded class')
    not_done_list = []

    # NOTE: pickle.load can execute arbitrary code; only use trusted input files.
    with open(args.data, "rb") as f:
        data = pickle.load(f)

    print ('Total sentences: ', len(data))
    result_list = []
    for i, text in enumerate(data):
        print ('='*40, i)
        request = {'text': text, 'domain': 'news'}
        try:
            result_list.append(eventAPIs.analyze(request))
        except Exception as e:
            # Best effort: log the failure, remember the index, keep going.
            print('?'*60)
            print('Error for this text: ', text)
            print(str(e))
            not_done_list.append(i)
        # Periodic checkpoint (also fires for the very first item, i == 0).
        if not i % 20:
            save(args, result_list, not_done_list)

    # print (result_list)
    save(args, result_list, not_done_list)
    print ('Not successfuly text:')
    print (not_done_list)


--------------------------------------------------------------------------------
/project/APIs/test_on_raw_text.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Run this script to run the API on raw text input
  3 | 
  4 | # Under project
  5 | python APIs/test_on_raw_text.py
  6 | """
  7 | 
  8 | import pickle
  9 | import sys
 10 | import os
 11 | import argparse
 12 | import json
 13 | from main import EventAPIs
 14 | from nltk import tokenize
 15 | import numpy as np
 16 | 
 17 | class NumpyEncoder(json.JSONEncoder):
 18 |     """ Custom encoder for numpy data types """
 19 |     def default(self, obj):
 20 |         if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
 21 |                             np.int16, np.int32, np.int64, np.uint8,
 22 |                             np.uint16, np.uint32, np.uint64)):
 23 | 
 24 |             return int(obj)
 25 | 
 26 |         elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
 27 |             return float(obj)
 28 | 
 29 |         elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
 30 |             return {'real': obj.real, 'imag': obj.imag}
 31 | 
 32 |         elif isinstance(obj, (np.ndarray,)):
 33 |             return obj.tolist()
 34 | 
 35 |         elif isinstance(obj, (np.bool_)):
 36 |             return bool(obj)
 37 | 
 38 |         elif isinstance(obj, (np.void)): 
 39 |             return None
 40 | 
 41 |         return json.JSONEncoder.default(self, obj)
 42 | 
 43 | def save(args, result_list, not_done_list):
 44 |     with open(args.save_path, 'wb') as f:
 45 |         pickle.dump(result_list, f)
 46 | 
 47 |     result_json = {
 48 |         'error_list': not_done_list,
 49 |         'result_list': result_list
 50 |     }
 51 |     with open(args.save_path_json, 'w', encoding='utf-8') as f:
 52 |         # Use NumpyEncoder to convert numpy data to list
 53 |         # Previous error: Object of type int64 is not JSON serializable
 54 |         json.dump(result_json, f, indent=4, ensure_ascii=False,
 55 |                     cls=NumpyEncoder)
 56 |     print ('Saved')
 57 | 
 58 | 
 59 | if __name__ == '__main__':
 60 |     p = argparse.ArgumentParser()
 61 |     p.add_argument('-data', type=str, default='../../raw_text/julsepscan.txt')
 62 |     # p.add_argument('-data', type=str, default='../../ace_data/ace_rawtext_test.pkl')
 63 |     p.add_argument('-save_path', type=str, default='../../raw_text/julsepscan_pipelined.pkl')
 64 |     p.add_argument('-save_path_json', type=str, default='../../raw_text/julsepscan_pipelined.json')
 65 |     p.add_argument('-negation_detection', action='store_true', default=True,
 66 |                     help='Whether detection negation cue and scope resolution')
 67 |     args = p.parse_args()
 68 | 
 69 |     if args.negation_detection:
 70 |         eventAPIs = EventAPIs(negation_detection=True)
 71 |     else:
 72 |         eventAPIs = EventAPIs(negation_detection=False)
 73 |     print ('Loaded class')
 74 |     not_done_list = []
 75 | 
 76 |     if args.data.split('.')[-1] == 'pkl':
 77 |         with (open(args.data, "rb")) as f:
 78 |             data = pickle.load(f)
 79 |     elif args.data.split('.')[-1] == 'txt':
 80 |         with (open(args.data, "r")) as f:
 81 |             linelist = [line.rstrip() for line in f]
 82 |             data = []
 83 |             total_sen = 0
 84 |             # convert row text to list of sentences
 85 |             for line in linelist:
 86 |                 sen_list = []
 87 |                 if line != '':
 88 |                     # divide to sentences
 89 |                     sen_list = tokenize.sent_tokenize(line)
 90 |                 data.append(sen_list)
 91 |                 total_sen += len(sen_list)
 92 |             # print(data[0:100])
 93 |             # with open('../../raw_text/aprjunscan.pkl', 'wb') as f:
 94 |             #     pickle.dump(data, f)
 95 |             print ('Total sentences: ', total_sen)
 96 | 
 97 |     print ('Total lines: ', len(data))
 98 |     result_list = []
 99 |     for i_line, sen_list in enumerate(data):
100 |         result_list_this_line = []
101 |         for i_sen, text in enumerate(sen_list):
102 |             print ('='*40, 'line num: ', i_line, "; sen num: ", i_sen)
103 |             params_this = {
104 |                 'text': text,
105 |                 'domain': 'news'
106 |             }
107 |             try:
108 |                 combined_result = eventAPIs.analyze(params_this)
109 |                 combined_result['line_num'] = i_line
110 |                 combined_result['sen_num'] = i_sen
111 |                 combined_result['sentence'] = text
112 |                 result_list_this_line.append(combined_result)
113 |             except Exception as e:
114 |                 print('?'*60)
115 |                 print('Error for this text: ', text)
116 |                 print(str(e))
117 |                 not_done_list.append([i_line, i_sen])
118 |         result_list.append(result_list_this_line)
119 |         if i_line % 20 == 0:
120 |             save(args, result_list, not_done_list)
121 | 
122 |     # print (result_list)
123 |     save(args, result_list, not_done_list)
124 |     print ('Not successfuly text:')
125 |     print (not_done_list)


--------------------------------------------------------------------------------
/project/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | import sys
 4 | 
 5 | if __name__ == "__main__":
 6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "project.settings")
 7 |     try:
 8 |         from django.core.management import execute_from_command_line
 9 |     except ImportError:
10 |         # The above import may fail for some other reason. Ensure that the
11 |         # issue is really that Django is missing to avoid masking other
12 |         # exceptions on Python 2.
13 |         try:
14 |             import django
15 |         except ImportError:
16 |             raise ImportError(
17 |                 "Couldn't import Django. Are you sure it's installed and "
18 |                 "available on your PYTHONPATH environment variable? Did you "
19 |                 "forget to activate a virtual environment?"
20 |             )
21 |         raise
22 |     execute_from_command_line(sys.argv)
23 | 


--------------------------------------------------------------------------------
/project/project/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/project/project/__init__.py


--------------------------------------------------------------------------------
/project/project/settings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for project project.
  3 | 
  4 | Generated by 'django-admin startproject' using Django 1.11.29.
  5 | 
  6 | For more information on this file, see
  7 | https://docs.djangoproject.com/en/1.11/topics/settings/
  8 | 
  9 | For the full list of settings and their values, see
 10 | https://docs.djangoproject.com/en/1.11/ref/settings/
 11 | """
 12 | 
 13 | import os
 14 | 
 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 17 | 
 18 | 
 19 | # Quick-start development settings - unsuitable for production
 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/
 21 | 
 22 | # SECURITY WARNING: keep the secret key used in production secret!
 23 | SECRET_KEY = '5)se$(1!6n4apye2dal)f*=h3t%c7ep8e7)ikmb6%cux15tb0j'
 24 | 
 25 | # SECURITY WARNING: don't run with debug turned on in production!
 26 | DEBUG = True
 27 | 
 28 | ALLOWED_HOSTS = ["8dc63685fcb1.ngrok.io", "127.0.0.1", "localhost"]
 29 | 
 30 | 
 31 | # Application definition
 32 | 
# Only Django's bundled contrib apps are registered here; no project-specific
# app appears in this list.
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
]

# Middleware classes, in the order they are listed (CSRF protection is enabled).
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

# Dotted path of the module holding the root URL patterns.
ROOT_URLCONF = 'project.urls'

# Django template engine: looks in <BASE_DIR>/templates and, because APP_DIRS
# is True, in each installed app's own templates directory.
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [os.path.join(BASE_DIR, 'templates')]
        ,
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

# Dotted path of the WSGI callable.
WSGI_APPLICATION = 'project.wsgi.application'
 72 | 
 73 | 
 74 | # Database
 75 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases
 76 | 
# Single SQLite database stored as db.sqlite3 directly under BASE_DIR.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}


# Password validation
# https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators

# All four validators are used with their default options (no 'OPTIONS' key).
AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/1.11/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.11/howto/static-files/

# URL prefix under which static files are served.
STATIC_URL = '/static/'
# Source folder for static files, created in the Django project root. Note the
# folder is named 'statics'; it is distinct from both STATIC_URL above and
# STATIC_ROOT below.
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'statics'),
]

# Directory that holds the site's static assets for deployment, so that a
# web server such as Nginx can serve them directly.
STATIC_ROOT = os.path.join(BASE_DIR, "static/")
129 | 


--------------------------------------------------------------------------------
/project/project/urls.py:
--------------------------------------------------------------------------------
 1 | """project URL Configuration
 2 | 
 3 | The `urlpatterns` list routes URLs to views. For more information please see:
 4 |     https://docs.djangoproject.com/en/1.11/topics/http/urls/
 5 | Examples:
 6 | Function views
 7 |     1. Add an import:  from my_app import views
 8 |     2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home')
 9 | Class-based views
10 |     1. Add an import:  from other_app.views import Home
11 |     2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 |     1. Import the include() function: from django.conf.urls import url, include
14 |     2. Add a URL to urlpatterns:  url(r'^blog/', include('blog.urls'))
15 | """
16 | from django.conf.urls import url
17 | from django.contrib import admin
18 | from project import views
19 | 
# URL routes: site index, Django admin, and the text-analysis endpoint that
# statics/js/annotation.js posts to ('/analyze_text/').
urlpatterns = [
    url(r'^$', views.index, name='index'),
    url(r'^admin/', admin.site.urls),
    # url() patterns are regular expressions searched against the path, so the
    # previous unanchored 'analyze_text/' matched ANY path containing that
    # substring (e.g. 'foo/analyze_text/bar'). Anchor it to the exact endpoint.
    url(r'^analyze_text/$', views.analyze_text, name='analyze_text'),
]
25 | 


--------------------------------------------------------------------------------
/project/project/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for project project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.wsgi import get_wsgi_application
13 | 
# Default to this project's settings module; a value already present in the
# environment is left untouched (setdefault).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "project.settings")

# Module-level WSGI callable, referenced by WSGI_APPLICATION in settings.py.
application = get_wsgi_application()
17 | 


--------------------------------------------------------------------------------
/project/statics/css/index.css:
--------------------------------------------------------------------------------
  1 | .body{
  2 |     font-family: 'Roboto Mono', monospace!important
  3 | }
  4 | .container {
  5 |     width: 100%;
  6 |     height: 850px;
  7 |     margin: auto;
  8 |     position: center;
  9 |     padding: 2px;
 10 | }
 11 | 
 12 | .left {
 13 |     position: center;
 14 |     padding-top: 10%;
 15 |     width: 50%;
 16 |     height: 100%;
 17 |     float: left;
 18 |     border-right: 2px solid olivedrab;
 19 | }
 20 | 
 21 | .h2{
 22 |     font-family: 'Roboto Mono', monospace!important;
 23 |     padding-top: 2px;
 24 | }
 25 | 
 26 | .h3{
 27 |     font-family: 'Roboto Mono', monospace!important;
 28 | }
 29 | 
 30 | .introduction{
 31 |     height: 30%;
 32 |     margin: auto;
 33 |     padding: auto;
 34 |     border-bottom: 1px dashed olivedrab;
 35 | 
 36 | }
 37 | 
/* Sample-sentence area. */
.sample{
    padding-top: 10%;
    height: 20%;
    /*border-bottom: 1px dashed olivedrab;*/

}
/* Text-input area. */
.input{
    padding-top: 10px;
    height: 50%;
}


/* Right column: offset by the 50% width that .left occupies. */
.right {
  margin-left: 50%;
  height: 100%;
  /*background: black;*/
}

/* Annotation panel inside the right column. */
.annotation{
    /*padding-top: 20px;*/
    height: 48.2%;
    /*background: blueviolet;*/
    /*border-bottom: 1px dashed olivedrab;*/
    /*margin: auto;*/
}

/* Temporal-relation panel. */
.temporal{
    height: 60%;
    /*background: chocolate;*/
    margin: auto;
}

/* Intentionally empty rule. */
.node{}

/* SVG stroke styling for elements with class "link". */
.link{
    stroke: #999; stroke-opacity: .6; stroke-width: 1px;
}

.svg{
    height: 50%;
    width: 100%;
}

/* Small uppercase badge; the same declarations appear inline as
   span_default / ner_default in statics/js/annotation.js. */
.span{
    font-size: 0.8em;
    font-weight: bold;
    line-height: 1;
    border-radius: 0.35em;
    text-transform: uppercase;
    vertical-align: middle;
    margin-left: 0.5rem
}

/* Highlight box; the same declarations appear inline as mark_default in
   statics/js/annotation.js. */
.mark{
    padding: 0.45em 0.6em;
    margin: 0 0.25em;
    line-height: 1;
    border-radius: 0.35em;
    box-decoration-break: clone;
    -webkit-box-decoration-break: clone
}

/* Floating tooltip; pointer-events: none keeps it from intercepting the mouse. */
div.tooltip {
    position: absolute;
    text-align: center;
    width: auto;
    height: auto;
    padding: 2px;
    font: 12px sans-serif;
    /*background: lightsteelblue;*/
    background: #f4db4b;
    border: 0px;
    border-radius: 8px;
    pointer-events: none;
}
113 | 
/* Loading overlay. Hidden by default (display: none) and positioned relative
   to the viewport with position: fixed so that it covers the whole window.
   The background is 80% opaque white with the loading animation centered
   and not repeated. */
.modal {
    display:    none;
    position:   fixed;
    z-index:    1000;
    top:        0;
    left:       0;
    height:     100%;
    width:      100%;
    background: rgba( 255, 255, 255, .8 ) 
                url('loading.gif') 
                50% 50% 
                no-repeat;
}

/* When the body has the loading class, we turn
   the scrollbar off with overflow:hidden.
   The selector used to be `body.loading .modal`, which targeted the overlay
   itself and therefore never disabled page scrolling. */
body.loading {
    overflow: hidden;   
}

/* Anytime the body has the loading class, our
   modal element will be visible */
body.loading .modal {
    display: block;
}


--------------------------------------------------------------------------------
/project/statics/css/loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/project/statics/css/loading.gif


--------------------------------------------------------------------------------
/project/statics/js/Tracking.js:
--------------------------------------------------------------------------------
1 | // potential tracker from Google Analytics


--------------------------------------------------------------------------------
/project/statics/js/annotation.js:
--------------------------------------------------------------------------------
  1 | const intro = introJs();
  2 | intro.setOptions({
  3 |     exitOnOverlayClick:false,
  4 |     steps: [
  5 |         {intro: "Welcome to EventPlus! Let\'s take a tour!"},
  6 |         {
  7 |             element: "#btn-feature",
  8 |             intro: "Please take a look at features in EventPlus"
  9 |         },
 10 |         {
 11 |             element: "#topic",
 12 |             intro: "Select the domain of your input, EventPlus support news domain and biomedical domain!"
 13 |         },
 14 |         {
 15 |             element: "#examples",
 16 |             intro: "Select a sentence or input your sentences below."
 17 |         },
 18 |         {
 19 |             element: "#analyze-text-btn",
 20 |             intro: "Click on analyze text button!"
 21 |         },
 22 |         {
 23 |             element: "#show_annotation",
 24 |             intro: "We visualize entities in your input and their NER labels, they will be candidate arguments for your event!"
 25 |         },
 26 |         {
 27 |             element: "#displayEvents",
 28 |             intro: "Here are all events that we extracted! Please click on any of them to see their corresponding arguments!"
 29 |         },
 30 |         {
 31 |             element: "#show_annotation",
 32 |             intro: "We visualize your event and its corresponding arguments here!"
 33 |         },
 34 |         {
 35 |             element: "#graph",
 36 |             intro: "Temporal relations between events if there are any and duration of events as node labels"
 37 |         }
 38 |     ]
 39 | });
 40 | intro.start();
 41 | 
 42 | var return_value;
 43 | var labels;
 44 | var tokens;
 45 | var mark_default = "padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone";
 46 | var span_default = "font-style: italic; background: #f4db4b; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem";
 47 | var ner_default = "font-style: italic; background: #f4c2c2; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem";
 48 | var default_annotation;
 49 | String.prototype.template = function() {
 50 |     var args = arguments;
 51 |     return this.replace(/\{(\d+)\}/g,function(m,i){return args[i];});
 52 | };
 53 | 
 54 | 
 55 | $( function(){
 56 |     $( document ).on('click', '#analyze-text-btn', function(e){
 57 |         var input_text = $('#analyze-text-input').val().toString();
 58 |         console.log('input_text:', input_text);
 59 |         var topic = $('#topic').val().toString();
 60 |         ajaxAnalyzeText(input_text, topic);
 61 |     });
 62 | 
 63 | });
 64 | 
 65 | function ajaxAnalyzeText (input_text, topic){
 66 |     console.log("ajax start");
 67 |     $body = $("body");
 68 |     $body.addClass("loading");
 69 |     $.ajax({
 70 |         url: '/analyze_text/',
 71 |         type: 'post',
 72 |         async: true,
 73 |         dataType: 'json',
 74 |         data: {
 75 |             text: input_text,
 76 |             domain: topic,
 77 |         },
 78 |         beforeSend: function (xhr, settings) {
 79 |             if (!csrfSafeMethod(settings.type) && !this.crossDomain) {
 80 |                 xhr.setRequestHeader("X-CSRFToken", csrftoken);
 81 |             }
 82 |         },
 83 |         success: function (data) {
 84 |             console.log("ajax end");
 85 |             $body.removeClass("loading");
 86 |             return_value = data;
 87 |             $("#graph").empty();
 88 |             draw_graph(return_value);
 89 |             tokens = return_value.tokens;
 90 |             labels = return_value.labels;
 91 |             var annotation = "";
 92 |             var event_display = '<label>events:</label>';
 93 |             // what if we delete the maintained visitedList to get rid of the nested problem.
 94 |             // var visitedList = [];
 95 |             // find all start indexes for ners and all start indexes for triggers
 96 |             var ner = [];
 97 |             var triggers = new Set();
 98 |             for (i = 0; i < labels.length; i++) {
 99 |                 if (labels[i].role === "trigger") {
100 |                     triggers.add(labels[i].start)
101 |                 }
102 |                 if ("ner" in labels[i]) {
103 |                     ner[labels[i].start] = labels[i]
104 |                 }
105 |             }
106 |             var visitListNer = [];
107 |             for (let i = 0; i < tokens.length; i++) {
108 |                 // console.log(i.toString() + ": " + visitedList);
109 |                 // if (visitedList.includes(i)) {
110 |                 //     continue;
111 |                 // }
112 |                 if (triggers.has(i)) {
113 |                     labels.forEach(function (item, index) {
114 |                         if ((item.start === i) && (item.role === "trigger")) {
115 |                             var text = " ";
116 |                             for (index = i; index < item.end; index++) {
117 |                                 // this is correct
118 |                                 var visitedList = [];
119 |                                 if (visitedList.includes(index)) {
120 |                                     continue;
121 |                                 }
122 |                                 text += tokens.slice(index, index + 1);
123 |                                 text += " ";
124 |                                 visitedList.push(index);
125 |                             }
126 |                             var mark_style = "background:" + item.color + ";" + mark_default;
127 |                             event_display += "<span style='{0}; cursor: pointer' eventId='{1}' onclick='event_click(this)'>{2}</span>".template(mark_style, item.event, text);
128 |                         }
129 |                     });
130 |                 }
131 | 
132 |                 if (i in ner) {
133 |                     var label = ner[i]
134 |                     var text = " ";
135 |                     for (index = i; index < label.end; index++) {
136 |                         if(visitListNer.includes(index)) {
137 |                             continue;
138 |                         }
139 |                         text += tokens.slice(index, index + 1);
140 |                         text += " ";
141 |                         visitListNer.push(index)
142 |                     }
143 |                     ner_label = label.ner;
144 |                     annotation += "<mark style='text-decoration-line: underline; text-decoration-style: wavy;'>{0}</mark><span style='{1}' position>{2}</span></mark>&nbsp;".template(text, ner_default, ner_label)
145 |                 }
146 |                 if (!(i in ner) && !(i in triggers) && !(visitListNer.includes(i))) {
147 |                     annotation += tokens[i];
148 |                     annotation += " ";
149 |                 }
150 |             }
151 |            $("#show_annotation").html(annotation)
152 |             // give a default annotation as the annotation when it first loads
153 |             default_annotation = annotation;
154 |             $("#displayEvents").html(event_display)
155 |         }
156 |     });
157 | }
158 | 
/**
 * Inline onclick handler for an event span (`onclick='event_click(this)'`).
 * Alternates between showing the default annotation and plotting only the
 * clicked event, based on a boolean "clicks" flag that is flipped per call.
 *
 * NOTE(review): the flag is stored on $(this), not on $(obj). When invoked
 * from the inline handler, `this` inside this function is not the clicked
 * element, so the toggle state appears to be shared by all events and the
 * first click falls into the "show default" branch. Confirm the intended
 * behavior before changing it.
 *
 * @param {Element} obj  the clicked element carrying the eventId attribute
 */
function event_click(obj) {
    var previouslyToggled = $(this).data('clicks');
    if (!previouslyToggled) {
        // flag unset/false: restore the annotation rendered on load
        $("#show_annotation").html(default_annotation)
    } else {
        // flag true: show only the clicked event and its arguments
        onlyPlotThis(obj.getAttribute("eventId"));
    }
    $(this).data("clicks", !previouslyToggled);
}
173 | 
174 | function onlyPlotThis(eventId) {
175 | 
176 |     var annotation = "";
177 |     var visitedList = [];
178 |     var event_display = "";
179 |     var this_event = {};
180 |     // find everything corresponding to this event and mark it as a dictionary
181 |     labels.forEach(function (item, index) {
182 |         if (item.event == eventId) {
183 |                 this_event[item.start] = item
184 |             }
185 |         });
186 | 
187 |     for (let i=0; i < tokens.length; i++) {
188 |         // console.log(i.toString() + ": " + visitedList);
189 |          if (visitedList.includes(i)) {
190 |             continue;
191 |         }
192 |         if (i in this_event && !(visitedList.includes(i))) {
193 |             var label = this_event[i];
194 |             var text = " ";
195 |             for(index = i; index < label.end; index++) {
196 |                 text += tokens.slice(index, index + 1);
197 |                 text += " ";
198 |                 visitedList.push(index);
199 |             }
200 |             if ("event" in label && label.event == eventId) {
201 |                   if (label.role === "trigger") {
202 |                     // this labeled item is a trigger
203 |                     var mark_style = "background:" + label.color + ";" + mark_default;
204 |                     annotation += "<mark style='{0}'>{1}<span style='{2}'>{3}</span></mark>".template(mark_style, text, span_default, label.label);
205 |                     event_display += "<span style='{0}; cursor: pointer' eventId='{1}' onclick='event_click(this)'>{2}</span>".template(mark_style, label.event, text);
206 |                 } else if (label.role === "argument") {
207 |                       // this label item is an argument
208 |                       var mark_style = "background:" + label.color + ";" + mark_default;
209 |                       if("ner" in label) {
210 |                         mark_style += "text-decoration-line: underline;";
211 |                         ner_label = label.ner;
212 |                         annotation += "<mark style='{0}'>{1}<span style='{2}'>{3}</span><span style='{4}'>{5}</span></mark>".template(mark_style, text, span_default, label.label, ner_default, ner_label);
213 |                       } else {
214 |                           annotation += "<mark style='{0}'>{1}<span style='{2}'>{3}</span></mark>".template(mark_style, text, span_default, label.label);
215 |                       }
216 |                 }
217 |             } else {
218 |                 annotation += text;
219 |                 annotation += " ";
220 |             }
221 |         } else {
222 |             annotation += tokens[i];
223 |             annotation += " ";
224 |         }
225 |     }
226 |     $("#show_annotation").html(annotation)
227 | 
228 | }
229 | 
230 | // var labels = return_value.labels
231 | 
232 | // for(let i = 0; i < cherry_obj.tokens.length; i++){
233 | //
234 | // }
235 | //
236 | // for(let i = 0; i < cherry_obj.events.length; i++) {
237 | //     var mark_style = "background:" + "#42a4f0;" + mark_default
238 | //     $("#displayEvents").html("<mark style='{0}'>{1}</mark>".template(mark_style, cherry_obj.events[i].triggers.text))
239 | // }
240 | 
241 | 
242 | 


--------------------------------------------------------------------------------
/project/statics/js/main.js:
--------------------------------------------------------------------------------
 1 | 
 2 | // Sample sentences offered in the "examples" select box, one array per domain
 3 | var news_array = [
 4 |     "The other customers fled, and the police said it did not appear that anyone else was injured",
 5 |     "A powerful ice storm continues to maintain its grip. Yesterday New York governor George Pataki toured five counties that have been declared under a state of emergency",
 6 |     "Despite the recent possibility of military conflict with Iraq, oil prices have been falling, that's because of a worldwide glut of oil"
 7 | ];
 8 | 
 9 | var bio_array = [
10 |     "We have found that the HTLV-1 transactivator protein, tax, acts as a costimulatory signal for GM-CSF and IL-2 gene transcription, in that it can cooperate with TCR signals to mediate high level gene expression.",
11 |     "We show that ligand-induced homodimerization of chimeric surface receptors consisting of the extracellular and transmembrane domains of the erythropoietin receptor and of the intracellular domain of IL-4Ralpha induces Janus kinase 1 (Jak1) activation, STAT6 activation, and Cepsilon germline transcripts in human B cell line BJAB."
12 | ];
13 | 
14 | String.prototype.template = function() {
15 | var args = arguments;
16 | return this.replace(/\{(\d+)\}/g,function(m,i){return args[i];});
17 | };
18 | 
19 | function htmlEntities(str) {
20 |     return String(str).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39');
21 | }
22 | 
23 | function select_topic() {
24 |    var val = document.getElementById("topic").value;
25 |    var btnelem = document.getElementById("analyze-text-btn");
26 | 
27 |    if (val == "news") {
28 |        btnelem.disabled = false;
29 |        console.log(btnelem.disabled);
30 |         var news = "";
31 |         news += "<option value=''>---</option>";
32 |         for(let i = 0; i < news_array.length; i++) {
33 |             news += "<option value='{0}'>".template(htmlEntities(news_array[i])) + htmlEntities(news_array[i]).substring(0, 30) + "..." + "</option>"
34 |         }
35 |         // console.log("news", news)
36 |         $("#examples").html(news);
37 |     }
38 |     else if (val == "bio") {
39 |        btnelem.disabled = false;
40 |        var bio = "";
41 |        bio += "<option value=''>---</option>";
42 |         for(let i = 0; i < bio_array.length; i++) {
43 |             bio += "<option value='{0}'>".template(htmlEntities(bio_array[i])) + htmlEntities(bio_array[i]).substring(0, 30) + "..." + "</option>"
44 |         }
45 |         $("#examples").html(bio);
46 |     }
47 | }
48 | 
49 | function give_examples() {
50 |     var e2 = document.getElementById("examples");
51 |     document.getElementById("analyze-text-input").value = e2.value
52 | }
53 | 


--------------------------------------------------------------------------------
/project/statics/js/security.js:
--------------------------------------------------------------------------------
 1 | function csrfSafeMethod(method)
 2 | {
 3 | 	    // these HTTP methods do not require CSRF protection
 4 | 	    return (/^(GET|HEAD|OPTIONS|TRACE)$/.test(method));
 5 | }
 6 | 
 7 | function getCookie(name) {
 8 |     var cookieValue = null;
 9 |     if (document.cookie && document.cookie !== '') {
10 |         var cookies = document.cookie.split(';');
11 |         for (var i = 0; i < cookies.length; i++) {
12 |             var cookie = jQuery.trim(cookies[i]);
13 |             // Does this cookie string begin with the name we want?
14 |             if (cookie.substring(0, name.length + 1) === (name + '=')) {
15 |                 cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
16 |                 break;
17 |             }
18 |         }
19 |     }
20 |     return cookieValue;
21 | }
22 | 
23 | var csrftoken = getCookie('csrftoken');


--------------------------------------------------------------------------------
/project/statics/js/temporal.js:
--------------------------------------------------------------------------------
  1 | function draw_graph(return_value) {
  2 |     console.log(return_value.graph);
  3 |     var json_obj = return_value.graph;
  4 | 
  5 |     var svg = d3.select("svg"),
  6 |         width = +svg.attr("width"),
  7 |         height = +svg.attr("height"),
  8 |         node,
  9 |         link;
 10 | 
 11 |     svg.append('defs').append('marker')
 12 |         .attrs({'id':'arrowhead',
 13 |             'viewBox':'-0 -5 10 10',
 14 |             'refX':13,
 15 |             'refY':0,
 16 |             'orient':'auto',
 17 |             'markerWidth':13,
 18 |             'markerHeight':13,
 19 |             'xoverflow':'visible'})
 20 |         .append('svg:path')
 21 |         .attr('d', 'M 0,-5 L 10 ,0 L 0,5')
 22 |         .attr('fill', '#999')
 23 |         .style('stroke','none');
 24 | 
 25 |     var simulation = d3.forceSimulation()
 26 |         .force("link", d3.forceLink().id(function (d) {return d.id;}).distance(100).strength(1))
 27 |         .force("charge", d3.forceManyBody())
 28 |         .force("center", d3.forceCenter(width / 2, height / 2));
 29 | 
 30 |     // d3.json(json_obj, function (error, graph) {
 31 |     //     if (error) throw error;
 32 |     //     update(graph.links, graph.nodes);
 33 |     // })
 34 |     update(json_obj.links, json_obj.nodes)
 35 | 
 36 |     // Define the div for the tooltip
 37 |     var div = d3.select("body").append("div")
 38 |         .attr("class", "tooltip")
 39 |         .style("opacity", 0);
 40 | 
 41 | 
 42 |     function update(links, nodes) {
 43 |         link = svg.selectAll(".link")
 44 |             .data(links)
 45 |             .enter()
 46 |             .append("line")
 47 |             .attr("class", "link")
 48 |             .attr('marker-end','url(#arrowhead)')
 49 | 
 50 |         link.append("title")
 51 |             .text(function (d) {return d.type;});
 52 | 
 53 |         edgepaths = svg.selectAll(".edgepath")
 54 |             .data(links)
 55 |             .enter()
 56 |             .append('path')
 57 |             .attrs({
 58 |                 'class': 'edgepath',
 59 |                 'fill-opacity': 0,
 60 |                 'stroke-opacity': 0,
 61 |                 'id': function (d, i) {return 'edgepath' + i}
 62 |             })
 63 |             .style("pointer-events", "none");
 64 | 
 65 |         edgelabels = svg.selectAll(".edgelabel")
 66 |             .data(links)
 67 |             .enter()
 68 |             .append('text')
 69 |             .style("pointer-events", "none")
 70 |             .attrs({
 71 |                 'class': 'edgelabel',
 72 |                 'id': function (d, i) {return 'edgelabel' + i},
 73 |                 'font-size': 10,
 74 |                 'fill': '#aaa'
 75 |             });
 76 | 
 77 |         edgelabels.append('textPath')
 78 |             .attr('xlink:href', function (d, i) {return '#edgepath' + i})
 79 |             .style("text-anchor", "middle")
 80 |             .style("pointer-events", "none")
 81 |             .attr("startOffset", "50%")
 82 |             .text(function (d) {return d.type});
 83 | 
 84 |         node = svg.selectAll(".node")
 85 |             .data(nodes)
 86 |             .enter()
 87 |             .append("g")
 88 |             .attr("class", "node")
 89 |             .call(d3.drag()
 90 |                     .on("start", dragstarted)
 91 |                     .on("drag", dragged)
 92 |                     //.on("end", dragended)
 93 |             );
 94 | 
 95 |         node.append("circle")
 96 |             .attr("r", 12)
 97 |             // .style("fill", function (d, i) {return colors(i);})
 98 |             .style("fill", function (d) {return d.color;})
 99 |             .on("mouseover", function(d) {
100 |                 div.transition()
101 |                     .duration(200)
102 |                     .style("opacity", .9);
103 |                 div	.html(d.type )
104 |                     .style("left", (d3.event.pageX) + "px")
105 |                     .style("top", (d3.event.pageY - 28) + "px");
106 |             })
107 |             .on("mouseout", function(d) {
108 |                 div.transition()
109 |                     .duration(500)
110 |                     .style("opacity", 0);
111 |             })
112 | 
113 |         // node.append("title")
114 |         //     .text(function (d) {return d.id;});
115 | 
116 |         node.append("text")
117 |             .attr("dy", -1)
118 |             .text(function (d) {return d.name+":"+d.label;});
119 | 
120 |         simulation
121 |             .nodes(nodes)
122 |             .on("tick", ticked);
123 | 
124 |         simulation.force("link")
125 |             .links(links);
126 |     }
127 | 
128 |     function ticked() {
129 |         link
130 |             .attr("x1", function (d) {return d.source.x;})
131 |             .attr("y1", function (d) {return d.source.y;})
132 |             .attr("x2", function (d) {return d.target.x;})
133 |             .attr("y2", function (d) {return d.target.y;});
134 | 
135 |         node
136 |             .attr("transform", function (d) {return "translate(" + d.x + ", " + d.y + ")";});
137 | 
138 |         edgepaths.attr('d', function (d) {
139 |             return 'M ' + d.source.x + ' ' + d.source.y + ' L ' + d.target.x + ' ' + d.target.y;
140 |         });
141 | 
142 |         edgelabels.attr('transform', function (d) {
143 |             if (d.target.x < d.source.x) {
144 |                 var bbox = this.getBBox();
145 | 
146 |                 rx = bbox.x + bbox.width / 2;
147 |                 ry = bbox.y + bbox.height / 2;
148 |                 return 'rotate(180 ' + rx + ' ' + ry + ')';
149 |             }
150 |             else {
151 |                 return 'rotate(0)';
152 |             }
153 |         });
154 |     }
155 | 
156 |     function dragstarted(d) {
157 |         if (!d3.event.active) simulation.alphaTarget(0.3).restart()
158 |         d.fx = d.x;
159 |         d.fy = d.y;
160 |     }
161 | 
162 |     function dragged(d) {
163 |         d.fx = d3.event.x;
164 |         d.fy = d3.event.y;
165 |     }
166 | 
167 | 
168 | }
169 | 
170 | json_obj = {
171 |   "nodes": [
172 |     {
173 |       "name": "Peter",
174 |       "label": "Person",
175 |       "id": 1
176 |     },
177 |     {
178 |       "name": "Michael",
179 |       "label": "Person",
180 |       "id": 2
181 |     },
182 |     {
183 |       "name": "Neo4j",
184 |       "label": "Database",
185 |       "id": 3
186 |     },
187 |     {
188 |       "name": "Graph Database",
189 |       "label": "Database",
190 |       "id": 4
191 |     }
192 |   ],
193 |   "links": [
194 |     {
195 |       "source": 1,
196 |       "target": 2,
197 |       "type": "KNOWS",
198 |     },
199 |     {
200 |       "source": 1,
201 |       "target": 3,
202 |       "type": "FOUNDED"
203 |     },
204 |     {
205 |       "source": 2,
206 |       "target": 3,
207 |       "type": "WORKS_ON"
208 |     },
209 |     {
210 |       "source": 3,
211 |       "target": 4,
212 |       "type": "IS_A"
213 |     }
214 |   ]
215 | }
216 | 
217 | 


--------------------------------------------------------------------------------
/project/templates/index.html:
--------------------------------------------------------------------------------
  1 | {% load static %}
  2 | <!DOCTYPE html>
  3 | <html lang="en">
  4 | <head>
  5 |     <meta charset="UTF-8">
  6 |     <link rel="stylesheet" href="{% static 'css/index.css' %}">
  7 |     <title>Event Detection Pipeline</title>
  8 |     <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  9 |     <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
 10 |     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
 11 |     <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/intro.js/2.9.3/introjs.min.css">
 12 |     <link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/intro.js@2.9.3/themes/introjs-modern.min.css">
 13 |     <script src="https://cdnjs.cloudflare.com/ajax/libs/intro.js/2.9.3/intro.min.js"></script>
 14 |     <script src="https://cdnjs.cloudflare.com/ajax/libs/intro.js/2.9.3/intro.js"></script>
 15 |     <script src="https://d3js.org/d3.v4.min.js" type="text/javascript"></script>
 16 |     <script src="https://d3js.org/d3-selection-multi.v1.js"></script>
 17 |     <script type="text/javascript" src="{% static 'js/security.js' %}"></script>
 18 |     <script type="text/javascript" src="{% static 'js/Tracking.js' %}"></script>
 19 |     <script type="text/javascript" src="{% static 'js/main.js' %}"></script>
 20 | </head>
 21 | <body>
 22 |         <nav class="navbar navbar-inverse">
 23 |             <div class="container-fluid">
 24 |                 <div class="navbar-header">
 25 |                     <a class="navbar-brand" href="#" id="header" data-intro="read this first">EventPlus</a>
 26 |                 </div>
 27 |                 <ul class="nav navbar-nav navbar-right">
 28 |                    <li class="nav-item active">
 29 |                     <a id="btn-feature" class="hoverable nav-link" data-toggle="modal" data-target="#feature-modal">Feature & Task Help</a>
 30 |                 </li>
 31 | {#              <li><a id="intro" title="powered by PLUS Lab"><span class="glyphicon glyphicon-info-sign"></span>&nbsp Intro</a></li>#}
 32 |                 </ul>
 33 |             </div>
 34 |             <div class="collapse navbar-collapse" id="navbarSupportedContent">
 35 |             </div>
 36 |         </nav>
 37 |     <div class="container">
 38 |           <div class="left">
 39 | 
 40 |               <div class="introduction">
 41 |                 <h2>Description</h2>
 42 |                   <br>
 43 |                  Powered by state-of-the-art event-related knowledge extraction models, EventPlus extracts and integrates <strong>event triggers</strong>, <strong>corresponding arguments and roles</strong>, <strong>event duration</strong>, <strong>temporal relations</strong> between events, and more.
 44 |                  Please click on the "Feature &amp; Task Help" button at the top right to learn how to interpret the results!
 45 | 
 46 |               </div>
 47 | 
 48 |               <div class="sample">
 49 |                   <div class="row" style="position: center">
 50 |                       <div class="col-sm-3">
 51 |                        <form>
 52 |                           <label for="topic">Domain:</label>
 53 |                           <select name="topic" id="topic" onchange="select_topic()">
 54 |                               <option value="">---</option>
 55 |                                <option value="news">news</option>
 56 |                                <option value="bio">bio</option>
 57 |                           </select>
 58 |                           <br><br>
 59 |                         </form>
 60 |                       </div>
 61 | 
 62 |                       <div class="col-sm-7">
 63 |                           <div class="dropdown">
 64 |                               <form >
 65 |                                   <label for="examples">Enter text or:</label>
 66 |                                   <select name="examples" id="examples" onchange="give_examples()">
 67 |                                   </select>
 68 |                                   <br><br>
 69 |                               </form>
 70 |                           </div>
 71 |                       </div>
 72 |                   </div>
 73 |               </div>
 74 | 
 75 |               <div class="input">
 76 |                   <h3> Text Input</h3>
 77 |                    <div class="form-group">
 78 |                     <label for="analyze-text-input"></label>
 79 |                     <textarea class="form-control" id="analyze-text-input" rows="5" maxlength="500"></textarea>
 80 |                   </div>
 81 |                     <br>
 82 |                   <div class="col-md-10 text-center">
 83 |                    <button id="analyze-text-btn" type="button" class="btn btn-primary" disabled>Analyze Text</button>
 84 |                  </div>
 85 |               </div>
 86 |           </div>
 87 | 
 88 |           <div class="right">
 89 |               <div class="annotation" style="height: 50%">
 90 |                   <h3>&nbsp;&nbsp; Annotation</h3>
 91 |                   <br>
 92 |                   <div style="height: 90%">
 93 |                       <div class="panel" style="height: 73%;line-height: 2.5; overflow-y: scroll; border-bottom: 1px dashed hotpink;" id="show_annotation"></div>
 94 |                       <div class="list" style="padding-top: 0; height: 20%; line-height: 2.5; overflow-y: scroll; border-bottom: 1px dashed olivedrab;" id="displayEvents"></div>
 95 |                   </div>
 96 |               </div>
 97 | 
 98 |               <div class="temporal" style="height: 40%">
 99 |                   <h3>&nbsp;&nbsp; Temporal Relation</h3>
100 |                   <div><svg width="570" height="300" id="graph"></svg></div>
101 |               </div>
102 |           </div>
103 |     </div>
104 | 
105 |     <!-- Feature Modal -->
106 |     <div class="modal fade" id="feature-modal" tabindex="-1" role="dialog" aria-labelledby="feature-modal-title" aria-hidden="true">
107 |         <div class="modal-dialog modal-dialog-centered modal-lg" role="document">
108 |             <div class="modal-content">
109 |             <div class="modal-header">
110 |                 <h4 class="modal-title" id="feature-modal-title">Features</h4>
111 |                 <button type="button" class="close" data-dismiss="modal" aria-label="Close">
112 |                 <span aria-hidden="true">&times;</span>
113 |                 </button>
114 |             </div>
115 | 
116 |             <div class="modal-body">
117 |                 <ul>
118 |                     <h4>What do we extract?</h4>
119 |                     <p> When you input your text and click on the <button class="btn-group btn-group-sm" style="background-color: #2d6fae">Analyze Text</button> button, our system will automatically extract </p>
120 |                     <ul>
121 |                         <li style="padding: 10px;">All events and their corresponding arguments</li>
122 |                         <li style="padding: 10px;">The duration of detected events</li>
123 |                         <li style="padding: 10px;">The temporal relation among detected events</li>
124 |                     </ul>
125 |                     <hr>
126 |                     <h4>Notation Explanation</h4>
127 |                     <p>The <strong>Annotation</strong> panel contains the following</p>
128 |                     <table class="table">
129 |                             <thead>
130 |                                 <tr><th>Style</th><th>Explanation</th></tr>
131 |                             </thead>
132 |                             <tbody>
133 |                                 <tr><th><mark style='text-decoration-line: underline; text-decoration-style: wavy;'>entity</mark></th><td>Detected candidate entities from NER</td></tr>
134 |                                 <tr><th><span style="font-style: italic; background: #f4c2c2; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">NER label</span></th><td>The NER label for entities</td></tr>
135 |                                 <tr><th><span style="background-image: linear-gradient(to right,#fb8072, #fdb462, #ffffb3, #b3de69, #8dd3c7, #80b1d3, #bebada); opacity: 100%; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone"><strong>Events and Arguments</strong></span></th><td>Detected Events and their corresponding arguments will be marked as the same color.</td></tr>
136 |                                 <tr><th><span style="font-style: italic; background: #f4db4b; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">label</span></th><td>Label for events and their arguments</td></tr>
137 |                             </tbody>
138 |                     </table>
139 | {#                    <hr>#}
140 | {#                    <h4>Sentence-level Feedback</h4>#}
141 | {#                        The colour of the sentences shows LinggleWrite’s opinion of the quality of each sentence. There are three types of quality.#}
142 | {#                        <div class="p-2">#}
143 | {#                            <div class="sen-good">This seems to be a good sentence.</div>#}
144 | {#                            <div class="sen-notok">This sentence could maybe be improved.</div>#}
145 | {#                            <div class="sen-bad">There are some problems in this sentence.</div>#}
146 | {#                        </div>#}
147 | {#                    <hr>#}
148 | {#                    <h4>Grammatical Error Detection</h4>#}
149 | {#                    <p>When you submit your essay or sentence, LinggleWrite will identify potential errors in sentences. There are three types of error given by LinggleWrite :<p>#}
150 | {#                    <ul>#}
151 | {#                        <li><span class="B-II text-nowrap">Insert Word</span> You should add some word in here.</li>#}
152 | {#                        <ul><li style="padding: 10px;">I am good <span class="B-II">Insert Word </span> this sport.</li></ul>#}
153 | {#                        <li><span class="B-D text-nowrap">Delete Word </span> You should delete this word.</li>#}
154 | {#                        <ul><li style="padding: 10px;">We dicuss <span class="B-D">about</span> this issue.</li></ul>#}
155 | {#                        <li><span class="B-R text-nowrap">Replace Word </span> You should replace this word.</li>#}
156 | {#                        <ul><li style="padding: 10px;">I finish school <span class="B-R">on</span> June.</ul>#}
157 | {#                    </ul>#}
158 | {#                    <hr>#}
159 |                 </ul>
160 |             </div>
161 | 
162 |             <div class="modal-footer">
163 |                 <button type="button" class="btn btn-secondary" data-dismiss="modal">Close</button>
164 |             </div>
165 |             </div>
166 |         </div>
167 |     </div>
168 | 
169 |     <div class="modal"><!-- Place at bottom of page --></div>
170 | 
171 | 
172 |     <footer class="container-fluid text-center" style="background-color: darkseagreen;" >
173 |             <p style="font-family: 'Roboto Mono', monospace; opacity: 1">
174 |                 <br>
175 |                 <strong>USC ISI all rights reserved. Event Detection Pipeline</strong>
176 |             </p>
177 |     </footer>
178 | 
179 |     <script type="text/javascript" src="{% static 'js/annotation.js' %}"></script>
180 | </body>
181 | <script type="text/javascript" src="{% static 'js/temporal.js' %}"></script>
182 | </html>


--------------------------------------------------------------------------------