├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── component ├── BETTER │ ├── README.md │ └── joint │ │ ├── .gitignore │ │ ├── CRF_util.py │ │ ├── EventPipeline.py │ │ ├── JsonBuilder.py │ │ ├── README.md │ │ ├── all_liz │ │ ├── BETTER_pos2idx.pickle │ │ └── pos_emb.npy │ │ ├── dataset.py │ │ ├── eval.py │ │ ├── event_pipeline_demo.py │ │ ├── generate_data │ │ ├── all_uw.comb.pkl │ │ ├── contextualized_features.py │ │ ├── contextualized_features_bert.py │ │ ├── json_to_pkl_newformat.py │ │ ├── util.py │ │ └── uw_json_to_pkl_ace.py │ │ ├── main.py │ │ ├── main_biaffine.py │ │ ├── neural_model.py │ │ ├── out_pkl_to_json_eval.py │ │ ├── requirements.txt │ │ ├── saved_args.json │ │ ├── score.py │ │ ├── split_event.py │ │ ├── train.py │ │ ├── train_biaffine.py │ │ ├── train_ssvm.py │ │ └── util.py ├── Duration │ ├── .gitignore │ ├── Mu_test_data │ │ ├── dev_ace.pred.json │ │ ├── dev_tbd.pred.json │ │ ├── test_ace.pred.json │ │ ├── test_tbd.pred.json │ │ └── train_tbd.pred.json │ ├── README.md │ ├── UDS_T_data │ │ ├── first10.tsv │ │ ├── first10_preprocessed.jsonl │ │ └── time_eng_ud_v1.2_2015_10_30.tsv │ ├── inference_api.py │ ├── input_data │ │ ├── sample_document.txt │ │ └── sample_fig2.txt │ ├── input_data_conllu │ │ ├── sample_document.txt.output │ │ └── sample_fig2.txt.output │ ├── main.py │ ├── main_new.py │ ├── model_ckpt │ │ └── model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth │ ├── predictions │ │ ├── .~lock.sample_document.txt.output_timeline.csv# │ │ ├── README_predictions.txt │ │ ├── sample_document.txt.output_predictions.csv │ │ └── sample_document.txt.output_timeline.csv │ ├── predictions_new │ │ ├── sample_document.txt.output_predictions.csv │ │ ├── sample_document.txt.output_timeline.csv │ │ ├── sample_fig2.txt.output_predictions.csv │ │ └── sample_fig2.txt.output_timeline.csv │ ├── preprocess.py │ ├── preprocess_udst.py │ ├── readme_eval.txt │ ├── requirements.txt │ ├── run_jupyter.sh │ ├── scripts │ │ ├── __init__.py │ │ ├── 
debugging-2.ipynb │ │ ├── debugging.ipynb │ │ ├── elmo_files │ │ │ └── elmo_2x4096_512_2048cnn_2xhighway_options.json │ │ ├── run_document_timeline.bash │ │ ├── run_input_data.sh │ │ ├── run_model.py │ │ ├── src │ │ │ └── factslab │ │ │ │ └── factslab │ │ │ │ ├── __init__.py │ │ │ │ └── pytorch │ │ │ │ ├── __init__.py │ │ │ │ ├── childsumtreelstm.py │ │ │ │ ├── mlpregression.py │ │ │ │ ├── rnnregression.py │ │ │ │ ├── roberta_extract.py │ │ │ │ ├── simplemlpregression.py │ │ │ │ ├── temporalmodule.py │ │ │ │ └── transformer_regression.py │ │ ├── timelinemodule.py │ │ └── utils.py │ └── utils_duration.py ├── NegationDetection │ ├── .gitignore │ ├── README.md │ └── train.py ├── REST_service │ └── main.py ├── TempRel │ ├── .gitignore │ ├── code │ │ ├── joint_model.py │ │ └── run.sh │ ├── conda_env.txt │ └── other │ │ └── pos_tags.txt └── component_envs │ ├── env_temprel.yml │ ├── req_better.txt │ └── req_biomed.txt ├── env.yml ├── env_coref.yml └── project ├── APIs ├── coref.py ├── coref_pre.py ├── main.py ├── test_on_ace_data.py └── test_on_raw_text.py ├── manage.py ├── project ├── __init__.py ├── settings.py ├── urls.py ├── views.py └── wsgi.py ├── statics ├── css │ ├── index.css │ └── loading.gif └── js │ ├── Tracking.js │ ├── annotation.js │ ├── index.js │ ├── main.js │ ├── security.js │ └── temporal.js └── templates └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specific 2 | component/EventDemo 3 | project/static 4 | project/tmp 5 | project/bert* 6 | project/xlnet* 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # IDE 17 | .idea 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | pip-wheel-metadata/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | 
.installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "component/BioMedEventEx"] 2 | path = component/BioMedEventEx 3 | url = https://github.com/PlusLabNLP/GEANet-BioMed-Event-Extraction.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [NAACL'21] EventPlus: A Temporal Event Understanding Pipeline 2 | 3 | This is the codebase for the system demo EventPlus: A Temporal Event Understanding Pipeline in NAACL 2021. 4 | 5 | Please refer to our paper for details. 
[[PDF]](https://www.aclweb.org/anthology/2021.naacl-demos.7.pdf) [[Talk]](https://youtu.be/KPXpKeVIuag) [[Demo]](https://kairos-event.isi.edu/) 6 | 7 | ## Quick Start 8 | 9 | 0 - Clone the codebase with all submodules 10 | 11 | ``` 12 | git clone --recurse-submodules https://github.com/PlusLabNLP/EventPlus.git 13 | # or use following commands 14 | git clone https://github.com/PlusLabNLP/EventPlus.git 15 | git submodule init 16 | git submodule update 17 | ``` 18 | 19 | 1 - Environment Installation 20 | 21 | Change prefix (last line) of `env.yml` to fit your path, then run 22 | 23 | ``` 24 | conda env create -f env.yml 25 | conda activate event-pipeline 26 | pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz 27 | python -m spacy download en_core_web_sm 28 | pip install git+https://github.com/hltcoe/PredPatt.git 29 | ``` 30 | 31 | 2 - Download trained model for components 32 | 33 | For `component/BETTER` module, download the trained model [[Link]](https://drive.google.com/file/d/19_W6azeG5KRQxLDICswqwIFX0QOjxh_L/view?usp=sharing), unzip and place it under `component/BETTER/joint/worked_model_ace`. 34 | 35 | For `component/TempRel` module, download the trained model [[Link]](https://drive.google.com/file/d/1vyeAqtDmBp98NCuEMCFvrnJ8oBuNuMr3/view?usp=sharing), unzip and place it under `component/TempRel/models`. 36 | 37 | For `component/Duration` module, download `scripts` zip file [[Link]](https://drive.google.com/file/d/1s1uLcQjjFdfcto3BZ3aRi8pPzLf9KELe/view?usp=sharing), unzip and place it under `component/Duration/scripts`. 
38 | 39 | For `component/NegationDetection` module, download the trained model [[Link]](https://drive.google.com/file/d/1FLAHrWy3eF23Kb7Ql4k_f1a5lCQ5m1L0/view?usp=sharing), unzip and place it under `component/NegationDetection/models`. 40 | 41 | 3 - In background: Run REST API for event duration detection module for faster processing 42 | ``` 43 | (optional) tmux new -s duration_rest_api 44 | conda activate event-pipeline 45 | cd component/REST_service 46 | python main.py 47 | (optional) exit tmux window 48 | ``` 49 | 50 | 4 - Application 1: Raw Text Annotation. The input is a multi-line raw text file, and the output pickle and JSON files will be saved to the designated paths. 51 | ``` 52 | cd YOUR_PREFERRED_PATH/project 53 | python APIs/test_on_raw_text.py -data YOUR_RAW_TEXT_FILE -save_path SAVE_PICKLE_PATH -save_path_json SAVE_JSON_PATH -negation_detection 54 | ``` 55 | 56 | 5 - Application 2: Web App for Interaction and Visualization. A web app will be started; the user can input a piece of text and get the annotation results and a visualization. 57 | ``` 58 | cd YOUR_PREFERRED_PATH/project 59 | tmux new -s serve 60 | python manage.py runserver 8080 61 | ``` 62 | 63 | ## Components 64 | 65 | The code for data processing and incorporating different components is in `project/APIs/main.py`. Please refer to the README file of each component for more details about training and inference. 
66 | 67 | 1- Event Extraction on ACE Ontology: `component/BETTER` 68 | 69 | 2- Joint Event Trigger and Temporal Relation Extraction: `component/TempRel` for inference, [this codebase](https://github.com/rujunhan/EMNLP-2019) for training 70 | 71 | 3- Event Duration Detection: `component/Duration` 72 | 73 | 4- Negation and Speculation Cue Detection and Scope Resolution: `component/NegationDetection` 74 | 75 | 5- Biomedical Event Extraction: `component/BioMedEventEx` for inference, [this codebase](https://github.com/PlusLabNLP/GEANet-BioMed-Event-Extraction) for training 76 | 77 | ## Quick Start with ISI shared NAS 78 | 79 | If you are using the system on a machine with access to ISI shared NAS, you could directly activate environment and copy the code and start using it right away! 80 | 81 | ``` 82 | # 1 - Environment Installation: Activate existing environment 83 | conda activate /nas/home/mingyuma/miniconda3/envs/event-pipeline-dev 84 | 85 | # 2 - Prepare Components (Submodules): Copy the whole codebase 86 | cp -R /nas/home/mingyuma/event-pipeline/event-pipeline-dev YOUR_PREFERRED_PATH 87 | 88 | # 3 - In background: Run REST API for event duration detection module for faster processing 89 | (optional) tmux new -s duration_rest_api 90 | conda activate /nas/home/mingyuma/miniconda3/envs/event-pipeline-dev 91 | cd component/REST_service 92 | python main.py 93 | (optional) exit tmux window 94 | 95 | # To use it for raw text annotation or web app, please follow step 4 and 5 in quick start section. 
96 | ``` 97 | 98 | ## Deployment as Web Service 99 | 100 | Here are instructions on how to deploy the web application on a server. 101 | 102 | ### Set up web server 103 | 104 | ``` 105 | pip install uwsgi 106 | ``` 107 | 108 | If you encounter errors like `error while loading shared libraries libssl.so.1.1`, refer to [this link](https://www.bswen.com/2018/11/others-Openssl-version-cause-error-when-loading-shared-libraries-libssl.so.1.1.html) and do the following 109 | 110 | ``` 111 | export LD_LIBRARY_PATH=/nas/home/mingyuma/miniconda3/envs/event-pipeline/lib:$LD_LIBRARY_PATH 112 | ``` 113 | 114 | ### Server port setting 115 | 116 | External port: 443 (for HTTPS) 117 | 118 | Django will forward traffic from port 443 to internal port 8080 119 | 120 | Internal ports 121 | * 8080: run Django main process 122 | * 17000: run service for duration (only if a separate REST API is run for the duration module; the newer version doesn't need such a separate service) 123 | 124 | ## Citation 125 | 126 | ``` 127 | @inproceedings{ma-etal-2021-eventplus, 128 | title = "{E}vent{P}lus: A Temporal Event Understanding Pipeline", 129 | author = "Ma, Mingyu Derek and 130 | Sun, Jiao and 131 | Yang, Mu and 132 | Huang, Kung-Hsiang and 133 | Wen, Nuan and 134 | Singh, Shikhar and 135 | Han, Rujun and 136 | Peng, Nanyun", 137 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations", 138 | month = jun, 139 | year = "2021", 140 | address = "Online", 141 | publisher = "Association for Computational Linguistics", 142 | url = "https://www.aclweb.org/anthology/2021.naacl-demos.7", 143 | pages = "56--65", 144 | abstract = "We present EventPlus, a temporal event understanding pipeline that integrates various state-of-the-art event understanding components including event trigger and type detection, event argument detection, event duration and temporal relation extraction. 
Event information, especially event temporal knowledge, is a type of common sense knowledge that helps people understand how stories evolve and provides predictive hints for future events. EventPlus as the first comprehensive temporal event understanding pipeline provides a convenient tool for users to quickly obtain annotations about events and their temporal information for any user-provided document. Furthermore, we show EventPlus can be easily adapted to other domains (e.g., biomedical domain). We make EventPlus publicly available to facilitate event-related information extraction and downstream applications.", 145 | } 146 | ``` -------------------------------------------------------------------------------- /component/BETTER/README.md: -------------------------------------------------------------------------------- 1 | # BETTER_project 2 | 3 | ## Prerequisites 4 | 5 | - Install [Git LFS](https://github.com/git-lfs/git-lfs/wiki/Installation). 6 | - Clone the repository. If your repository is already cloned, pull files 7 | with `git-lfs pull`. 8 | - When adding big binary, JSON, etc files track them using Git LFS: `git 9 | lfs track ` or `git lfs track "/**"` to track 10 | everything under that folder. 
def calculate_prob_byObser(crf_obj, emissions, observation, mask):
    '''
    Given padded sequence of crf_path, calculate corresponding score for path.

    The result is ``_compute_score(...) - _compute_normalizer(...)`` as
    computed by ``crf_obj``, i.e. the log-probability of each observed path.

    Args:
        crf_obj : torchcrf object
        emissions (`~torch.Tensor`): Emission score tensor of size
            ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
            ``(batch_size, seq_length, num_tags)`` otherwise.
        observation (`~torch.Tensor`): ``size (seq_length, batch_size)`` if ``batch_first``
            is ``False``, ``(batch_size, seq_length)`` otherwise.
        mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
            if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            ``None`` means "every position is valid".
    Returns:
        torch.FloatTensor in size (batch) # log prob.
    '''
    if mask is None:
        mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

    if crf_obj.batch_first:
        # The private torchcrf helpers below are called with sequence-first
        # tensors, so batch-first inputs are transposed here.
        emissions = emissions.transpose(0, 1)
        mask = mask.transpose(0, 1)
        obser = observation.transpose(0, 1)
    else:
        # Already sequence-first. Previously `obser` was left unbound on this
        # path, so any CRF built with batch_first=False crashed here.
        obser = observation
    numerator = crf_obj._compute_score(emissions, obser, mask)
    denominator = crf_obj._compute_normalizer(emissions, mask)
    return numerator - denominator


def pad_seq(best_path, seq_length, batch_first=True, padding_value=0):
    '''
    Right-pad every tag path in ``best_path`` to ``seq_length``.

    Args:
        best_path: list (batch) of lists of tag ids.
        seq_length (int): target length of every padded path.
        batch_first (bool): must be truthy; only the batch-first layout is
            implemented (enforced with an assert).
        padding_value (int): tag id used for padding.
    Returns:
        torch.LongTensor of size (batch, seq_length).
    '''
    assert batch_first
    batch = []
    for path in best_path:
        ori_len = len(path)
        pads = [padding_value]*(seq_length-ori_len)
        batch.append(path+pads)
    return torch.LongTensor(batch)


def kViterbi(crf_obj, emissions, topK, mask):
    """
    Find the k-best tag sequence using modified Viterbi algorithm.
    Args:
        crf_obj : torchcrf object
        emissions (`~torch.Tensor`): Emission score tensor of size
            ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
            ``(batch_size, seq_length, num_tags)`` otherwise.
        topK (int): How many path want to consider
        mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
            if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
    Returns:
        A pair ``(paths, probs)``.
        For ``topK == 1``: ``paths`` is the output of ``crf_obj.decode`` and
        ``probs`` the squeezed log-probabilities of those paths.
        For ``topK > 1``: ``paths`` is a list (batch) of lists (topK) of tag-id
        lists and ``probs`` is a tensor of size ``(batch, topK)`` produced by
        ``recalculate_probs``.
    """
    assert topK >=1
    if topK == 1:
        # Plain 1-best decoding is delegated to torchcrf.
        # NOTE(review): size(1) is the sequence length only for batch-first
        # emissions (pad_seq asserts batch_first as well).
        seq_length = emissions.size(1)
        best_path = crf_obj.decode(emissions, mask)
        observation = pad_seq(best_path, seq_length, crf_obj.batch_first, 0)
        best_probs = calculate_prob_byObser(crf_obj, emissions, observation, mask)
        return best_path, best_probs.squeeze()

    crf_obj._validate(emissions, mask=mask)
    if mask is None:
        mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)
    if crf_obj.batch_first:
        emissions = emissions.transpose(0, 1)
        mask = mask.transpose(0, 1)
    normalizer = crf_obj._compute_normalizer(emissions, mask)
    # ===============start main part========================
    # emissions: (seq_length, batch_size, num_tags)
    # mask: (seq_length, batch_size)
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == crf_obj.num_tags
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # Start transition and first emission
    # score is a tensor of size (seq_length, batch_size, num_tags, topK) where,
    # at each step and for each batch, the value at tag i and rank j stores the
    # score of the j-th best tag sequence so far that ends with tag i
    #
    # pre_states saves the previous tag where the j-th best path that ends with tag i currently
    # NOTE(review): ranks k >= 1 at t == 0 keep their initial score of 0 (not
    # -inf), so they take part in the ranking at t == 1 — confirm this is intended.
    score = emissions.new_zeros((seq_length, batch_size, crf_obj.num_tags, topK))
    score[0,:,:,0] = crf_obj.start_transitions + emissions[0] # batch x num_tags

    pre_states = np.zeros((seq_length, batch_size, crf_obj.num_tags, topK), int)
    for i in range(crf_obj.num_tags):
        for b in range(batch_size):
            for k in range(topK):
                pre_states[0,b,i,k] = i # should be start transition

    # The ranking of multiple paths through same state
    rank = np.zeros((seq_length, batch_size, crf_obj.num_tags, topK), int)
    for t in range(1, seq_length):
        next_score_list = []
        for k in range(topK):
            broadcast_score = score[t-1,:,:,k].unsqueeze(2) #(batch_size, num_tags, 1)
            broadcast_emissions = emissions[t].unsqueeze(1) #(batch_size, 1, num_tags)

            # Compute the score tensor of size (batch_size, num_tags, num_tags)
            # where for each sample, entry at row i and column j stores
            # the score of the k-th ranked sequence ending in tag i, extended by
            # transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + crf_obj.transitions + broadcast_emissions
            next_score_list.append(next_score)

        for b in range(batch_size):
            if mask[t,b]:
                for cur_state in range(crf_obj.num_tags):
                    # Scores are negated because heapq is a min-heap.
                    h = []
                    for pre_state in range(crf_obj.num_tags):
                        for k in range(topK):
                            heapq.heappush(h, (-1*next_score_list[k][b, pre_state, cur_state], pre_state))

                    # Get the sorted list
                    h_sorted = [heapq.heappop(h) for _ in range(topK)] #get topK path into cur_state
                    # We need to keep a ranking if a path crosses a state more than once
                    rankDict = dict()
                    # Retain the topK scoring paths
                    for k in range(topK):
                        score[t, b, cur_state, k] = score[t, b, cur_state, k] + (h_sorted[k][0].data * -1)
                        pre_states[t, b, cur_state, k] = h_sorted[k][1]
                        state = h_sorted[k][1]
                        if state in rankDict:
                            rankDict[state] = rankDict[state]+1
                        else:
                            rankDict[state] = 0
                        rank[t, b, cur_state, k] = rankDict[state]
            else:
                # Padded position: carry the scores of the previous step forward.
                for cur_state in range(crf_obj.num_tags):
                    for k in range(topK):
                        score[t, b, cur_state, k]=score[t-1, b, cur_state, k]


    batch_path = []
    # NOTE(review): batch_path_prob / k_list_probs are filled but never read;
    # the returned probabilities come from recalculate_probs below.
    batch_path_prob = []
    seq_ends = mask.long().sum(dim=0) - 1 # seq_len x batch # assume seq_ends=8, seq_len=9
    for b in range(batch_size):
        h = []
        for cur_state in range(crf_obj.num_tags):
            for k in range(topK):
                heapq.heappush(h, ( -1 * (score[seq_ends[b], b, cur_state, k]+crf_obj.end_transitions[cur_state]),
                                   cur_state, k))
        h_sorted = [heapq.heappop(h) for _ in range(topK)]
        k_list = np.zeros((topK, seq_ends[b]+1), int) # k x 9
        k_list_probs = list()
        for k in range(topK):
            prob = h_sorted[k][0]
            state = h_sorted[k][1]
            rankK = h_sorted[k][2]

            k_list_probs.append((prob*-1)-(normalizer[b]))
            k_list[k][seq_ends[b]] = state # assign index 8 == last one
            # Back-trace: follow the stored predecessor and its rank at each step.
            for t in range(seq_ends[b]-1, -1, -1): # t = 7,6,5,4,3,2,1,0
                nextState = k_list[k][t+1]
                preState = pre_states[t+1, b, nextState, rankK]
                k_list[k][t] = preState
                rankK = rank[t+1,b,nextState,rankK]
        batch_path.append(k_list.tolist())
        batch_path_prob.append(k_list_probs)
    if crf_obj.batch_first:
        # recalculate_probs expects tensors in the CRF's own layout, so undo
        # the transpose applied above.
        batch_probs = recalculate_probs(crf_obj, batch_path, emissions.transpose(0,1), mask.transpose(0,1), topK)
    else:
        batch_probs = recalculate_probs(crf_obj, batch_path, emissions, mask, topK)
    return batch_path, batch_probs


def recalculate_probs(crf_obj, batch_path, emissions, mask, topK):
    '''
    Score every one of the ``topK`` decoded paths of every batch item.

    Args:
        crf_obj : torchcrf object
        batch_path: List(batch) of List(k) of List of int (tag ids)
        emissions, mask: emissions' and mask's batch_first should align with crf_obj
        topK (int): number of paths per batch item.
    Returns:
        Tensor of size (batch, topK) holding the value returned by
        ``calculate_prob_byObser`` for each path.
    '''
    if crf_obj.batch_first:
        batch_size = emissions.size(0)
    else:
        batch_size = emissions.size(1)

    batch_probs = []
    for k in range(topK):
        # Gather the k-th path of every batch item and pad them to equal length
        # in the same layout the CRF uses.
        candidate = []
        for b in range(batch_size):
            candidate.append(batch_path[b][k])
        observation = pad_sequence([torch.LongTensor(s) for s in candidate],
                                   batch_first=crf_obj.batch_first,
                                   padding_value=0)
        batch_probs.append(calculate_prob_byObser(crf_obj, emissions, observation, mask))

    # (topK, batch) -> (batch, topK)
    return torch.stack(batch_probs, dim=0).transpose(0,1)
class JsonBuilder:
    """Turn per-sentence IOB tag predictions into JSON-serialisable dicts.

    Each ``B2I_*`` argument maps a ``B-`` tag to its matching ``I-`` tag, e.g.
    ``{'B-AGENT': 'I-AGENT', 'B-PATIENT': 'I-PATIENT'}``; one mapping each is
    kept for trigger, argument and NER tag sets.
    """

    def __init__(self, B2I_trigger, B2I_argument, B2I_ner):
        self.B2I_trigger = B2I_trigger
        self.B2I_argument = B2I_argument
        self.B2I_ner = B2I_ner

    def iob_to_obj(self, y, B2I):
        '''
        Collapse an IOB tag sequence into spans.

        Args:
            y: sequence of tag strings for one sentence.
            B2I : {'B-AGENT': 'I-AGENT', 'B-PATIENT': 'I-PATIENT'}
        Returns:
            List of ``[label, start, end]`` with inclusive token indices, where
            ``label`` is the ``B-`` tag with its first two characters removed.
        '''
        obj = []
        in_obj = False
        curr_obj = []
        curr_I = None
        last = len(y) - 1
        for i, tag in enumerate(y):
            # end of obj
            if in_obj:
                if tag != curr_I:
                    # Anything but the expected I- tag closes the span at i-1.
                    obj.append(curr_obj + [i - 1])
                    curr_obj = []
                    curr_I = None
                    in_obj = False
                elif i == last:
                    # Span runs up to the end of the sentence.
                    obj.append(curr_obj + [i])
            # beginning of obj (may directly follow a span closed just above)
            if tag in B2I:
                curr_obj = [tag[2:], i]
                curr_I = B2I[tag]
                in_obj = True
                if i == last:
                    # Single-token span on the final token.
                    obj.append(curr_obj + [i])
        return obj

    def from_preds(self, input_sent, y_preds_t, y_preds_e, y_preds_ner):
        '''
        Build the output record for one sentence.

        Args:
            input_sent: list of tokens.
            y_preds_t: list of trigger tag sequences, one per candidate event.
            y_preds_e: list of argument tag sequences aligned with ``y_preds_t``.
            y_preds_ner: NER tag sequences; only the first one is used.
        Returns:
            A one-element list holding a dict with keys ``'tokens'``,
            ``'events'`` and ``'ner'`` (each NER item is ``[start, end, label]``).
        '''
        assert len(y_preds_t) == len(y_preds_e)
        preds = [
            {'trigger': y_pred_t, 'argument': y_pred_e}
            for y_pred_t, y_pred_e in zip(y_preds_t, y_preds_e)
        ]
        ner = self.iob_to_obj(y_preds_ner[0], self.B2I_ner)
        ner = [[x[1], x[2], x[0]] for x in ner]  # convert the order for each ner obj
        events_pred = self.convert_out_dicts_to_event_dicts(preds, input_sent)
        return [{'tokens': input_sent,
                 'events': events_pred,
                 'ner': ner
                 }]

    def convert_out_dicts_to_event_dicts(self, sel_preds, input_sent):
        '''
        `sel_preds` contain sent-level prediction: a list of dicts with the
        keys 'trigger' and 'argument', each an IOB tag sequence.

        Returns a list of dicts with keys 'event_type', 'triggers' and
        'arguments'. Predictions whose trigger sequence contains no span are
        skipped; the event type is taken from the first trigger span.
        '''
        out_dicts = []
        for event in sel_preds:
            trigger_objs = self.iob_to_obj(event['trigger'], self.B2I_trigger)
            if len(trigger_objs) == 0:
                continue

            out_dict = {'event_type': trigger_objs[0][0]}
            trigger_span_dicts = self.get_span_dicts_from_objs(trigger_objs, input_sent, task='trigger')
            arg_objs = self.iob_to_obj(event['argument'], self.B2I_argument)
            argu_span_dicts = self.get_span_dicts_from_objs(arg_objs, input_sent, task='argument')
            out_dict['triggers'] = trigger_span_dicts
            out_dict['arguments'] = argu_span_dicts
            out_dicts.append(out_dict)
        return out_dicts

    def get_span_dicts_from_objs(self, objs, input_sent, task='trigger'):
        '''
        Convert ``[label, start, end]`` spans into span dicts.

        Args:
            objs: spans as returned by ``iob_to_obj``.
            input_sent: list of tokens the indices refer to.
            task: 'trigger' (label stored under 'event_type') or 'argument'
                (label stored under 'role').
        Returns:
            List of dicts with the label key plus 'text', 'start_token' and
            'end_token' (inclusive).
        Raises:
            ValueError: if ``task`` is not one of the two supported values.
                Previously this surfaced as an unbound local or silently
                duplicated the preceding span.
        '''
        label_keys = {'trigger': 'event_type', 'argument': 'role'}
        if task not in label_keys:
            raise ValueError(
                "task must be 'trigger' or 'argument', got {!r}".format(task))
        label_key = label_keys[task]

        span_dicts = []
        for obj in objs:
            role = obj[0]
            l_idx = obj[1]
            r_idx = obj[2]
            text = input_sent[l_idx] if r_idx == l_idx \
                    else ' '.join(input_sent[l_idx:r_idx+1])
            span_dicts.append({label_key: role,
                               'text': text,
                               'start_token': l_idx,
                               'end_token': r_idx
                               })
        return span_dicts
The expected output should be 14 | ``` 15 | [{'tokens': ['Orders', 'went', 'out', 'today', 'to', 'deploy', '17,000', 'U.S.', 'Army', 'soldiers', 'in', 'the', 'Persian', 'Gulf', 'region', '.'], 'events': [{'event_type': 'Movement:Transport', 'triggers': [{'event_type': 'Movement:Transport', 'text': 'deploy', 'start_token': 5, 'end_token': 5}], 'arguments': [{'role': 'Artifact', 'text': 'soldiers', 'start_token': 9, 'end_token': 9}, {'role': 'Destination', 'text': 'region', 'start_token': 14, 'end_token': 14}]}], 'ner': [[7, 7, 'GPE'], [8, 8, 'ORG'], [9, 9, 'PER'], [12, 13, 'LOC'], [14, 14, 'LOC']]}] 16 | ``` 17 | -------------------------------------------------------------------------------- /component/BETTER/joint/all_liz/BETTER_pos2idx.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/all_liz/BETTER_pos2idx.pickle -------------------------------------------------------------------------------- /component/BETTER/joint/all_liz/pos_emb.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/all_liz/pos_emb.npy -------------------------------------------------------------------------------- /component/BETTER/joint/dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils import data 2 | from torch.nn.utils.rnn import pad_sequence 3 | import torch 4 | import pickle 5 | from generate_data.contextualized_features_bert import bert_token 6 | 7 | TOKEN_PAD_ID = 0 8 | POS_PAD_ID = 6 9 | TRI_PAD_ID = 0 10 | ARGU_PAD_ID = 0 11 | 12 | 13 | class EventDataset(data.Dataset): 14 | 'Characterizes a dataset for PyTorch' 15 | def __init__(self, pkl_file, args): 16 | self.args = args 17 | # load data 18 | with open(pkl_file, 
'rb') as handle: 19 | self.data = pickle.load(handle) 20 | 21 | # preprocessing 22 | new_data = list() 23 | for i in range(len(self.data)): 24 | out = list() 25 | if args.use_bert: 26 | out.append(self.data[i]['contextual_feature']) 27 | elif args.finetune_bert: 28 | sent_bert_tokens, sent_bert_ids, orig_to_tok_map = bert_token(self.data[i]['tokens'], args.bert_tokenizer) 29 | out.append(sent_bert_ids) 30 | 31 | else: 32 | if args.lower: 33 | out.append([args.word2idx[x.lower()] for x in self.data[i]['tokens']]) 34 | else: 35 | out.append([args.word2idx[x] for x in self.data[i]['tokens']]) 36 | out.append([args.pos2idx[x] for x in self.data[i]['pos_tag']]) 37 | if args.trigger_type: 38 | out.append([args._label_to_id_t[x] for x in self.data[i]['sent_tri_label_type']]) 39 | else: 40 | out.append([args._label_to_id_t[x] for x in self.data[i]['trigger_label']]) 41 | # if args.decode_w_ents_mask is False: 42 | # out.append([args._label_to_id_e_sent[x] for x in self.data[i]['argu_label']]) 43 | # elif args.decode_w_ents_mask is True: 44 | out.append([args._label_to_id_e_sent[x] for x in self.data[i]['ent_label']]) # now this item is argument candidates, instead of arguments 45 | if args.trigger_type: 46 | out.append([([args._label_to_id_t[x] for x in i[0]], [args._label_to_id_e[y] for y in i[1]]) \ 47 | for i in self.data[i]['sent_tri_arg_pairs_type']]) 48 | else: 49 | out.append([([args._label_to_id_t[x] for x in i[0]], [args._label_to_id_e[y] for y in i[1]]) \ 50 | for i in self.data[i]['tri_arg_pairs']]) 51 | # case 0 : use permutation of gold trigger and gold argument 52 | # out.append([(x[0], x[1], x[2], x[3], args._label_to_id_r[x[4]])\ 53 | # for x in self.data[i]['all_tri_arg_pairs']]) 54 | out.append([]) ##### TODO, now dont do the `all_tri_arg_pairs` items so this is an empty list 55 | # case 1 : use candidate augmented pairs 56 | #out.append([(x[0], x[1], x[2], x[3], args._label_to_id_r[x[4]])\ 57 | # for x in self.data[i]['all_pairs_by_cand']]) 58 | 59 | 
out.append(self.data[i]['sent_id']) 60 | if args.use_glove: 61 | if args.lower: 62 | out.append([args.word2idx[x.lower()] for x in self.data[i]['tokens']]) 63 | else: 64 | out.append([args.word2idx[x] for x in self.data[i]['tokens']]) 65 | else: 66 | out.append([]) 67 | if args.finetune_bert: 68 | out.append(orig_to_tok_map) 69 | else: 70 | out.append([]) 71 | 72 | out.append(self.data[i]['ent_to_arg']) 73 | 74 | new_data.append(out) 75 | self.data = new_data 76 | 77 | def __len__(self): 78 | 'Denotes the total number of samples' 79 | return len(self.data) 80 | 81 | def __getitem__(self, idx): 82 | 'Generates one sample of data' 83 | sample = self.data[idx] 84 | sent_token = sample[0] 85 | sent_pos = sample[1] 86 | sent_label_t = sample[2] 87 | sent_label_e = sample[3] 88 | sent_tri_arg_pairs = sample[4] # each pair is (seq, seq) 89 | all_pairs = sample[5] # each pair is(l_start, l_end, r_start, r_end, arg_role) 90 | sent_id = sample[6] 91 | glove_idx = sample[7] 92 | orig_to_tok_map = sample[8] 93 | ent_to_arg_dict = sample[9] 94 | return sent_token, sent_pos, sent_label_t, sent_label_e, sent_tri_arg_pairs, all_pairs, sent_id, glove_idx, orig_to_tok_map, ent_to_arg_dict 95 | 96 | def pad_collate(batch): 97 | if len(batch) >= 1: 98 | # sort sents in each batch according to the sent len 99 | bs = list(zip(*[ex for ex in sorted(batch, key=lambda x: len(x[0]), reverse=True)])) 100 | lengths = [len(x) for x in bs[0]] 101 | sents = pad_sequence([torch.LongTensor(s) for s in bs[0]], batch_first=True, padding_value=TOKEN_PAD_ID) 102 | poss = pad_sequence([torch.LongTensor(s) for s in bs[1]], batch_first=True, padding_value=POS_PAD_ID) 103 | triggers = pad_sequence([torch.LongTensor(s) for s in bs[2]], batch_first=True, padding_value=TRI_PAD_ID) 104 | arguments = pad_sequence([torch.LongTensor(s) for s in bs[3]], batch_first=True, padding_value=ARGU_PAD_ID) 105 | seq_pairs = bs[4] 106 | all_pairs = bs[5] 107 | sent_ids = bs[6] 108 | 109 | return sent_ids, sents, poss, 
def pad_collate_bert(batch):
    """Collate samples whose first field holds pre-computed contextual features.

    Samples are ordered by descending length of their first field, then the
    first four fields are padded into batch-first tensors.

    Args:
        batch: sequence of per-sample tuples; fields 0-7 are read here
            (features, POS ids, trigger ids, argument ids, sequence pairs,
            all pairs, sentence id, GloVe ids).

    Returns:
        ``(sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs,
        all_pairs, glove_idx, orig_to_tok_map, bert_lengths)`` where
        ``orig_to_tok_map`` is always ``None`` and ``bert_lengths`` is always
        an empty list; ``None`` when *batch* is empty.
    """
    if len(batch) < 1:
        return None

    def _pad_ids(seqs, pad_value):
        # Integer id sequences -> one (batch, max_len) LongTensor.
        return pad_sequence([torch.LongTensor(s) for s in seqs],
                            batch_first=True, padding_value=pad_value)

    # Longest sample first; sorted() is stable, so ties keep their input order.
    ordered = sorted(batch, key=lambda ex: len(ex[0]), reverse=True)
    cols = list(zip(*ordered))

    lengths = [len(feat) for feat in cols[0]]
    sents = pad_sequence([torch.FloatTensor(feat) for feat in cols[0]],
                         batch_first=True, padding_value=0.)
    poss = _pad_ids(cols[1], POS_PAD_ID)
    triggers = _pad_ids(cols[2], TRI_PAD_ID)
    arguments = _pad_ids(cols[3], ARGU_PAD_ID)

    # cols[7] has one entry per sample, so for a non-empty batch this test is
    # always true; the else branch is kept only for parity.
    # NOTE(review): if the intent was "no GloVe ids available", the check
    # probably needs to look at the per-sample lists instead -- confirm.
    if len(cols[7]) > 0:
        glove_idx = _pad_ids(cols[7], TOKEN_PAD_ID)
    else:
        glove_idx = None

    return (cols[6], sents, poss, triggers, arguments, lengths,
            cols[4], cols[5], glove_idx, None, [])
batch_first=True, padding_value=ARGU_PAD_ID) 142 | seq_pairs = bs[4] 143 | all_pairs = bs[5] 144 | sent_ids = bs[6] 145 | glove_idx = pad_sequence([torch.LongTensor(s) for s in bs[7]], batch_first=True, padding_value=TOKEN_PAD_ID) #None # in finetune case, do not include glove_idx 146 | orig_to_tok_map = bs[8] 147 | ent_to_arg_dict = bs[9] 148 | 149 | return sent_ids, sents, poss, triggers, arguments, lengths, seq_pairs, all_pairs, glove_idx, orig_to_tok_map, bert_lengths, ent_to_arg_dict 150 | -------------------------------------------------------------------------------- /component/BETTER/joint/generate_data/all_uw.comb.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/BETTER/joint/generate_data/all_uw.comb.pkl -------------------------------------------------------------------------------- /component/BETTER/joint/generate_data/contextualized_features.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import pickle 4 | from util import * 5 | from transformers import * 6 | import tqdm 7 | 8 | MODELS = [(XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-large')] 9 | #MODELS = [(RobertaModel, RobertaTokenizer, 'roberta-large')] 10 | 11 | p = argparse.ArgumentParser() 12 | p.add_argument('input_file', type=str, 13 | help="Input pkl file (converted from internal JSON)") 14 | p.add_argument('output_file', type=str, 15 | help="Where to save the output features pkl file") 16 | args = p.parse_args() 17 | 18 | for model_class, tokenizer_class, pretrained_weights in MODELS: 19 | tokenizer = tokenizer_class.from_pretrained(pretrained_weights) 20 | model = model_class.from_pretrained(pretrained_weights) 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | model.to(device) 23 | data = pickle.load(open(args.input_file, 'rb')) 24 | 
def bert_token(sent_orig_tokens, tokenizer):
    """Split pre-tokenized words into word pieces and record the alignment.

    The output sequence starts with ``[CLS]`` and ends with ``[SEP]``.

    Args:
        sent_orig_tokens: list of word strings for one sentence.
        tokenizer: object exposing ``tokenize``, ``encode``,
            ``convert_tokens_to_ids`` and ``convert_ids_to_tokens``
            (presumably a Hugging Face BERT tokenizer -- confirm with caller).

    Returns:
        ``(sent_bert_tokens, sent_bert_ids, orig_to_tok_map)`` where
        ``orig_to_tok_map[i]`` is the position in ``sent_bert_tokens`` of the
        first piece produced for ``sent_orig_tokens[i]``.
    """
    sent_bert_tokens = ["[CLS]"]
    sent_bert_ids = list(tokenizer.encode("[CLS]", add_special_tokens=False))
    orig_to_tok_map = []

    for orig_token in sent_orig_tokens:
        orig_to_tok_map.append(len(sent_bert_tokens))
        # Tokenize once per word; the result drives both the branch and the
        # token list (previously this was recomputed up to three times).
        pieces = tokenizer.tokenize(orig_token)
        if pieces:
            sent_bert_tokens.extend(pieces)
            sent_bert_ids.extend(tokenizer.encode(orig_token, add_special_tokens=False))
        else:
            # Some special characters make the tokenizer return nothing; fall
            # back to a direct vocabulary lookup so the word still owns a slot.
            fallback_ids = tokenizer.convert_tokens_to_ids([orig_token])
            sent_bert_ids.extend(fallback_ids)
            sent_bert_tokens.extend(tokenizer.convert_ids_to_tokens(fallback_ids))

    sent_bert_tokens.append("[SEP]")
    sent_bert_ids.extend(tokenizer.encode("[SEP]", add_special_tokens=False))
    return sent_bert_tokens, sent_bert_ids, orig_to_tok_map
def _write_bio_span(seq_label, l_idx, r_idx, tag):
    """Write ``B-tag`` at l_idx and ``I-tag`` on l_idx+1..r_idx, in place."""
    if r_idx < l_idx:
        # Inverted spans are ignored, as before.
        return
    seq_label[l_idx] = 'B-{}'.format(tag)
    seq_label[l_idx + 1: r_idx + 1] = ['I-{}'.format(tag)] * (r_idx - l_idx)


def get_seq_label_from_idxs(idxs, tokens, label_str='ANCHOR', types=None):
    """Build a BIO label sequence for *tokens* from inclusive token spans.

    Args:
        idxs: iterable of spans ``(start, end)`` with inclusive ends; for
            ``label_str='ENT'`` a span may carry a third element, the role.
        tokens: the sentence tokens; only their count is used.
        label_str: ``'ANCHOR'`` tags every span ``B-ANCHOR``/``I-ANCHOR``;
            ``'TYPE'`` tags span ``k`` with ``types[k]``; ``'ENT'`` tags with
            the upper-cased role when present, otherwise ``ENT``.  Any other
            value leaves every label as ``'O'``.
        types: one type string per span; required when ``label_str='TYPE'``.

    Returns:
        list of label strings, one per token.

    Raises:
        AssertionError: ``label_str='TYPE'`` and ``idxs``/``types`` differ in
            length.
    """
    seq_label = ['O'] * len(tokens)

    if label_str == 'ANCHOR':
        # labels are among {'O', 'B-ANCHOR', 'I-ANCHOR'}
        for span in idxs:
            _write_bio_span(seq_label, span[0], span[1], label_str)
    elif label_str == 'TYPE':
        # The check used to open a pdb session on failure, which hangs
        # non-interactive runs; fail with a readable message instead.
        assert len(idxs) == len(types), (
            'got {} spans but {} types'.format(len(idxs), len(types)))
        # labels are among {'O', 'B-<type>', 'I-<type>', ...}
        for span, span_type in zip(idxs, types):
            _write_bio_span(seq_label, span[0], span[1], span_type)
    elif label_str == 'ENT':
        for span in idxs:
            # A 3-tuple carries the argument role; a 2-tuple is a plain
            # sentence-level entity span.
            arg_role = span[2].upper() if len(span) == 3 else 'ENT'
            _write_bio_span(seq_label, span[0], span[1], arg_role)

    return seq_label
def check_span(gold_start, gold_end, c_start, c_end):
    """Return True when one of the two inclusive spans contains the other."""
    if gold_start == c_start:
        # Same left edge: whichever span is longer contains the other.
        return True
    if gold_start > c_start:
        # Gold starts later, so only the candidate can be the container.
        return gold_end <= c_end
    # Gold starts earlier, so only gold can be the container.
    return gold_end >= c_end


def check_duplicate(all_pairs, current_pair):
    """Return True when *current_pair* matches an entry of *all_pairs*.

    A match means both the trigger span (items 0-1) and the argument span
    (items 2-3) satisfy ``check_span`` against the same existing pair.
    """
    cand_tri = (current_pair[0], current_pair[1])
    cand_arg = (current_pair[2], current_pair[3])
    for known in all_pairs:
        same_trigger = check_span(known[0], known[1], *cand_tri)
        same_argument = check_span(known[2], known[3], *cand_arg)
        if same_trigger and same_argument:
            return True
    return False


def generate_all_candidate_pairs(all_candidates, all_pairs):
    '''
    all_candidates is a list of tuples: (start_idx, end_idx)
    all_pairs is a list of gold trigger-argument pairs:
        [(tri_start, tri_end, arg_start, arg_end, label), (....)]

    output: every ordered pair of distinct candidates that does not match a
    gold pair, labelled 'None', followed by the gold pairs themselves.
    '''
    negatives = []
    for tri_pos, trigger in enumerate(all_candidates):
        for arg_pos, argument in enumerate(all_candidates):
            if tri_pos == arg_pos:
                continue
            spans = (trigger[0], trigger[1], argument[0], argument[1])
            if check_duplicate(all_pairs, spans):
                continue
            negatives.append(spans + ('None',))
    return negatives + all_pairs
131 | documents = {} 132 | data_outs = [] 133 | for doc_id, doc in data.items(): 134 | documents[doc_id] = BetterDocument.from_json(doc) 135 | for s in documents[doc_id].sentences: 136 | sent_id = s.sent_id 137 | sentence = s.text 138 | tokens = s.words 139 | pos_tags = s.pos_tags 140 | 141 | # gather all events in this sentence 142 | sent_events = documents[doc_id].abstract_events[sent_id] 143 | sent_tri_idxs, sent_arg_idxs = [], [] 144 | sent_tri_arg_pairs = [] 145 | sent_tri_agent_pairs = [] 146 | sent_tri_patient_pairs = [] 147 | sent_tri_arg_pairs_type = [] 148 | sent_event_types = [] 149 | for event in sent_events: 150 | tri_idxs = [(x.grounded_span.head_span.start_token, x.grounded_span.head_span.end_token) for x in event.anchors.spans] 151 | arg_idxs = [(x.grounded_span.head_span.start_token, x.grounded_span.head_span.end_token, y.role) 152 | for y in event.arguments for x in y.span_set.spans] 153 | assert len(tri_idxs) > 0, pdb.set_trace() 154 | type1 = event.properties['material-verbal'] 155 | type2 = event.properties['helpful-harmful'] 156 | # if type1 not in ['material', 'verbal', 'both', 'unk']: 157 | # type1 = 'unk' 158 | # if type2 not in ['helpful', 'harmful', 'neutral']: 159 | # type2 = 'unk' 160 | assert type1 in ['material', 'verbal', 'both', 'unk'], pdb.set_trace() 161 | assert type2 in ['helpful', 'harmful', 'neutral'], pdb.set_trace() 162 | event_type = '{}_{}'.format(type1, type2) 163 | sent_event_types.extend([event_type] * len(tri_idxs)) 164 | 165 | tri_label = get_seq_label_from_idxs(tri_idxs, tokens, 'ANCHOR') 166 | arg_label = get_seq_label_from_idxs(arg_idxs, tokens, 'ENT') 167 | sent_tri_arg_pairs.append((tri_label, arg_label)) 168 | 169 | tri_label_type = get_seq_label_from_idxs(tri_idxs, tokens, 'TYPE', [event_type] * len(tri_idxs)) 170 | sent_tri_arg_pairs_type.append((tri_label_type, arg_label)) 171 | sent_tri_idxs.extend(tri_idxs) 172 | sent_arg_idxs.extend(arg_idxs) 173 | sent_tri_idxs_uniq = list(set(sent_tri_idxs)) # there are 
def save_pkl(data, out_file):
    """Pickle *data* into *out_file* and report the path on stdout."""
    payload = pickle.dumps(data)
    with open(out_file, 'wb') as sink:
        sink.write(payload)
    print('{} saved.'.format(out_file))
def clean_ori_sent(ori_sent):
    """Normalize punctuation in a raw sentence.

    The substitutions run in the listed order: runs of dots and dashes are
    shortened, LaTeX-style quotes and backticks become plain quotes, and a
    sentence-final ``etc.`` gets a separate closing period.
    """
    rewrites = (
        (r"\.\.\.\.", "..."),
        (r"---", "--"),
        (r"``", '"'),
        (r"''", '"'),
        (r"`", "'"),
        (r"\.{3,}", "..."),
        (r"etc\.$", "etc. ."),
        (r"etc\.\)$", "etc. .)"),
    )
    cleaned = ori_sent
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def align_features_to_words(features, alignment):
    """Pool sub-word feature rows into one row per word.

    Args:
        features: 2-D tensor with one row per sub-word position.
        alignment: list of row-index lists, one per word; row 0 must not be
            referenced.

    Returns:
        Tensor with one row per entry of *alignment*: the sum of that word's
        rows after each row has been divided by the number of times it is
        referenced across the whole alignment.
    """
    assert features.dim() == 2

    usage = Counter()
    for word_rows in alignment:
        usage.update(word_rows)
    assert usage[0] == 0  # shouldn't be aligned

    # Rows nobody references keep a divisor of 1 so they stay unchanged.
    divisors = features.new_tensor([usage.get(row, 1) for row in range(len(features))])
    shared = features / divisors[:, None]

    pooled = [shared[word_rows].sum(dim=0) for word_rows in alignment]
    return torch.stack(pooled)
def minEditMatching(target, source):
    """Align *target* and *source* with a unit-cost edit distance.

    Args:
        target: sequence of comparable items (e.g. token strings).
        source: sequence of comparable items.

    Returns:
        ``(target_aln, source_aln, target_id, source_id)``:
        ``target_aln``/``source_aln`` are the two sequences in order with
        ``"___"`` inserted where the other side has an unmatched item;
        ``target_id`` lists the *source* indices that have no counterpart in
        target, and ``source_id`` lists the *target* indices that have no
        counterpart in source (both in back-to-front discovery order).

    Raises:
        RuntimeError: if the distance table cannot be traced back (should be
            impossible for a table built here).
    """
    def _subst_cost(x, y):
        # Same rule as the module's substCostSen: 0 for equal items, else 1.
        return 0 if x == y else 1

    n = len(target)
    m = len(source)
    distance = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        distance[i][0] = distance[i - 1][0] + 1
    for j in range(1, m + 1):
        distance[0][j] = distance[0][j - 1] + 1

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            distance[i][j] = min(
                distance[i - 1][j - 1] + _subst_cost(source[j - 1], target[i - 1]),
                distance[i - 1][j] + 1,
                distance[i][j - 1] + 1)

    ii = n
    jj = m
    target_aln = []
    source_aln = []
    target_id = []
    source_id = []
    while ii > 0 or jj > 0:
        # Each move is guarded by its own bound.  Without the guards, index -1
        # wraps to the last row/column once one sequence is exhausted and can
        # fake a diagonal match (e.g. ['a', 'a'] vs ['a']).
        if (ii > 0 and jj > 0 and
                distance[ii][jj] - _subst_cost(source[jj - 1], target[ii - 1])
                == distance[ii - 1][jj - 1]):
            target_aln.append(target[ii - 1])
            source_aln.append(source[jj - 1])
            ii -= 1
            jj -= 1
        elif jj > 0 and distance[ii][jj] - 1 == distance[ii][jj - 1]:
            source_aln.append(source[jj - 1])
            target_aln.append("___")
            jj -= 1
            target_id.append(jj)
        elif ii > 0 and distance[ii][jj] - 1 == distance[ii - 1][jj]:
            source_aln.append("___")
            target_aln.append(target[ii - 1])
            ii -= 1
            source_id.append(ii)
        else:
            # Previously this printed "error!" and never advanced, i.e. an
            # endless loop; fail loudly instead.
            raise RuntimeError(
                'edit-distance backtrace stuck at ({}, {})'.format(ii, jj))

    target_aln = target_aln[::-1]
    source_aln = source_aln[::-1]
    return (target_aln, source_aln, target_id, source_id)
def split_tri_output(trigger_seq, B2I):
    '''
    Split a predicted trigger label sequence into trigger chunks.

    Args:
        trigger_seq: a list of int label ids, one per token.
        B2I: a dict mapping the id of each ``B-xxx`` label to the id of the
             matching ``I-xxx`` label.
    Return:
        (tri_idx, tri_type)
        tri_idx: a list of lists of token positions, one list per chunk,
                 e.g. [[1,2], [5,6], [10]]
        tri_type: for each chunk, the id of the ``B-xxx`` label that opened it
    '''
    tri_idx = []
    tri_type = []
    in_chunk = False
    curr_idx = []
    curr_B = None
    curr_I = None

    for i, label in enumerate(trigger_seq):
        if in_chunk:
            if label == curr_I:
                curr_idx.append(i)
            else:
                # Anything but the expected I label ends the running chunk.
                tri_idx.append(curr_idx)
                tri_type.append(curr_B)
                in_chunk = False

        if label in B2I:
            # A B label always opens a fresh chunk.  Its own id is kept as the
            # chunk type (previously derived as I_id - 1, which is only right
            # when the B and I ids are adjacent).
            curr_idx = [i]
            curr_B = label
            curr_I = B2I[label]
            in_chunk = True

    if in_chunk:
        # The sequence ended while a chunk was still open.
        tri_idx.append(curr_idx)
        tri_type.append(curr_B)

    assert len(tri_idx) == len(tri_type)
    return tri_idx, tri_type
def write_result(filename, args, scores):
    """Append one run summary to the pickled result list stored in *filename*.

    The file holds a pickled list of dicts; it is created when missing.

    Args:
        filename: path of the pickle file to read (if present) and rewrite.
        args: namespace holding the run settings read below (``save_dir``,
            ``task``, ``use_bert``, ``batch``, ``epochs``, ``lr``, ``opt``,
            ``hid``, ``num_layers``, ``dropout``, ``activation``, ``use_crf``,
            ``use_att``, ``att_mthd``, ``trigger_weight``,
            ``argument_weight``).
        scores: dict with keys ``'dev_f1'``, ``'test_f1_t'``, ``'test_f1_e'``.
    """
    if os.path.exists(filename):
        # Close the handle deterministically (it used to be left open).
        # NOTE: pickle must only ever be loaded from files this code wrote.
        with open(filename, 'rb') as f:
            result = pickle.load(f)
    else:
        result = list()
    result.append({
        'score_time': datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
        'save_dir': args.save_dir,
        'task': args.task,
        'use_bert': args.use_bert,
        'batch_size': args.batch,
        'epoch': args.epochs,
        'lr': args.lr,
        'opt': args.opt,
        'hid': args.hid,
        'n_layers': args.num_layers,
        'dp': args.dropout,
        'act': args.activation,
        'use_crf': args.use_crf,
        'use_att': args.use_att,
        'att_mthd': args.att_mthd,
        'trigger_weight': args.trigger_weight,
        'argument_weight': args.argument_weight,
        'dev_f1': scores['dev_f1'],
        'test_f1_tri': scores['test_f1_t'],
        'test_f1_argu': scores['test_f1_e']
    })
    with open(filename, 'wb') as f:
        pickle.dump(result, f)
def get_output_rel(pred_logit, input_ref):
    '''
    Split flat per-event predictions back into one list per batch item.

    input_ref: a list of integer, each integer indicate how many output in each batch
    pred_logit: a tensor (# of total events in a batch, num_class)

    output: a list of list of prediction(integer); items whose count is 0
            get an empty list
    '''
    counts = list(input_ref)
    # Nothing to predict: do not touch pred_logit at all, so a placeholder
    # tensor of any shape is accepted (same as the per-item loop it replaces).
    if not any(counts):
        return [[] for _ in counts]

    # One argmax and one tensor->list conversion for the whole batch instead
    # of one per item; the per-item results are then plain list slices.
    flat = torch.argmax(pred_logit, dim=1, keepdim=False).tolist()

    output = list()
    cnt = 0
    for n in counts:
        output.append(flat[cnt:cnt + n])
        cnt += n
    return output
def eval_struct_score(y_trues_t, y_preds_t, y_trues_e, y_preds_e, y_pred_paired, sent_ids, test=True):
    """Score trigger tagging, argument tagging and end-to-end event extraction.

    The end-to-end predictions are first dumped to
    ``temp/end2end_event_level_arg_cls.pkl`` and then compared by
    ``eval_e2e_event_level_arg_id_cls`` against a gold pickle under
    ``out_pkl/`` (test or dev file, selected by ``test``).

    :param y_trues_t: gold trigger labels
    :param y_preds_t: predicted trigger labels (same length as ``y_trues_t``)
    :param y_trues_e: gold argument labels
    :param y_preds_e: predicted argument labels (same length as ``y_trues_e``)
    :param y_pred_paired: per sentence, an iterable of events; ``event[0]`` is
        stored as the predicted trigger and ``event[1]`` as the predicted argument
    :param sent_ids: sentence ids, aligned with ``y_pred_paired``
    :param bool test: evaluate against the test gold file if True, else the dev one
    :return: dict with keys ``f1_tri``, ``acc_tri``, ``f1_arg``, ``acc_arg``,
        ``precision_e2e``, ``recall_e2e``, ``f1_e2e``
    """
    # trigger id score:
    assert len(y_trues_t) == len(y_preds_t)
    f1_tri = f1_score(y_trues_t, y_preds_t)
    acc_tri = accuracy_score(y_trues_t, y_preds_t)

    # sent-level argument id score:
    assert len(y_trues_e) == len(y_preds_e)
    f1_arg = f1_score(y_trues_e, y_preds_e)
    acc_arg = accuracy_score(y_trues_e, y_preds_e)
    # NOTE: the classification_report() calls that used to follow each score
    # were removed: their result was assigned and never read.

    # end2end eval
    output_event = [
        {'sent_id': sent_id, 'pred_trigger': event[0], 'pred_arg': event[1]}
        for i, sent_id in enumerate(sent_ids)
        for event in y_pred_paired[i]
    ]
    pred_path = 'temp/end2end_event_level_arg_cls.pkl'
    # The dump used to fail with FileNotFoundError on a fresh checkout
    # because nothing created the temp/ directory.
    os.makedirs(os.path.dirname(pred_path), exist_ok=True)
    with open(pred_path, 'wb') as f:
        pickle.dump(output_event, f)
    print('temp pkl saved, start evaluation...')

    B2I_trigger = {'B-ANCHOR': 'I-ANCHOR'}
    B2I_arg = {'B-AGENT': 'I-AGENT', 'B-PATIENT': 'I-PATIENT'}
    # Only the gold file differs between test and dev evaluation.
    if test:
        gold_path = 'out_pkl/gold_event_level_tri_arg_cls.pkl'
    else:
        gold_path = 'out_pkl/dev_gold_event_level_tri_arg_cls.pkl'
    prec, recall, f1 = eval_e2e_event_level_arg_id_cls(gold_path, pred_path,
                                                       B2I_trigger, B2I_arg)

    scores = {
        'f1_tri': f1_tri,
        'acc_tri': acc_tri,
        'f1_arg': f1_arg,
        'acc_arg': acc_arg,
        'precision_e2e': prec,
        'recall_e2e': recall,
        'f1_e2e': f1
    }
    return scores
/component/Duration/.gitignore: -------------------------------------------------------------------------------- 1 | *stanford-corenlp* 2 | dataset 3 | # scripts/src/factslab 4 | scripts/.ipynb_checkpoints 5 | *__pycache__* 6 | *.hdf5 7 | # *.pth 8 | .idea 9 | *df.csv 10 | logs/ -------------------------------------------------------------------------------- /component/Duration/README.md: -------------------------------------------------------------------------------- 1 | # UDS-T 2 | 3 | Event Duration Baselines on UDS-T 4 | 5 | > This repo provides baseline models and evaluations for 6 | time-duration classification on the UDS-T dataset. 7 | 8 | --- 9 | 10 | ## Environment Setup 11 | ``` 12 | conda create -n event_dur 13 | conda install pip 14 | pip install git+https://github.com/hltcoe/PredPatt.git 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | 19 | 20 | ## Inference API 21 | 22 | See requirements.txt for required dependencies.

23 | Example for performing duration model inference. 24 | ```python 25 | from inference_api import DurationAPI 26 | 27 | # events_json = EventsModel(...) 28 | # events = json.loads(events_json) # Parse JSON string 29 | predictions = DurationAPI().pred(events) # returns a list of dicts 30 | ``` 31 | 32 | Output structure (list of dicts): 33 | ```json5 34 | [ 35 | { 36 | 'duration': 'days', 37 | 'pred_text': 'meeting', 38 | 'pred_idx': 13, 39 | 'sentence': 'There was a ...' 40 | }, 41 | ] 42 | ``` 43 | 44 | 45 | --- 46 | ## Models 47 | 48 | 49 | - ELMo-MLP Baseline 50 |
51 | - In eval mode (under `torch.no_grad`), inference consumes ~1.3 GB of GPU memory at batch size 1 52 | 53 |
54 | 55 | - BERT/RoBERTa Baseline 56 | 57 | 58 |
59 | 60 |
61 | 62 | --- 63 | 64 | ## Training 65 | 66 | Run the following script for training: 67 | 68 | 69 | 70 | ```bash 71 | $ python3 main.py \ 72 | --mode train 73 | ``` 74 | 75 | 76 | 77 |
78 | --- 79 | 80 | *TO-DOs* 81 | 82 | - [ ] Add BERT baseline 83 | 84 | 85 | 86 | ## References 87 | [1] [Fine-Grained Temporal Relation Extraction](https://www.aclweb.org/anthology/P19-1280/)
88 | [2] []()
89 | -------------------------------------------------------------------------------- /component/Duration/UDS_T_data/first10.tsv: -------------------------------------------------------------------------------- 1 | Split Annotator.ID Sentence1.ID Pred1.Span Pred1.Token Event1.ID Sentence2.ID Pred2.Span Pred2.Token Event2.ID Pred1.Text Pred1.Lemma Pred2.Text Pred2.Lemma Pred1.Duration Pred2.Duration Pred1.Beg Pred1.End Pred2.Beg Pred2.End Pred1.Duration.Confidence Pred2.Duration.Confidence Relation.Confidence Document.ID 2 | train 209 en-ud-train.conllu 418 17 17 en-ud-train.conllu 418_17 en-ud-train.conllu 418 18_19 19 en-ud-train.conllu 418_19 think think is intentional intentional 2 0 35 41 64 65 4.0 4.0 4.0 10 3 | train 209 en-ud-train.conllu 9490 51 51 en-ud-train.conllu 9490_51 en-ud-train.conllu 9490 53 53 en-ud-train.conllu 9490_53 grey grey dark dark 6 6 0 65 0 65 4.0 4.0 4.0 189 4 | train 209 en-ud-train.conllu 7143 8 8 en-ud-train.conllu 7143_8 en-ud-train.conllu 7144 11_12_13_14_15_18_19 14 en-ud-train.conllu 7144_14 employed employ be the greatest threats to... threat 7 7 0 81 0 81 4.0 4.0 4.0 83 5 | train 209 en-ud-train.conllu 7786 17 17 en-ud-train.conllu 7786_17 en-ud-train.conllu 7787 4_5_6_7_8_9 9 en-ud-train.conllu 7787_9 getting get is a good nano protein... 
class DurationAPI:
    """Load the duration ``TemporalModel`` once and serve predictions for event triggers.

    The model hyper-parameters are not passed in: they are parsed from the
    underscore-separated fields of the checkpoint file name.
    """

    def __init__(self, base_dir = '.', gpu_id=-1):
        """
        :param str base_dir: directory containing ``scripts/elmo_files`` and ``model_ckpt``
        :param int gpu_id: cuda device id (optional); default - cpu
        """
        self.base_dir = base_dir
        device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() and gpu_id != -1 else 'cpu')

        # Model Configs
        options_file = os.path.join(base_dir, "./scripts/elmo_files/elmo_2x4096_512_2048cnn_2xhighway_options.json")
        weight_file = os.path.join(base_dir, "./scripts/elmo_files/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5")

        model_ckpt = os.path.join(base_dir, "./model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth")
        file_name = model_ckpt.split('/')[-1]

        # Hyper-parameters are encoded positionally in the checkpoint name:
        # fields 1-3 are the attention settings, the last eight fields are the
        # flags / weight / dropout / activation / binomial switch.
        tokens = file_name.split("_")
        eventatt = tokens[1]
        duratt = tokens[2]
        relatt = tokens[3]
        concat_fine_to_dur = str2bool(tokens[-8])
        concat_dur_to_fine = str2bool(tokens[-7])
        fine_2_dur = str2bool(tokens[-6])
        dur_2_fine = str2bool(tokens[-5])
        weight = float(tokens[-4])
        drop = float(tokens[-3])
        activ = tokens[-2]
        # last field still carries the ".pth" extension
        bino_bool = str2bool(tokens[-1].split(".")[0])

        print("Eventatt: {}, Duratt: {}, Relatt: {}, Dropout: {}, Activation: {}, Binomial: {}, "
              "concat_fine2dur: {}, concat_dur2fine:{}, fine_to_dur: {}, dur_to_fine: {} \n"
              .format(eventatt, duratt, relatt, drop, activ, bino_bool,
                      concat_fine_to_dur, concat_dur_to_fine, fine_2_dur, dur_2_fine))

        self.batch_size = 1
        self.num_workers = 1

        # Model
        self.model = TemporalModel(embedding_size=1024, duration_distr=bino_bool,
                                   mlp_dropout=drop, mlp_activation=activ, tune_embed_size=256, event_attention=eventatt,
                                   dur_attention=duratt, rel_attention=relatt, concat_fine_to_dur=concat_fine_to_dur,
                                   concat_dur_to_fine=concat_dur_to_fine, fine_to_dur=fine_2_dur, dur_to_fine=dur_2_fine,
                                   fine_squash=True, baseline=False, dur_MLP_sizes=[128], fine_MLP_sizes=[128],
                                   dur_output_size=11, fine_output_size=4, device=device)

        self.model.to(device)

        # Load model weights
        checkpoint = torch.load(model_ckpt, map_location=device)
        self.model.load_state_dict(checkpoint)
        # The ELMo embedder is attached only after the state dict is loaded.
        # NOTE(review): it is also created after model.to(device), so it is not
        # moved by that call - confirm TemporalModel handles its placement.
        self.model.elmo_class = Elmo(options_file, weight_file, num_output_representations=3)
        # NOTE(review): model.eval() is never called here; presumably
        # compute_predictions switches to eval mode - verify.

    def pred(self, events):
        """
        Model inference for ELMo baseline, given Events JSON

        :param list[dict] events: list of sentences and extracted event-triggers (within dict)
        :return: records with keys ``duration`` (label), ``pred_text``,
                 ``pred_idx`` and ``sentence``; empty list when the input
                 contains no event trigger
        :rtype: list[dict]
        """
        # Dataloader
        test_dataset = TempEveDataset(events)

        # No trigger at all: skip inference instead of selecting columns
        # from an empty DataFrame below.
        if len(test_dataset) == 0:
            return []

        test_loader = DataLoader(test_dataset, self.batch_size, num_workers=self.num_workers, drop_last=False)

        # Inference
        outputs = compute_predictions(self.model, test_loader)

        # DataFrame
        df_out = pd.DataFrame(outputs)
        df_out = df_out[['p1_dur', 'root_text', 'root_idx', 'sentence']]

        df_out.rename(columns={'p1_dur': 'duration',
                               'root_text': 'pred_text',
                               'root_idx': 'pred_idx'}, inplace=True)

        # Map duration index to label
        df_out['duration'] = df_out['duration'].apply(lambda idx: idx2label[idx])

        # Round-trip through JSON so the result holds plain Python types
        json_str = df_out.to_json(orient='records')

        # Parse json string to List[dict]
        json_obj = json.loads(json_str)
        return json_obj
| 121 | print(result) 122 | -------------------------------------------------------------------------------- /component/Duration/input_data/sample_document.txt: -------------------------------------------------------------------------------- 1 | Before the arrival of Keep, which Google launched this week, there was no default note-taking app for Android. 2 | It was a glaring hole, considering that Apple's iPhone has built-in Notes and Reminders apps that can be powered by Siri. 3 | Instead of settling for a bare bones app to fill the void, the search giant took things one step further. 4 | Keep isn't simply just a place to bank whatever random half-thoughts come to mind: Users can construct to-do lists, stash photos, and color code your notes -- all in one well-designed and easy-to-use interface. 5 | The second you log anything into your phone, it is also accessible from a PC Web browser via Google Drive. 6 | Alternatively, you can save things while working on your computer, and it will instantly appear on your phone, ready for use while on the go. 7 | The design may not be as progressive as the to-do app Clear, but Keep makes up for that in its simplicity and efficiency. 8 | Everything in Keep is presented like a Microsoft (MSFT, Fortune 500) Windows Phone-esque stream of tiles. 9 | Swiping left or right will archive those notes you no longer need (but don't want to erase entirely). 10 | At the top of the app is a text entry field that serves as your main point of entry for all new notes. 11 | And when viewing any specific note, tapping any part of that note (title, body, etc.) 12 | will allow you to edit it. 13 | The entire experience is frictionless. 14 | That said, it's not going to conquer the world quite yet. 15 | Organization options are limited -- color coding is your only choice, and you can't re-order your notes. 16 | Sharing with others is mostly limited to email and Google+, and the desktop features are pretty bare bones. 
17 | But that's more a function of it being new, rather than poorly thought out. 18 | Like most things Google, expect the company to flesh out Keep over time and really turn it into our personal internet junk drawer. 19 | It's easy to foresee the day the when users will be able to send anything from their Web browser or Maps directly to Keep. 20 | The prospect of Keep incorporating features of services such as Pinterest or Pocket, or even making it easy to catalog streaming media, could turn it into something big. 21 | That should scare Evernote. 22 | Keep is not the reinvention of the wheel in any aspect -- there are a plethora of third-party apps already available for Android. 23 | But it is a well-exectuted refinement. 24 | In filling a minor, but important gap in its mobile ecosystem, Google gives the competition one less claim of superiority over Android. -------------------------------------------------------------------------------- /component/Duration/input_data/sample_fig2.txt: -------------------------------------------------------------------------------- 1 | What to feed my dog after gastroenteritis? My dog has been sick for about 3 days now. -------------------------------------------------------------------------------- /component/Duration/input_data_conllu/sample_fig2.txt.output: -------------------------------------------------------------------------------- 1 | 1 What what _ WP _ 3 dep _ _ 2 | 2 to to _ TO _ 3 mark _ _ 3 | 3 feed feed _ VB _ 0 root _ _ 4 | 4 my my _ PRP$ _ 5 nmod:poss _ _ 5 | 5 dog dog _ NN _ 3 dobj _ _ 6 | 6 after after _ IN _ 7 case _ _ 7 | 7 gastroenteritis gastroenteritis _ NN _ 3 nmod _ _ 8 | 8 ? ? _ . 
_ 3 punct _ _ 9 | 10 | 1 My my _ PRP$ _ 2 nmod:poss _ _ 11 | 2 dog dog _ NN _ 5 nsubj _ _ 12 | 3 has have _ VBZ _ 5 aux _ _ 13 | 4 been be _ VBN _ 5 cop _ _ 14 | 5 sick sick _ JJ _ 0 root _ _ 15 | 6 for for _ IN _ 9 case _ _ 16 | 7 about about _ IN _ 9 case _ _ 17 | 8 3 3 _ CD _ 9 nummod _ _ 18 | 9 days day _ NNS _ 5 nmod _ _ 19 | 10 now now _ RB _ 5 advmod _ _ 20 | 11 . . _ . _ 5 punct _ _ 21 | 22 | -------------------------------------------------------------------------------- /component/Duration/model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/component/Duration/model_ckpt/model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth -------------------------------------------------------------------------------- /component/Duration/predictions/.~lock.sample_document.txt.output_timeline.csv#: -------------------------------------------------------------------------------- 1 | ,sidvash,lambda-quad,25.03.2019 21:49,file:///home/sidvash/.config/libreoffice/4; -------------------------------------------------------------------------------- /component/Duration/predictions/README_predictions.txt: -------------------------------------------------------------------------------- 1 | ## Data dictionary for sample_document.txt.output_predictions.csv: 2 | 3 | 1. Each row corresponds to an event-pair in a sentence. 4 | 5 | 2. sent_pred_id1: filename sent_id pred_position 6 | For eg (row 1): sample_document.txt.output 1 4 denotes predicate at the 4th position (index starting at 0) in the 1st sentence of 'sample_document.txt.output' file. 7 | 8 | 3. sent_pred_id2: same as above 9 | For eg (row 1): sample_document.txt.output 1 8 denotes predicate at the 8th position (index starting at 0) in the 1st sentence of 'sample_document.txt.output' file. 
10 | 11 | Note that there are two sentence ids because the full sentence is the concatenation of the sentence in the sent_pred_id1 and the next adjacent sentence in the document. 12 | 13 | Examples: 14 | The 1st row in the csv file has: 15 | sent_pred_id1: sample_document.txt.output 1 4 16 | sent_pred_id2: sample_document.txt.output 1 8 17 | 18 | which denotes that both the predicates in the predicate-pair are being considered from the 1st sentence and are at 4th and 8th position. 19 | 20 | The 4th row in the csv file has: 21 | sent_pred_id1: sample_document.txt.output 1 13 22 | sent_pred_id2: sample_document.txt.output 2 4 23 | 24 | which denotes that the first predicate is at the 13th position in the 1st sentence and the second predicate is at the 4th position in the 2nd sentence in the document. 25 | 26 | 4. B1: beginning point of the first predicate 27 | 28 | 5. E1: end point of the first predicate 29 | 30 | 6. B2: beginning point of the second predicate 31 | 32 | 7. E2: end point of the second predicate 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /component/Duration/predictions/sample_document.txt.output_timeline.csv: -------------------------------------------------------------------------------- 1 | start_pt,duration,sent_pred_id,pred_text 2 | 0.22953895,0.23559411,sample_document.txt.output 1 13,was 3 | 0.1616388,0.90549994,sample_document.txt.output 1 4,Keep 4 | 0.287634,0.7549082,sample_document.txt.output 1 8,launched 5 | 1.072766,0.103116445,sample_document.txt.output 10 12,serves 6 | 1.0222178,0.11418431,sample_document.txt.output 10 6,is 7 | 1.06561,0.074647896,sample_document.txt.output 11 2,viewing 8 | 1.1178255,0.07104552,sample_document.txt.output 11 21,allow 9 | 1.1504356,0.050872874,sample_document.txt.output 11 24,edit 10 | 1.0895916,0.07437399,sample_document.txt.output 11 7,tapping 11 | 0.0,1.3020271,sample_document.txt.output 12 4,is frictionless 12 | 
0.49332798,0.033563256,sample_document.txt.output 13 1,said 13 | 0.48924226,0.23515902,sample_document.txt.output 13 6,going 14 | 0.32785034,1.1066597,sample_document.txt.output 13 8,conquer 15 | 0.49388248,0.10346455,sample_document.txt.output 14 10,is only choice 16 | 0.57775855,0.0680006,sample_document.txt.output 14 16,re-order 17 | 0.29515114,0.36924908,sample_document.txt.output 14 3,limited 18 | 0.31241438,0.39484847,sample_document.txt.output 15 10,+ 19 | 0.41692692,0.27291024,sample_document.txt.output 15 19,are pretty bare bones 20 | 0.12191195,1.6530606,sample_document.txt.output 15 5,limited 21 | 0.3450333,0.27553135,sample_document.txt.output 15 7,email 22 | 0.4190132,0.33701715,sample_document.txt.output 16 14,thought 23 | 0.33446252,0.4187518,sample_document.txt.output 16 5,'s more a function of 24 | 0.33700866,0.42752492,sample_document.txt.output 16 9,being new 25 | 0.68224037,0.13986059,sample_document.txt.output 17 16,turn 26 | 0.47907346,0.23535627,sample_document.txt.output 17 5,expect 27 | 0.6549459,0.18746355,sample_document.txt.output 18 12,be able 28 | 0.7119201,0.13109462,sample_document.txt.output 18 14,send 29 | 0.5940464,0.18792449,sample_document.txt.output 18 2,'s easy 30 | 0.72883064,0.2629165,sample_document.txt.output 18 24,Keep 31 | 0.5958486,0.21551259,sample_document.txt.output 18 4,foresee 32 | 0.6627697,0.3056006,sample_document.txt.output 19 16,making 33 | 0.67700034,0.33953798,sample_document.txt.output 19 18,easy 34 | 0.8993714,0.21139722,sample_document.txt.output 19 25,turn 35 | 0.60749745,0.30814624,sample_document.txt.output 19 3,Keep 36 | 0.61776954,0.30698663,sample_document.txt.output 19 4,incorporating 37 | 0.23025304,0.4327411,sample_document.txt.output 2 11,has 38 | 0.2912727,0.39179614,sample_document.txt.output 2 20,powered 39 | 0.20970988,0.35870302,sample_document.txt.output 2 4,was a glaring hole 40 | 0.33967128,0.18600482,sample_document.txt.output 2 6,considering 41 | 
0.9585166,0.3046474,sample_document.txt.output 20 2,scare 42 | 1.0286418,1.1654844,sample_document.txt.output 21 13,are 43 | 1.0180484,0.9873926,sample_document.txt.output 21 4,is not the reinvention of 44 | 1.5583814,0.70114505,sample_document.txt.output 22 5,is a well-exectuted refinement 45 | 1.7318684,0.52950096,sample_document.txt.output 23 1,filling 46 | 1.9559892,0.44548014,sample_document.txt.output 23 14,gives 47 | 0.5044588,0.21667121,sample_document.txt.output 3 16,took 48 | 0.35501373,0.18694507,sample_document.txt.output 3 2,settling 49 | 0.42702043,0.20121588,sample_document.txt.output 3 9,fill 50 | 0.54812557,0.27332816,sample_document.txt.output 4 18,construct 51 | 0.6833602,0.16512632,sample_document.txt.output 4 27,code 52 | 0.5855998,0.3608075,sample_document.txt.output 4 34,well-designed 53 | 0.73807853,0.27333945,sample_document.txt.output 4 37,interface 54 | 0.43750316,0.43180317,sample_document.txt.output 4 6,is n't simply just a... 55 | 0.6779917,0.23474711,sample_document.txt.output 5 12,is also accessible from 56 | 0.75836116,0.044174373,sample_document.txt.output 5 3,log 57 | 0.84418917,0.14313164,sample_document.txt.output 6 16,appear 58 | 0.8906975,0.15871912,sample_document.txt.output 6 27,go 59 | 0.6935278,0.20407178,sample_document.txt.output 6 4,save 60 | 0.71397096,0.23956096,sample_document.txt.output 6 7,working 61 | 0.81361145,0.19291249,sample_document.txt.output 7 14,Keep 62 | 0.9003202,0.19139186,sample_document.txt.output 7 15,makes 63 | 0.5575306,0.497628,sample_document.txt.output 7 6,be as progressive as 64 | 0.70687485,0.3683506,sample_document.txt.output 8 2,Keep 65 | 0.7990323,0.26095173,sample_document.txt.output 8 4,presented 66 | 0.96460164,0.116692804,sample_document.txt.output 9 0,Swiping 67 | -------------------------------------------------------------------------------- /component/Duration/predictions_new/sample_document.txt.output_timeline.csv: 
-------------------------------------------------------------------------------- 1 | start_pt,duration,sent_pred_id,pred_text 2 | 0.1659592,0.3867714,sample_document.txt.output 1 13,was 3 | 0.057858784,0.38209978,sample_document.txt.output 1 4,Keep 4 | 0.11140874,0.22814499,sample_document.txt.output 1 8,launched 5 | 0.58783954,0.24757639,sample_document.txt.output 10 12,serves 6 | 0.4584901,0.28038725,sample_document.txt.output 10 6,is 7 | 0.56658405,0.18215424,sample_document.txt.output 11 2,viewing 8 | 0.692741,0.1787399,sample_document.txt.output 11 21,allow 9 | 0.75968033,0.14175409,sample_document.txt.output 11 24,edit 10 | 0.61380315,0.18479265,sample_document.txt.output 11 7,tapping 11 | 0.5091141,0.57216734,sample_document.txt.output 12 4,is frictionless 12 | 0.7279427,0.09699013,sample_document.txt.output 13 1,said 13 | 0.71798456,0.3629794,sample_document.txt.output 13 6,going 14 | 0.6839229,0.6700831,sample_document.txt.output 13 8,conquer 15 | 0.7431355,0.25156957,sample_document.txt.output 14 10,is only choice 16 | 0.94288635,0.16647394,sample_document.txt.output 14 16,re-order 17 | 0.40678498,0.6119435,sample_document.txt.output 14 3,limited 18 | 0.6178697,0.13728026,sample_document.txt.output 15 10,+ 19 | 0.47185704,0.5922663,sample_document.txt.output 15 19,are pretty bare bones 20 | 0.38864532,0.66203797,sample_document.txt.output 15 5,limited 21 | 0.49253073,0.33022714,sample_document.txt.output 15 7,email 22 | 0.608893,0.28437492,sample_document.txt.output 16 14,thought 23 | 0.44145197,0.5678541,sample_document.txt.output 16 5,'s more a function of 24 | 0.5642646,0.3093289,sample_document.txt.output 16 9,being new 25 | 0.89284486,0.2879107,sample_document.txt.output 17 16,turn 26 | 0.63449514,0.3744282,sample_document.txt.output 17 5,expect 27 | 0.91353405,0.3282676,sample_document.txt.output 18 12,be able 28 | 1.0137923,0.22846185,sample_document.txt.output 18 14,send 29 | 0.82106173,0.3076777,sample_document.txt.output 18 2,'s easy 30 | 
1.0391245,0.4751686,sample_document.txt.output 18 24,Keep 31 | 0.8270937,0.35527292,sample_document.txt.output 18 4,foresee 32 | 0.93579674,0.48665285,sample_document.txt.output 19 16,making 33 | 0.97048026,0.5357981,sample_document.txt.output 19 18,easy 34 | 1.2939894,0.32960615,sample_document.txt.output 19 25,turn 35 | 0.84815556,0.50060415,sample_document.txt.output 19 3,Keep 36 | 0.86849916,0.49890476,sample_document.txt.output 19 4,incorporating 37 | 0.1884663,0.6893078,sample_document.txt.output 2 11,has 38 | 0.34253854,0.5686484,sample_document.txt.output 2 20,powered 39 | 0.15306628,0.5725143,sample_document.txt.output 2 4,was a glaring hole 40 | 0.35936135,0.28860942,sample_document.txt.output 2 6,considering 41 | 1.3718944,0.5206325,sample_document.txt.output 20 2,scare 42 | 1.3775618,0.86497575,sample_document.txt.output 21 13,are 43 | 1.3708651,0.76067257,sample_document.txt.output 21 4,is not the reinvention of 44 | 1.6471236,0.64427495,sample_document.txt.output 22 5,is a well-exectuted refinement 45 | 1.806507,0.48694935,sample_document.txt.output 23 1,filling 46 | 2.0096083,0.41171712,sample_document.txt.output 23 14,gives 47 | 0.65977967,0.36332253,sample_document.txt.output 3 16,took 48 | 0.40279114,0.32262275,sample_document.txt.output 3 2,settling 49 | 0.5145695,0.34815133,sample_document.txt.output 3 9,fill 50 | 0.72906995,0.5002725,sample_document.txt.output 4 18,construct 51 | 0.9556468,0.3318769,sample_document.txt.output 4 27,code 52 | 0.8607931,0.5956468,sample_document.txt.output 4 34,well-designed 53 | 1.0808307,0.48955455,sample_document.txt.output 4 37,interface 54 | 0.6597389,0.611886,sample_document.txt.output 4 6,is n't simply just a... 
55 | 0.18994676,0.42184663,sample_document.txt.output 5 12,is also accessible from 56 | 1.1183183,0.22149895,sample_document.txt.output 5 3,log 57 | 0.47176263,0.2754843,sample_document.txt.output 6 16,appear 58 | 0.5010931,0.37027907,sample_document.txt.output 6 27,go 59 | 0.21596201,0.36138824,sample_document.txt.output 6 4,save 60 | 0.2529694,0.42897764,sample_document.txt.output 6 7,working 61 | 0.42659408,0.32614288,sample_document.txt.output 7 14,Keep 62 | 0.56993645,0.32825184,sample_document.txt.output 7 15,makes 63 | 0.0,0.83541936,sample_document.txt.output 7 6,be as progressive as 64 | 0.18418065,0.37936413,sample_document.txt.output 8 2,Keep 65 | 0.2485228,0.35295543,sample_document.txt.output 8 4,presented 66 | 0.3916087,0.23731099,sample_document.txt.output 9 0,Swiping 67 | -------------------------------------------------------------------------------- /component/Duration/predictions_new/sample_fig2.txt.output_predictions.csv: -------------------------------------------------------------------------------- 1 | sent_pred_id1,sent_pred_id2,b1,e1,b2,e2,pred1_duration,pred2_duration,pred1_text,pred2_text,pred1_dict_idx,pred2_dict_idx 2 | sample_fig2.txt.output 1 2,sample_fig2.txt.output 2 4,0.0,0.0,0.0,0.0,0,0,feed,been sick for about now,0,1 3 | -------------------------------------------------------------------------------- /component/Duration/predictions_new/sample_fig2.txt.output_timeline.csv: -------------------------------------------------------------------------------- 1 | start_pt,duration,sent_pred_id,pred_text 2 | 0.0,1.3665516e-05,sample_fig2.txt.output 1 2,feed 3 | 0.0776875,0.0031946814,sample_fig2.txt.output 2 4,been sick for about now 4 | -------------------------------------------------------------------------------- /component/Duration/preprocess.py: -------------------------------------------------------------------------------- 1 | from predpatt import PredPatt 2 | import json 3 | from torch.utils.data import Dataset, DataLoader 4 | 
''' Json input format: 5 | [ 6 | { 7 | "tokens": ["word_0", "word_1", ...], 8 | "events": [ 9 | { 10 | "event_type": "Movement:Transport", 11 | "triggers": [{ 12 | "event_type": "Movement:Transport", 13 | "text": "deploy", 14 | "start_token": 5, 15 | "end_token": 5 16 | }], 17 | ... 18 | }, 19 | ... 20 | ], 21 | "ner": [[]] 22 | }, 23 | ... 24 | ] 25 | ''' 26 | 27 | 28 | def predicate_info(predicate): 29 | ''' 30 | Author: sidvash 31 | 32 | Input: predicate object 33 | Output: pred_text, token, root_token 34 | 35 | Note: If predicate is copular: pred_text is only upto first 5 words 36 | ''' 37 | copula_bool = False 38 | 39 | #Extend predicate to start from the copula 40 | if predicate.root.tag not in ["VERB", "AUX"]: 41 | all_pred = predicate.tokens 42 | gov_rels = [tok.gov_rel for tok in all_pred] 43 | if 'cop' in gov_rels: 44 | copula_bool = True 45 | cop_pos = gov_rels.index('cop') 46 | pred = [x.text for x in all_pred[cop_pos:]] 47 | pred_token = [x.position for x in all_pred[cop_pos:]] 48 | def_pred_token = predicate.root.position #needed for it_happen set 49 | cop_bool = True 50 | #print(predicate, idx) 51 | 52 | elif predicate.root.tag == "ADJ": 53 | pred_token = [predicate.root.position] 54 | pred = [predicate.root.text] 55 | def_pred_token = predicate.root.position 56 | else: ## Different from protocol as we are considering all predicates 57 | pred_token = [predicate.root.position] 58 | pred = [predicate.root.text] 59 | def_pred_token = predicate.root.position 60 | 61 | #Else keep the root 62 | else: 63 | pred_token = [predicate.root.position] 64 | pred = [predicate.root.text] 65 | def_pred_token = predicate.root.position 66 | 67 | #Stringify pred and pred_tokens: 68 | #pred_token = "_".join(map(str, pred_token)) 69 | 70 | if len(pred)>5: 71 | pred = pred[:5] 72 | pred = " ".join(pred) + "..." 
class TempEveDataset(Dataset):
    '''
    Dataset of event triggers coming out of the event-extraction pipeline.

    Every trigger of every event of every input sentence becomes one sample
    made of: the sentence tokens, the trigger text, the list of token indices
    covered by the trigger, the first whitespace-separated word of the trigger
    text ("root") and the index of the trigger's first token.
    '''

    def __init__(self, json_filename, from_UDST_dataset = False, from_pipeline = True):
        '''
        Inputs:
            1. json_filename: path (str) of a JSON file in the pipeline format,
               or the already-loaded list of sentence dicts
               (only the "tokens" and "events" keys are read)
            2. from_UDST_dataset: placeholder; nothing is loaded when only this
               flag is set
            3. from_pipeline: load the samples from the pipeline JSON format
        '''
        self.sentences_wordlist = []    # list of string list
        self.spans = []                 # list of string
        self.spans_idx = []             # list of int list
        self.roots = []                 # list of string
        self.roots_idx = []             # list of int

        if from_pipeline:
            if isinstance(json_filename, str):
                # Context manager so that the file handle is always closed
                with open(json_filename) as json_file:
                    json_objs = json.load(json_file)
            else:
                json_objs = json_filename

            print("json file size:", len(json_objs))

            for obj in json_objs:
                # Sentences without detected events contribute no sample.
                # NOTE: a PredPatt-based fallback for such sentences used to be
                # sketched here as commented-out code; it was never enabled.
                for event in obj['events']:
                    for trigger in event['triggers']:
                        start_token = int(trigger['start_token'])
                        end_token = int(trigger['end_token'])
                        trigger_text = trigger['text']

                        self.sentences_wordlist.append(obj['tokens'])
                        self.spans.append(trigger_text)
                        # seems like one-word span only but just to make sure
                        self.spans_idx.append(list(range(start_token, end_token + 1)))
                        # seems like one-word only but just to make sure
                        self.roots.append(trigger_text.split()[0])
                        self.roots_idx.append(start_token)

        elif from_UDST_dataset:
            pass

    def __len__(self):
        '''Number of trigger samples.'''
        return len(self.sentences_wordlist)

    def __getitem__(self, index):
        '''Return the sample at position `index` as a dict.'''
        return {
            "words_list": self.sentences_wordlist[index],
            "span_text": self.spans[index],
            "root_text": self.roots[index],
            "span_idx_list": self.spans_idx[index],
            "root_idx": self.roots_idx[index]
        }
"\"","?", ] 163 | # print('bad:\t', ' '.join(test_word_list_bad)) 164 | # test_word_list = ["Why", "do", "we", "have", "to", "learn", "it", "from", "\"", "Newsweek", "\"", "?"] 165 | """ 166 | odd "\n"" 167 | error: "KeyError: 1" 168 | print("inside JPyoeBackend!! indices_to_words[index]:", indices_to_words[index]) 169 | print(len(indices_to_words)) # 0 170 | print(index) # 1 171 | """ 172 | # test_word_list = ["Why", "do", "we", "have", "to", "learn", "it", "from", "Newsweek", "?"] 173 | # test_word_list = ["And", "so", "I", "would", "like", "you", "to", "take", "a", "look", "at", "the", "CNN/\"USA", "TODAY\"", "\"", "Gallup", "poll", ",", "taken", "last", "week", ",", "should", "U.S.", "troops", "to", "go", "to", "Iraq", "to", "remove", "Saddam", "Hussein", "from", "power", "."] 174 | # sentence = " ".join(test_word_list) 175 | # print(sentence) 176 | # pp_obj = PredPatt.from_sentence(sentence) 177 | # for predicates in pp_obj.instances: 178 | # span, span_idx_list, root_idx = predicate_info(predicates) 179 | # print(span, span_idx_list, root_idx) 180 | # print([token.text for token in pp_obj.tokens]) 181 | # print(' '.join([pp_obj.tokens[i].text for i in span_idx_list])) 182 | # print(pp_obj.tokens[root_idx].text) 183 | 184 | dataset = TempEveDataset("mu_dev_out.json", False, True) 185 | print("dataset size:", len(dataset)) 186 | print("data sample:", dataset[0]) 187 | 188 | dataloader = DataLoader(dataset, batch_size=4) 189 | 190 | dataloader = iter(dataloader) 191 | batch = next(dataloader) 192 | 193 | print(batch) 194 | -------------------------------------------------------------------------------- /component/Duration/readme_eval.txt: -------------------------------------------------------------------------------- 1 | #### Steps to create a document timeline for an input document ### 2 | 3 | 1. Put all the input document files into the "input_data" folder. Note that each document file should have sentences separated by a "\n". 
A "sample_document.txt" file is already present as a reference for an input document file. 4 | 5 | 2. From the terminal, change the current directory to be the "scripts" folder and run the following command: 6 | bash run_input_data.sh 7 | 8 | 3. The predictions of all the input document files will be written to the predictions folder: 9 | - [input_doc_filename]_timeline.csv (contains the document timeline) 10 | - [input_doc_filename]_predictions.csv (contains the relative timelines and predicate durations) 11 | 12 | 13 | The mappings for durations are as follows: 14 | 0-inst 15 | 1-secs 16 | 2-mins 17 | 3-hrs 18 | 4-days 19 | 5-weeks 20 | 6-mnths 21 | 7-yrs 22 | 8-decs 23 | 9-cents 24 | 10-forever 25 | 26 | 27 | For a detailed description of the protocols, datasets, as well as models of these data, please see the following paper: 28 | Vashishtha, S., B. Van Durme, & A.S. White. 2019. Fine-Grained Temporal Relation Extraction. arXiv:1902.01390 [cs.CL]. (https://arxiv.org/abs/1902.01390) -------------------------------------------------------------------------------- /component/Duration/requirements.txt: -------------------------------------------------------------------------------- 1 | allennlp==1.0.0 2 | matplotlib==3.1.1 3 | nltk==3.4.5 4 | torch==1.5.0 5 | tqdm==4.45.0 6 | # predpatt==1.0 7 | numpy==1.17.1 8 | pandas==0.25.1 9 | scikit_learn==0.23.1 10 | -------------------------------------------------------------------------------- /component/Duration/run_jupyter.sh: -------------------------------------------------------------------------------- 1 | #! 
#!/bin/bash
# Convert one document of ../input_data to CoNLL-U with Stanford CoreNLP and
# run the timeline model on the converted file.
#
# Usage (from the "scripts" folder):
#   bash run_document_timeline.bash <document_filename>

if [ -z "$1" ]; then
    echo "usage: bash run_document_timeline.bash <document_filename>" >&2
    exit 1
fi

#base_dir=$(pwd)
base_dir=$(cd ../ && pwd)
# Stop here if CoreNLP is missing instead of running java/mv in the wrong folder
cd ../stanford-corenlp-full-2018-10-05 || exit 1

#docname="sample_document.txt"
# NOTE(review): the literal "[" and "]" are passed to java as arguments; they look
# copied from the CoreNLP usage text. Kept unchanged - confirm they are intended.
java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP [ -props sampleProps.properties ] -file "../input_data/$1" -outputFormat conllu
mv *.output "$base_dir/input_data_conllu/"

cd ../scripts || exit 1
python run_model.py -doc "../input_data_conllu/$1.output" -gpu 0 -out ../predictions
def main():
    '''
    Command-line entry point: build relative and document-level timelines
    for one CoNLL-U document.

    Steps:
        1. parse the document (-doc) and extract predicate pairs
        2. load the temporal model checkpoint and predict durations and
           fine-grained relations for every pair
        3. fit a TimelineModel on these predictions
        4. write <doc>_timeline.csv and <doc>_predictions.csv into -out

    All helpers (get_structs, extract_dataframe, TemporalModel, ElmoEmbedder,
    options_file, weight_file, torch, np, pd, ...) are expected to come from
    `from scripts.utils import *` at the top of this file.
    '''
    import os  # local import: only needed here for path handling

    parser = argparse.ArgumentParser()
    parser.add_argument("-doc", "--docpath",
                        help="Path of the document file",
                        type=str,
                        default="")

    parser.add_argument("-gpu", "--gpunumber",
                        help="Which gpu to use",
                        type=int,
                        default=1)

    parser.add_argument("-out", "--outpath",
                        help="Path of the output folder",
                        type=str,
                        default="")

    args = parser.parse_args()

    ## Dependency Graph object
    filename = os.path.basename(args.docpath)
    structures = get_structs(args.docpath)
    print("\n########### Parsing Conllu through PredPatt ###########")

    ## Sentences
    struct_dict = extract_struct_dicts(structures)

    ## A dataframe after processing the file through PredPatt and extracting
    ## roots and spans of each predicate.
    df = extract_dataframe(args.docpath, structures)

    ## Correct pred2_tokens as per the concatenated sentence
    df['pred2_token_mod'] = df.apply(lambda row: correct_pred2_tokens(row, struct_dict), axis=1)
    df['pred2_root_token_mod'] = df.apply(lambda row: correct_pred2_root(row, struct_dict), axis=1)
    # Convert tokens into list of numbers
    df['pred1_token_span'] = df['pred1_token'].map(lambda x: [int(y) for y in x.split("_")])
    df['pred2_token_span'] = df['pred2_token_mod'].map(lambda x: [int(y) for y in x.split("_")])

    ## Extract X for model predictions
    X = extract_X(df)

    ## Load the best model
    cuda_device_num = args.gpunumber
    cuda_device_str = "cuda:" + str(cuda_device_num)
    # NOTE(review): checkpoint folder is hard-coded relative to the working directory - confirm
    model_path = "../model/"
    file_path = "model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth"

    # The model configuration is encoded in the checkpoint file name
    tokens = file_path.split("_")
    eventatt = tokens[1]
    duratt = tokens[2]
    relatt = tokens[3]
    concat_fine_to_dur = str2bool(tokens[-8])
    concat_dur_to_fine = str2bool(tokens[-7])
    fine_2_dur = str2bool(tokens[-6])
    dur_2_fine = str2bool(tokens[-5])
    weight = float(tokens[-4])
    drop = float(tokens[-3])
    activ = tokens[-2]
    bino_bool = str2bool(tokens[-1].split(".")[0])
    # coarse_size = int(tokens[-1].split(".")[0])
    print("\n########### Predicting Relative Timelines ###########")
    print("\nRelative Temporal Model configurations:")
    print(
        "Eventatt: {}, Duratt: {}, Relatt: {}, Dropout: {}, Activation: {}, Binomial: {}, concat_fine2dur: {}, concat_dur2fine:{}, fine_to_dur: {}, dur_to_fine: {} \n".format(
            eventatt,
            duratt,
            relatt,
            drop,
            activ,
            bino_bool,
            concat_fine_to_dur,
            concat_dur_to_fine,
            fine_2_dur,
            dur_2_fine))

    use_cuda = torch.cuda.is_available()
    device = torch.device(cuda_device_str if use_cuda else "cpu")
    # Keep the embedder on the same device as the model.
    # Assumes ElmoEmbedder follows the allennlp convention cuda_device=-1 -> CPU - TODO confirm
    elmo_cuda_device = cuda_device_num if use_cuda else -1

    best_model = TemporalModel(
        embedding_size=1024,
        duration_distr=bino_bool,
        elmo_class=ElmoEmbedder(options_file, weight_file, cuda_device=elmo_cuda_device),
        mlp_dropout=drop,
        mlp_activation=activ,
        tune_embed_size=256,
        event_attention=eventatt,
        dur_attention=duratt,
        rel_attention=relatt,
        concat_fine_to_dur=concat_fine_to_dur,
        concat_dur_to_fine=concat_dur_to_fine,
        fine_to_dur=fine_2_dur,
        dur_to_fine=dur_2_fine,
        fine_squash=True,
        baseline=False,
        dur_MLP_sizes=[128], fine_MLP_sizes=[128],
        dur_output_size=11, fine_output_size=4,
        device=device)

    # Map the checkpoint onto the device actually in use, so that loading
    # also works after the CPU fallback above
    best_model.load_state_dict(torch.load(model_path + file_path, map_location=device))
    best_model.to(device)

    p1_dur_yhat, p2_dur_yhat, fine_yhat, rel_yhat = predict_fine_dur_only(X, best_model)
    print("Relative timelines completed!!\n")
    ## Store predictions in the dataset
    df['pred1_duration'] = p1_dur_yhat.cpu().numpy()
    df['pred2_duration'] = p2_dur_yhat.cpu().numpy()
    # Columns of fine_yhat: b1, d1, b2, d2 (begin point and duration of each predicate)
    fine_arr = fine_yhat.cpu().numpy()
    df['b1'] = fine_arr[:, 0]
    df['e1'] = fine_arr[:, 0] + fine_arr[:, 1]
    df['b2'] = fine_arr[:, 2]
    df['e2'] = fine_arr[:, 2] + fine_arr[:, 3]
    df['sent_pred_id1'] = df['sentence_id_1'] + " " + df['pred1_root_token'].map(lambda x: str(x))
    df['sent_pred_id2'] = df['sentence_id_2'] + " " + df['pred2_root_token'].map(lambda x: str(x))

    ## Document Timelines
    pred_dict, num_preds, local_data = extract_preds(df)

    ## Run Timeline Model on current docid's data
    model = TimelineModel(data=local_data,
                          num_preds=num_preds,
                          device=torch.device(type="cpu"))

    print("########### Creating document timelines ###########")
    pred_b1, pred_e1, pred_b2, pred_e2, pred_timeline = model.fit(local_data, epochs=5000)

    preds_arr = local_data[['sent_pred_id1', 'sent_pred_id2']].values
    uniq_preds = np.unique(preds_arr.flatten())
    # print(uniq_preds)

    preds_text = extract_pred_text(uniq_preds, local_data)

    ans_df = pd.DataFrame(data=pred_timeline,
                          columns=['start_pt', 'duration'])
    ans_df['sent_pred_id'] = uniq_preds
    ans_df['pred_text'] = preds_text

    ## Save prediction files
    # os.path.join keeps an empty -out relative to the working directory
    # (plain concatenation produced "/<file>" in that case)
    ans_df.to_csv(os.path.join(args.outpath, filename + "_timeline.csv"), index=False)
    local_data.to_csv(os.path.join(args.outpath, filename + "_predictions.csv"), index=False)

    print("\nOutput written to the predictions folder.")
Author: sidvash 3 | # Created: 10/28/2019 4 | # Last modified: 11/19/2019 5 | 6 | ''' 7 | The purpose of this code is to extract RoBERTa embeddings for a sentence whose gold tokens are known. 8 | 9 | Usage: 10 | from roberta_extract import aligned_roberta 11 | embeddings = aligned_roberta(sentence, tokens, roberta='large') 12 | 13 | where sentence is a string, and tokens are the tokens of the sentence. 14 | ''' 15 | from collections import Counter 16 | from typing import List 17 | 18 | import torch 19 | import fairseq #not importing this causes line 94 assertion to fail -- why? 20 | 21 | ##### Load Roberta model 22 | roberta_large = torch.hub.load('pytorch/fairseq', 'roberta.large') 23 | roberta_large.eval() 24 | print("Large Model loaded") 25 | 26 | roberta_base = torch.hub.load('pytorch/fairseq', 'roberta.base') 27 | roberta_base.eval() 28 | print("Base Model loaded") 29 | 30 | def aligned_roberta(sentence: str, 31 | tokens: List[str], 32 | roberta='large', 33 | return_all_hiddens=False, 34 | border_tokens=False): 35 | ''' 36 | Code inspired from: https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py 37 | 38 | Aligns roberta embeddings for an input tokenization of words for a sentence 39 | 40 | Inputs: 41 | 1. sentence: sentence in string 42 | 2. tokens: tokens of the sentence in which the alignment is to be done 43 | 3. roberta: 'large' or 'base' 44 | 4. 
def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`;
            the first entry must be id 0 (asserted below) and is not aligned
        other_tokens (List[str]): other tokens of shape `(T_words)`

    Returns:
        List[List[int]]: for every entry of *other_tokens*, the positions in
        *bpe_tokens* of the BPE tokens that cover it.

    Raises:
        AssertionError: if *bpe_tokens* is not 1-D, does not start with id 0,
            or does not spell the same text as *other_tokens* once whitespace
            is removed.
        Exception: if a word and the current BPE token cannot be matched.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0.  ##added after revision in alignment utils from fairseq (Feb11, 2020)

    def clean(text):
        return text.strip()

    # remove whitespaces to simplify alignment
    bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
    # '<s>' and '' are special symbols, not BPE ids: keep them as they are
    # instead of handing them to the BPE decoder
    bpe_tokens = [clean(roberta.bpe.decode(x) if x not in {'<s>', ''} else x) for x in bpe_tokens]
    other_tokens = [clean(str(o)) for o in other_tokens]

    # strip leading <s>
    bpe_tokens = bpe_tokens[1:]
    assert ''.join(bpe_tokens) == ''.join(other_tokens)

    # create alignment from every word to a list of BPE tokens
    alignment = []
    # start=1 so that indices refer to positions in the original bpe_tokens
    # (position 0 is the stripped leading symbol); empty strings are skipped
    bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1))
    j, bpe_tok = next(bpe_toks)
    for other_tok in other_tokens:
        bpe_indices = []
        while True:
            if other_tok.startswith(bpe_tok):
                # the BPE token is fully consumed by this word
                bpe_indices.append(j)
                other_tok = other_tok[len(bpe_tok):]
                try:
                    j, bpe_tok = next(bpe_toks)
                except StopIteration:
                    j, bpe_tok = None, None
            elif bpe_tok.startswith(other_tok):
                # other_tok spans multiple BPE tokens
                bpe_indices.append(j)
                bpe_tok = bpe_tok[len(other_tok):]
                other_tok = ''
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
            if other_tok == '':
                break
        assert len(bpe_indices) > 0
        alignment.append(bpe_indices)
    assert len(alignment) == len(other_tokens)

    return alignment
138 | """ 139 | assert features.dim() == 2 140 | 141 | bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices) 142 | assert bpe_counts[0] == 0 # shouldn't be aligned 143 | denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))]) 144 | weighted_features = features / denom.unsqueeze(-1) 145 | 146 | output = [weighted_features[0]] 147 | largest_j = -1 148 | for bpe_indices in alignment: 149 | output.append(weighted_features[bpe_indices].sum(dim=0)) 150 | largest_j = max(largest_j, *bpe_indices) 151 | for j in range(largest_j + 1, len(features)): 152 | output.append(weighted_features[j]) 153 | output = torch.stack(output) 154 | #assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4) 155 | return output 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /component/Duration/scripts/src/factslab/factslab/pytorch/transformer_regression.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.nn import CrossEntropyLoss, MSELoss, Linear, Dropout 4 | from transformers import AutoModel 5 | 6 | 7 | class TransformerRegressionModel(torch.nn.Module): 8 | 9 | def __init__(self, transformer, dropout, num_labels,cls_token=0, 10 | activation='relu'): 11 | ''' 12 | Setup the modules in the model - a transformer, followed by a GRU for 13 | the CLS hidden states/taking the mean of all tokens, followed by Linear 14 | layers that outputs one number, followed by softmax 15 | ''' 16 | super(TransformerRegressionModel, self).__init__() 17 | 18 | # Setup the transformer model 19 | self.transformer = AutoModel.from_pretrained(transformer) 20 | 21 | 22 | # For now CLS pooling is the only pooling supported 23 | self.tr_output_size = self.transformer.config.hidden_size 24 | self.num_labels = num_labels 25 | self.cls_token = cls_token 26 | 27 | # Setup the linear layers on top of the transformer 28 | 
self.dense = Linear(self.tr_output_size, self.tr_output_size) 29 | self.dropout = Dropout(p=dropout) 30 | self.classifier = Linear(self.tr_output_size, self.num_labels) 31 | self._activation = activation 32 | 33 | self._loss_fn = MSELoss() 34 | 35 | 36 | def _nonlinearity(self, x): 37 | '''Applies relu or tanh activation on tensor.''' 38 | 39 | if self._activation == 'relu': 40 | return torch.nn.functional.relu(x) 41 | elif self._activation == 'tanh': 42 | return torch.tanh(x) 43 | 44 | 45 | def forward(self, input_ids, input_mask, tokens, labels=None): 46 | ''' 47 | Runs forward pass on neural network 48 | 49 | Arguments: 50 | --------- 51 | input_ids: the tokenized, bert wordpiece IDs. (batch_size, MAX_LEN) 52 | input_masks: the masking to be done on input_ids due to padding. 53 | (batch_size, MAX_LEN) 54 | labels: target against which to computer the loss. DEFAULT: None 55 | max_seq_len: The length to which to pad the output of the rnn 56 | 57 | Returns: 58 | ------- 59 | 60 | Object of type Tuple of form (loss, logits) 61 | 62 | loss: Cross Entropy loss calculated in loss_fn which implements masking 63 | logits: logsoftmaxed probabilities of classifier output 64 | 65 | ''' 66 | 67 | 68 | # Forward pass through transformer 69 | # other values returned are pooler_output, hidden_states, and attentions 70 | outputs = self.transformer(input_ids, 71 | token_type_ids=None, 72 | attention_mask=input_mask) 73 | 74 | last_hidden_states = outputs[0] 75 | 76 | # Get the hidden states based on token indices 77 | token_hidden_states = torch.cat([h.index_select(0,tok) for \ 78 | h, tok in zip(last_hidden_states, tokens)]) 79 | 80 | 81 | # Then run it through linear layers 82 | x = self.dropout(token_hidden_states) 83 | x = self.dense(x) 84 | x = self._nonlinearity(x) 85 | x = self.dropout(x) 86 | logits = self.classifier(x) 87 | 88 | outputs = (logits,) 89 | 90 | if labels is not None: 91 | loss = self._loss_fn(logits, labels) 92 | outputs = (loss,) + outputs 93 | 94 | return 
class TimelineModel(torch.nn.Module):
    '''
    A class to extract a simple timeline model from a
    given document's predicate-pair data

    Every predicate of the document owns two learnable numbers (one row of
    `pred_tensor`); their squares are used as the start point and the
    duration of the predicate on the document timeline. `fit` tunes them so
    that the pairwise relations and duration classes derived from the
    timeline match the values stored in the predicate-pair dataframe.
    '''
    def __init__(self,
                 data = None,
                 num_preds = None,
                 mlp_activation='relu',
                 mlp_dropout=0.0,
                 optimizer_class = torch.optim.Adam,
                 dur_output_size = 11, fine_output_size = 4,
                 device=torch.device(type="cpu"),
                 **kwargs):
        '''
        Inputs:
            1. data: not used by the constructor
            2. num_preds: number of distinct predicates (rows of pred_tensor)
            3. mlp_activation, mlp_dropout: stored for the optional MLPs
            4. optimizer_class: optimizer class instantiated in `fit`
            5. dur_output_size: number of duration classes
            6. fine_output_size: not used by the constructor
            7. device: torch device holding the parameters and losses
        '''
        super().__init__()

        self.device = device
        self.linear_maps = nn.ModuleDict()
        self.mlp_activation = mlp_activation
        self.mlp_dropout = nn.Dropout(mlp_dropout)
        self.dur_output_size = dur_output_size

        ## Parameters
        # Hidden predicate representations
        self.pred_tensor = torch.nn.Parameter(torch.randn(num_preds,2).to(self.device), requires_grad=True)
        # Binomial parameter
        self.k = torch.nn.Parameter(torch.randn(1).to(self.device), requires_grad=True)

        self.params = nn.ParameterList()
        self.params.extend([self.pred_tensor, self.k])

        self._optimizer_class = optimizer_class

        ## Losses Initialization
        self.fine_loss = L1Loss().to(self.device)
        self.duration_loss = CrossEntropyLoss().to(self.device)

    def _init_MLP(self, input_size, hidden_sizes, output_size, param=None):
        '''
        Initialise MLP or regression parameters

        Registers under self.linear_maps[param] one Linear layer per entry of
        hidden_sizes followed by a final Linear layer of size output_size.
        '''
        self.linear_maps[param] = nn.ModuleList()

        for h in hidden_sizes:
            linmap = torch.nn.Linear(input_size, h)
            linmap = linmap.to(self.device)
            self.linear_maps[param].append(linmap)
            input_size = h

        linmap = torch.nn.Linear(input_size, output_size)
        linmap = linmap.to(self.device)
        self.linear_maps[param].append(linmap)

    def forward(self, local_data, **kwargs):
        '''
        Input: dataframe with cols:
                b1, e1, b2, e2, pred1_dict_idx, pred2_dict_idx
               (only pred1_dict_idx and pred2_dict_idx are read here)

        Output: tuple (b1, dur1, b2, dur2, pred1_dur, pred2_dur, anchored_tensor)
                - b1, dur1, b2, dur2: start point and duration of both
                  predicates of every pair
                - pred1_dur, pred2_dur: per-pair log-probabilities over the
                  duration classes
                - anchored_tensor: num_preds x 2 timeline (start, duration)
        '''
        # Squaring keeps start points and durations non-negative
        t_sq = self.pred_tensor**2
        num_preds = t_sq.size()[0]
        anchored_tensor = torch.zeros(num_preds,2).to(self.device)

        # Anchor the timeline so that the earliest predicate starts at 0
        anchored_tensor[:,0] = t_sq[:,0] - t_sq[:,0].min()
        anchored_tensor[:,1] = t_sq[:,1]

        #Predicted fine-grained values for the given document
        pred1_rows = anchored_tensor[local_data.pred1_dict_idx.values]
        pred2_rows = anchored_tensor[local_data.pred2_dict_idx.values]
        b1, dur1 = pred1_rows[:,0], pred1_rows[:,1]
        b2, dur2 = pred2_rows[:,0], pred2_rows[:,1]

        pred1_dur = self._binomial_dist(dur1)
        pred2_dur = self._binomial_dist(dur2)

        yhat = (b1, dur1, b2, dur2, pred1_dur, pred2_dur,
                anchored_tensor)

        return yhat

    def fit(self, local_data, epochs=5000, **kwargs):
        '''
        Optimise the timeline on the predicate pairs of one document.

        Training stops after `epochs` steps, or earlier when the loss changes
        by less than 1e-5 between two consecutive steps.

        Input: dataframe with cols b1, e1, b2, e2, pred1_duration,
               pred2_duration, pred1_dict_idx, pred2_dict_idx
        Output: same as `predict`, computed on the last forward pass
        Raises: ValueError if epochs < 1
        '''
        if epochs < 1:
            # Without a single step there is no forward pass to report
            raise ValueError("epochs must be a positive integer")

        losses = [10000]

        parameters = [p for p in self.parameters() if p.requires_grad]
        optimizer = self._optimizer_class(parameters)

        #Actual ground truth values
        b1_lst = local_data.b1.values
        e1_lst = local_data.e1.values
        b2_lst = local_data.b2.values
        e2_lst = local_data.e2.values
        durations = [local_data.pred1_duration.values,
                     local_data.pred2_duration.values]

        for epoch in tqdm(range(epochs)):
            preds = self(local_data)
            #zero_grad
            optimizer.zero_grad()
            curr_loss = self._custom_loss(preds,
                                          b1_lst,
                                          e1_lst,
                                          b2_lst,
                                          e2_lst,
                                          durations)

            curr_loss.backward()
            optimizer.step()

            if epoch==0:
                tqdm.write("Epoch: {}, Loss: {}".format(epoch+1, curr_loss))

            ## Stop training when loss converges
            if abs(curr_loss.detach() - losses[-1]) < 0.00001:
                break

            losses.append(curr_loss.detach())

        tqdm.write("Epoch: {}, Converging-Loss: {}".format(epoch+1, curr_loss))

        return self.predict(preds)

    def _normalized_endpoints(self, b1_pred, dur1_pred, b2_pred, dur2_pred):
        '''
        Input: start points and durations of both predicates of every pair
        Output: tensor num_pairs x 4 with columns (b1, e1, b2, e2), shifted and
                scaled per pair so that the smallest value is 0 and the
                largest is 1
        '''
        t = torch.stack([b1_pred,
                         b1_pred + dur1_pred,
                         b2_pred,
                         b2_pred + dur2_pred], dim=1).to(self.device)

        t_min, _ = torch.min(t, dim=1, keepdim=True)
        t_adj = t - t_min
        t_adj_max, _ = torch.max(t_adj, dim=1, keepdim=True)
        # NOTE(review): a pair whose four values coincide gives 0/0 here - confirm it cannot occur
        return t_adj/t_adj_max

    def _custom_loss(self, preds, b1_lst, e1_lst, b2_lst,
                            e2_lst,durations):
        '''
        Input: output of `forward`, the target b1/e1/b2/e2 values and the
               [pred1, pred2] target duration classes
        Output: scalar loss = (duration_loss + 2.0 * fine_grained_loss) / 2
        '''
        ## Predictions
        b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
        out_p1_d, out_p2_d = preds[4], preds[5]

        ## Ground truth values:
        b1_act, e1_act, b2_act, e2_act = self._lsts_to_tensors(b1_lst, e1_lst, b2_lst, e2_lst,
                                                              param="float")
        ## Store actual_y into tensors
        pred1_durs, pred2_durs = durations
        pred1_durs, pred2_durs = self._lsts_to_tensors(pred1_durs,pred2_durs)

        ## Duration Losses
        L5_p1 = self.duration_loss(out_p1_d, pred1_durs)
        L5_p2 = self.duration_loss(out_p2_d, pred2_durs)

        ## Normalize predicted fine-grained values:
        t_normalized = self._normalized_endpoints(b1_pred, dur1_pred, b2_pred, dur2_pred)

        ## Fine-grained Losses
        l1 = self.fine_loss(t_normalized[:,0]-t_normalized[:,2], b1_act-b2_act)
        l2 = self.fine_loss(t_normalized[:,1]-t_normalized[:,2], e1_act-b2_act)
        l3 = self.fine_loss(t_normalized[:,3]-t_normalized[:,0], e2_act-b1_act)
        l4 = self.fine_loss(t_normalized[:,1]-t_normalized[:,3], e1_act-e2_act)

        L1to4 = sum([l1, l2, l3, l4])/4

        dur = (L5_p1+L5_p2)/2
        fine = L1to4
        beta=2.0    # weight of the fine-grained term

        total_loss = (sum([dur, beta*fine])/2)

        return total_loss

    def _lsts_to_tensors(self, *args, param=None):
        '''
        Input: list1, list2,......

        Output: [Tensor(list1), tensor(list2),....]
                float tensors when param=="float", int64 tensors otherwise

        '''
        if param=="float":
            return [torch.from_numpy(np.array(arg)).float().to(self.device) for arg in args]
        else:
            return [torch.from_numpy(np.array(arg, dtype="int64")).to(self.device) for arg in args]

    def predict(self, preds):
        '''
        Input: output of `forward`
        Output: numpy arrays (b1, e1, b2, e2, timeline) where b1..e2 are the
                per-pair normalized points and timeline is the
                num_preds x 2 array of (start, duration)
        '''
        b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
        pred_timeline = preds[6]

        ## Normalize predicted values:
        t_normalized = self._normalized_endpoints(b1_pred, dur1_pred, b2_pred, dur2_pred)
        t_normalized = t_normalized.detach().cpu().numpy()

        return t_normalized[:,0],t_normalized[:,1], t_normalized[:,2], t_normalized[:,3], pred_timeline.detach().cpu().numpy()

    def _binomial_dist(self, pred_dur):
        '''
        *** Vectorized implementation ***
        Input: A tensor with dimension: batch_size
        Output: A tensor with dimension: batch_size x dur_output_size
                Binomial log-probabilities of every duration class
                for a given duration value
        '''
        pred_dur = torch.sigmoid((self.k)*(torch.log(pred_dur)))

        bin_class = Binomial(total_count=self.dur_output_size-1, probs=pred_dur)
        durations = torch.arange(self.dur_output_size, dtype=torch.float).to(self.device)

        return self._log_prob_vectorized(bin_class, durations)

    def _log_prob_vectorized(self, bin_class, value):
        '''
        1. bin_class: Pytorch Binomial distribution class
        2. Value is a tensor with size: [total_count+1]

        Output: tensor batch_size x len(value) with the log-probability of
                every entry of value under every distribution of the batch.
                The number of classes is taken from value, and bin_class is
                left unmodified.
        '''
        batch_size = bin_class.total_count.size()[0]
        num_classes = value.size()[0]

        value = value.repeat(batch_size,1)
        # One row per batch element, one column per class
        logits = bin_class.logits.unsqueeze(1).expand(batch_size, num_classes)
        total_count = bin_class.total_count.unsqueeze(1).expand(batch_size, num_classes)

        log_factorial_n = torch.lgamma(total_count + 1)
        log_factorial_k = torch.lgamma(value + 1)
        log_factorial_nmk = torch.lgamma(total_count - value + 1)
        max_val = (-logits).clamp(min=0.0)
        # Note that: torch.log1p(-bin_class.probs)) = max_val - torch.log1p((bin_class.logits + 2 * max_val).exp()))

        return (log_factorial_n - log_factorial_k - log_factorial_nmk +
                value * logits + total_count * max_val -
                total_count * torch.log1p((logits + 2 * max_val).exp()))
@torch.no_grad()
def compute_eval_metrics(model, dataloader, device, size):
    """
    For the given model, computes accuracy & loss on validation/test set.

    Batches are consumed until the number of processed samples exceeds
    ``size`` (the batch that crosses the limit is still counted) or the
    dataloader is exhausted.

    :param model: model to evaluate; called as ``model(image)`` and expected
        to return per-class logits
    :param dataloader: validation/test set dataloader yielding dicts with the
        keys ``'image'`` and ``'label'``
    :param device: cuda/cpu device where the model resides
    :param size: no. of samples (subset) to use
    :return: metrics {'accuracy', 'loss'}; accuracy is a percentage (float),
        loss is the mean of the per-batch cross-entropy values (0-dim tensor)
    :rtype: dict
    :raises ValueError: if the dataloader yields no batches
    """
    model.eval()

    loss = 0.0
    num_correct = 0
    total_samples = 0
    num_batches = 0

    # Evaluate on mini-batches & then average over the total
    for batch in dataloader:
        # Load to device, for the list of batch tensors
        image = batch['image'].to(device)
        label = batch['label'].to(device)

        # Forward Pass
        label_logits = model(image)

        # Compute Accuracy
        label_predicted = torch.argmax(label_logits, dim=1)
        num_correct += (label == label_predicted).sum().item()

        # Compute Loss
        loss += F.cross_entropy(label_logits, label, reduction='mean')

        num_batches += 1
        total_samples += label_logits.shape[0]

        if total_samples > size:
            break

    if num_batches == 0:
        raise ValueError('compute_eval_metrics: dataloader yielded no batches')

    # Final Accuracy
    accuracy = 100.0 * (num_correct / total_samples)

    # Final Loss: average over the number of processed mini-batches.
    # (The last enumerate() index is one less than that count, so dividing by
    # it over-estimated the loss and divided by zero for a single batch.)
    loss = loss / num_batches

    metrics = {'accuracy': accuracy, 'loss': loss}

    return metrics
def str2bool(v):
    """
    Convert a command-line style truth value to ``bool``.

    Accepted spellings (case-insensitive, surrounding whitespace ignored):
    ``'true'`` / ``'1'`` and ``'false'`` / ``'0'``. A real ``bool`` is returned
    unchanged.

    :param v: value to convert (usually the raw string handed over by argparse)
    :return: the parsed boolean
    :rtype: bool
    :raises ValueError: if ``v`` is not one of the accepted spellings
    """
    if isinstance(v, bool):
        return v

    normalized = str(v).strip().lower()

    if normalized in ('true', '1'):
        return True
    if normalized in ('false', '0'):
        return False

    # Raised explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, and argparse turns a ValueError raised by a
    # ``type=`` callable into a regular usage error instead of a traceback.
    raise ValueError('Option requires: "true" or "false"')
4 | 5 | For cue detection, the label for each word follows the annotation schema: 6 | * 0: Affix 7 | * 1: Normal cue 8 | * 2: Part of a multiword cue 9 | * 3: Not a cue 10 | 11 | ## Performance 12 | 13 | Negation cue detection, evaluated on the SFU review dataset: 14 | 15 | ``` 16 | Validation loss: 0.14660959019822234 17 | Validation Accuracy: 0.9972840671132076 18 | Validation Accuracy for Positive Cues: 0.9394110275689225 19 | 1 2 3 20 | 1 733.0 1.0 35.0 21 | 2 1.0 30.0 11.0 22 | 3 75.0 14.0 48267.0 23 | precision recall f1-score support 24 | 25 | 1 0.91 0.95 0.93 769 26 | 2 0.67 0.71 0.69 42 27 | 3 1.00 1.00 1.00 48356 28 | 29 | accuracy 1.00 49167 30 | macro avg 0.86 0.89 0.87 49167 31 | weighted avg 1.00 1.00 1.00 49167 32 | 33 | F1-Score: 0.9972513069839883 34 | Precision: 0.8955399061032864 35 | Recall: 0.9431396786155748 36 | F1 Score: 0.9187236604455147 37 | F1-Score Cue_No Cue: 0.9972891007811321 38 | ``` 39 | 40 | Negation scope resolution, evaluated on the SFU review dataset: 41 | 42 | ``` 43 | Validation loss: 0.21461165494672807 44 | Validation Accuracy: 0.9522214842258335 45 | Validation Accuracy Scope Level: 0.7831683168316831 46 | Precision: 1 47 | Recall: 0.7838509316770186 48 | F1 Score: 0.8788300835654596 49 | precision recall f1-score support 50 | 51 | 0 0.97 0.97 0.97 16358 52 | 1 0.90 0.92 0.91 5279 53 | 54 | accuracy 0.95 21637 55 | macro avg 0.94 0.94 0.94 21637 56 | weighted avg 0.96 0.95 0.95 21637 57 | ``` 58 | 59 | ## Training 60 | 61 | To train the negation cue detection model, set `SUBTASK = 'cue_detection'`. For the negation scope resolution model, set `SUBTASK = 'scope_resolution'`.
if __name__ == "__main__":
    # Entry point of the REST wrapper around the Duration component.
    #
    # The program is meant to establish or call a web service for the
    # component:
    #   * mode "server": answer web API REST calls on the chosen port.
    #   * mode "client": described as calling a server, but no client branch
    #     exists below -- every mode other than "server" only prints a notice.
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", "--mode", help="run as server, client [server, client]", type=str, default="server")
    parser.add_argument("-port", "--port", help="port to run this REST service", type=int, default=17000)
    args = parser.parse_args()

    if args.mode != "server":
        print ('-> MODE NOT CHOSEN')
    else:
        # Run as a server to provide the API service.
        print ('-----component/REST_service: HTTP SERVER MODE-----')

        # Load component class
        durationAPI = DurationAPI(base_dir='../Duration')

        app = Flask(__name__)

        @app.route('/duration', methods=['POST'])
        def response_pred():
            """Handle POST /duration: run duration prediction on the posted events."""
            print('============REST_service')

            # The request body is JSON; only its 'events' entry is used.
            payload = request.get_json()
            print (payload)

            events = payload['events']
            print (events)

            json_list = durationAPI.pred(events)
            print (json_list)

            return jsonify({'json_list': json_list})

        app.run(port=args.port)
| # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /component/TempRel/code/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python joint_model.py -input_text "Orders went out today to deploy 17,000 U.S. Army soldiers in the Persian Gulf region." 3 | -------------------------------------------------------------------------------- /component/TempRel/other/pos_tags.txt: -------------------------------------------------------------------------------- 1 | CC 2 | CD 3 | DT 4 | EX 5 | FW 6 | IN 7 | JJ 8 | JJR 9 | JJS 10 | LS 11 | MD 12 | NN 13 | NNS 14 | NNP 15 | NNPS 16 | PDT 17 | POS 18 | PRP 19 | PRP$ 20 | RB 21 | RBR 22 | RBS 23 | RP 24 | SYM 25 | TO 26 | UH 27 | VB 28 | VBD 29 | VBG 30 | VBN 31 | VBP 32 | VBZ 33 | WDT 34 | WP 35 | WP$ 36 | WRB -------------------------------------------------------------------------------- /component/component_envs/env_temprel.yml: -------------------------------------------------------------------------------- 1 | name: event-pipeline 2 | channels: 3 | - pytorch 4 | - gurobi 5 | - serge-sans-paille 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _ipyw_jlab_nb_ext_conf=0.1.0=py37_0 10 | - _libgcc_mutex=0.1=main 11 | - _pytorch_select=0.2=gpu_0 12 | - alabaster=0.7.12=py37_0 13 | - anaconda=2019.10=py37_0 14 | - anaconda-client=1.7.2=py37_0 15 | - 
anaconda-navigator=1.9.7=py37_0 16 | - anaconda-project=0.8.3=py_0 17 | - asn1crypto=1.0.1=py37_0 18 | - astroid=2.3.1=py37_0 19 | - astropy=3.2.2=py37h7b6447c_0 20 | - atomicwrites=1.3.0=py37_1 21 | - attrs=19.2.0=py_0 22 | - babel=2.7.0=py_0 23 | - backcall=0.1.0=py37_0 24 | - backports=1.0=py_2 25 | - backports.functools_lru_cache=1.6.1=py_0 26 | - backports.os=0.1.1=py37_0 27 | - backports.shutil_get_terminal_size=1.0.0=py37_2 28 | - backports.tempfile=1.0=py_1 29 | - backports.weakref=1.0.post1=py_1 30 | - beautifulsoup4=4.8.0=py37_0 31 | - bitarray=1.0.1=py37h7b6447c_0 32 | - bkcharts=0.2=py37_0 33 | - blas=1.0=mkl 34 | - bleach=3.1.0=py37_0 35 | - blosc=1.16.3=hd408876_0 36 | - bokeh=1.3.4=py37_0 37 | - boto=2.49.0=py37_0 38 | - bottleneck=1.2.1=py37h035aef0_1 39 | - bzip2=1.0.8=h7b6447c_0 40 | - ca-certificates=2019.8.28=0 41 | - cairo=1.14.12=h8948797_3 42 | - catalogue=0.0.8=py_0 43 | - certifi=2019.9.11=py37_0 44 | - cffi=1.12.3=py37h2e261b9_0 45 | - chardet=3.0.4=py37_1003 46 | - click=7.0=py37_0 47 | - cloog=0.18.1=1 48 | - cloudpickle=1.2.2=py_0 49 | - clyent=1.2.2=py37_1 50 | - colorama=0.4.1=py37_0 51 | - conda=4.8.3=py37hc8dfbb8_1 52 | - conda-build=3.18.9=py37_3 53 | - conda-env=2.6.0=1 54 | - conda-package-handling=1.6.0=py37h7b6447c_0 55 | - conda-verify=3.4.2=py_1 56 | - contextlib2=0.6.0=py_0 57 | - cryptography=2.7=py37h1ba5d50_0 58 | - cudatoolkit=10.1.243=h6bb024c_0 59 | - curl=7.65.3=hbc83047_0 60 | - cycler=0.10.0=py37_0 61 | - cymem=2.0.3=py37he1b5a44_0 62 | - cython=0.29.13=py37he6710b0_0 63 | - cython-blis=0.4.1=py37h516909a_0 64 | - cytoolz=0.10.0=py37h7b6447c_0 65 | - dask=2.5.2=py_0 66 | - dask-core=2.5.2=py_0 67 | - dataclasses=0.7=py37_0 68 | - dbus=1.13.6=h746ee38_0 69 | - decorator=4.4.0=py37_1 70 | - defusedxml=0.6.0=py_0 71 | - distributed=2.5.2=py_0 72 | - docutils=0.15.2=py37_0 73 | - entrypoints=0.3=py37_0 74 | - et_xmlfile=1.0.1=py37_0 75 | - expat=2.2.6=he6710b0_0 76 | - fastcache=1.1.0=py37h7b6447c_0 77 | - 
filelock=3.0.12=py_0 78 | - flask=1.1.1=py_0 79 | - fontconfig=2.13.0=h9420a91_0 80 | - freetype=2.9.1=h8a8886c_1 81 | - fribidi=1.0.5=h7b6447c_0 82 | - fsspec=0.5.2=py_0 83 | - future=0.18.2=py37_0 84 | - gcc_49=4.9.1=6 85 | - get_terminal_size=1.0.0=haa9412d_0 86 | - gevent=1.4.0=py37h7b6447c_0 87 | - glib=2.56.2=hd408876_0 88 | - glob2=0.7=py_0 89 | - gmp=6.1.2=h6c8ec71_1 90 | - gmpy2=2.0.8=py37h10f8cd9_2 91 | - graphite2=1.3.13=h23475e2_0 92 | - greenlet=0.4.15=py37h7b6447c_0 93 | - gst-plugins-base=1.14.0=hbbd80ab_1 94 | - gstreamer=1.14.0=hb453b48_1 95 | - gurobi=9.0.1=py37_0 96 | - h5py=2.9.0=py37h7918eee_0 97 | - harfbuzz=1.8.8=hffaf4a1_0 98 | - hdf5=1.10.4=hb1b8bf9_0 99 | - heapdict=1.0.1=py_0 100 | - html5lib=1.0.1=py37_0 101 | - icu=58.2=h9c2bf20_1 102 | - idna=2.8=py37_0 103 | - imageio=2.6.0=py37_0 104 | - imagesize=1.1.0=py37_0 105 | - importlib_metadata=0.23=py37_0 106 | - intel-openmp=2019.4=243 107 | - ipykernel=5.1.2=py37h39e3cac_0 108 | - ipython=7.8.0=py37h39e3cac_0 109 | - ipython_genutils=0.2.0=py37_0 110 | - ipywidgets=7.5.1=py_0 111 | - isl=0.12.2=0 112 | - isort=4.3.21=py37_0 113 | - itsdangerous=1.1.0=py37_0 114 | - jbig=2.1=hdba287a_0 115 | - jdcal=1.4.1=py_0 116 | - jedi=0.15.1=py37_0 117 | - jeepney=0.4.1=py_0 118 | - jinja2=2.10.3=py_0 119 | - joblib=0.13.2=py37_0 120 | - jpeg=9b=h024ee3a_2 121 | - json5=0.8.5=py_0 122 | - jsonschema=3.0.2=py37_0 123 | - jupyter=1.0.0=py37_7 124 | - jupyter_client=5.3.3=py37_1 125 | - jupyter_console=6.0.0=py37_0 126 | - jupyter_core=4.5.0=py_0 127 | - jupyterlab=1.1.4=pyhf63ae98_0 128 | - jupyterlab_server=1.0.6=py_0 129 | - keyring=18.0.0=py37_0 130 | - kiwisolver=1.1.0=py37he6710b0_0 131 | - krb5=1.16.1=h173b8e3_7 132 | - lazy-object-proxy=1.4.2=py37h7b6447c_0 133 | - libarchive=3.3.3=h5d8350f_5 134 | - libcurl=7.65.3=h20c2e04_0 135 | - libedit=3.1.20181209=hc058e9b_0 136 | - libffi=3.2.1=hd88cf55_4 137 | - libgcc-ng=9.1.0=hdf63c60_0 138 | - libgfortran-ng=7.3.0=hdf63c60_0 139 | - 
liblief=0.9.0=h7725739_2 140 | - libpng=1.6.37=hbc83047_0 141 | - libsodium=1.0.16=h1bed415_0 142 | - libssh2=1.8.2=h1ba5d50_0 143 | - libstdcxx-ng=9.1.0=hdf63c60_0 144 | - libtiff=4.0.10=h2733197_2 145 | - libtool=2.4.6=h7b6447c_5 146 | - libuuid=1.0.3=h1bed415_2 147 | - libxcb=1.13=h1bed415_1 148 | - libxml2=2.9.9=hea5a465_1 149 | - libxslt=1.1.33=h7d1a2b0_0 150 | - llvmlite=0.29.0=py37hd408876_0 151 | - locket=0.2.0=py37_1 152 | - lxml=4.4.1=py37hefd8a0e_0 153 | - lz4-c=1.8.1.2=h14c3975_0 154 | - lzo=2.10=h49e0be7_2 155 | - markupsafe=1.1.1=py37h7b6447c_0 156 | - matplotlib=3.1.1=py37h5429711_0 157 | - mccabe=0.6.1=py37_1 158 | - mistune=0.8.4=py37h7b6447c_0 159 | - mkl=2019.4=243 160 | - mkl-service=2.3.0=py37he904b0f_0 161 | - mkl_fft=1.0.14=py37ha843d7b_0 162 | - mkl_random=1.1.0=py37hd6b4f25_0 163 | - mock=3.0.5=py37_0 164 | - more-itertools=7.2.0=py37_0 165 | - mpc=1.1.0=h10f8cd9_1 166 | - mpfr=4.0.1=hdf1c602_3 167 | - mpmath=1.1.0=py37_0 168 | - msgpack-python=0.6.1=py37hfd86e86_1 169 | - multipledispatch=0.6.0=py37_0 170 | - murmurhash=1.0.0=py37he1b5a44_0 171 | - navigator-updater=0.2.1=py37_0 172 | - nbconvert=5.6.0=py37_1 173 | - nbformat=4.4.0=py37_0 174 | - ncurses=6.1=he6710b0_1 175 | - networkx=2.3=py_0 176 | - ninja=1.9.0=py37hfd86e86_0 177 | - nltk=3.4.5=py37_0 178 | - nose=1.3.7=py37_2 179 | - notebook=6.0.1=py37_0 180 | - numba=0.45.1=py37h962f231_0 181 | - numexpr=2.7.0=py37h9e4a6bb_0 182 | - numpy=1.17.2=py37haad9e8e_0 183 | - numpy-base=1.17.2=py37hde5b4d6_0 184 | - numpydoc=0.9.1=py_0 185 | - olefile=0.46=py37_0 186 | - openpyxl=3.0.0=py_0 187 | - openssl=1.1.1d=h7b6447c_2 188 | - packaging=19.2=py_0 189 | - pandas=0.25.1=py37he6710b0_0 190 | - pandoc=2.2.3.2=0 191 | - pandocfilters=1.4.2=py37_1 192 | - pango=1.42.4=h049681c_0 193 | - parso=0.5.1=py_0 194 | - partd=1.0.0=py_0 195 | - patchelf=0.9=he6710b0_3 196 | - path.py=12.0.1=py_0 197 | - pathlib2=2.3.5=py37_0 198 | - patsy=0.5.1=py37_0 199 | - pcre=8.43=he6710b0_0 200 | - 
pep8=1.7.1=py37_0 201 | - pexpect=4.7.0=py37_0 202 | - pickleshare=0.7.5=py37_0 203 | - pillow=6.2.0=py37h34e0f95_0 204 | - pip=19.2.3=py37_0 205 | - pixman=0.38.0=h7b6447c_0 206 | - pkginfo=1.5.0.1=py37_0 207 | - plac=0.9.6=py37_0 208 | - pluggy=0.13.0=py37_0 209 | - ply=3.11=py37_0 210 | - preshed=3.0.2=py37he1b5a44_1 211 | - prometheus_client=0.7.1=py_0 212 | - prompt_toolkit=2.0.10=py_0 213 | - psutil=5.6.3=py37h7b6447c_0 214 | - ptyprocess=0.6.0=py37_0 215 | - py=1.8.0=py37_0 216 | - py-lief=0.9.0=py37h7725739_2 217 | - pycodestyle=2.5.0=py37_0 218 | - pycosat=0.6.3=py37h14c3975_0 219 | - pycparser=2.19=py37_0 220 | - pycrypto=2.6.1=py37h14c3975_9 221 | - pycurl=7.43.0.3=py37h1ba5d50_0 222 | - pyflakes=2.1.1=py37_0 223 | - pygments=2.4.2=py_0 224 | - pylint=2.4.2=py37_0 225 | - pyodbc=4.0.27=py37he6710b0_0 226 | - pyopenssl=19.0.0=py37_0 227 | - pyparsing=2.4.2=py_0 228 | - pyqt=5.9.2=py37h05f1152_2 229 | - pyrsistent=0.15.4=py37h7b6447c_0 230 | - pysocks=1.7.1=py37_0 231 | - pytables=3.5.2=py37h71ec239_1 232 | - pytest=5.2.1=py37_0 233 | - pytest-arraydiff=0.3=py37h39e3cac_0 234 | - pytest-astropy=0.5.0=py37_0 235 | - pytest-doctestplus=0.4.0=py_0 236 | - pytest-openfiles=0.4.0=py_0 237 | - pytest-remotedata=0.3.2=py37_0 238 | - python=3.7.4=h265db76_1 239 | - python-dateutil=2.8.0=py37_0 240 | - python-libarchive-c=2.8=py37_13 241 | - python_abi=3.7=1_cp37m 242 | - pytorch=1.3.1=py3.7_cuda10.1.243_cudnn7.6.3_0 243 | - pytz=2019.3=py_0 244 | - pywavelets=1.0.3=py37hdd07704_1 245 | - pyyaml=5.1.2=py37h7b6447c_0 246 | - pyzmq=18.1.0=py37he6710b0_0 247 | - qt=5.9.7=h5867ecd_1 248 | - qtawesome=0.6.0=py_0 249 | - qtconsole=4.5.5=py_0 250 | - qtpy=1.9.0=py_0 251 | - readline=7.0=h7b6447c_5 252 | - requests=2.22.0=py37_0 253 | - ripgrep=0.10.0=hc07d326_0 254 | - rope=0.14.0=py_0 255 | - ruamel_yaml=0.15.46=py37h14c3975_0 256 | - scikit-image=0.15.0=py37he6710b0_0 257 | - scikit-learn=0.21.3=py37hd81dba3_0 258 | - scipy=1.3.1=py37h7c811a0_0 259 | - 
seaborn=0.9.0=py37_0 260 | - secretstorage=3.1.1=py37_0 261 | - send2trash=1.5.0=py37_0 262 | - setuptools=41.4.0=py37_0 263 | - simplegeneric=0.8.1=py37_2 264 | - singledispatch=3.4.0.3=py37_0 265 | - sip=4.19.8=py37hf484d3e_0 266 | - six=1.12.0=py37_0 267 | - snappy=1.1.7=hbae5bb6_3 268 | - snowballstemmer=2.0.0=py_0 269 | - sortedcollections=1.1.2=py37_0 270 | - sortedcontainers=2.1.0=py37_0 271 | - soupsieve=1.9.3=py37_0 272 | - spacy=2.2.3=py37hc9558a2_0 273 | - sphinx=2.2.0=py_0 274 | - sphinxcontrib=1.0=py37_1 275 | - sphinxcontrib-applehelp=1.0.1=py_0 276 | - sphinxcontrib-devhelp=1.0.1=py_0 277 | - sphinxcontrib-htmlhelp=1.0.2=py_0 278 | - sphinxcontrib-jsmath=1.0.1=py_0 279 | - sphinxcontrib-qthelp=1.0.2=py_0 280 | - sphinxcontrib-serializinghtml=1.1.3=py_0 281 | - sphinxcontrib-websupport=1.1.2=py_0 282 | - spyder=3.3.6=py37_0 283 | - spyder-kernels=0.5.2=py37_0 284 | - sqlalchemy=1.3.9=py37h7b6447c_0 285 | - sqlite=3.30.0=h7b6447c_0 286 | - srsly=0.2.0=py37he1b5a44_0 287 | - statsmodels=0.10.1=py37hdd07704_0 288 | - sympy=1.4=py37_0 289 | - tbb=2019.4=hfd86e86_0 290 | - tblib=1.4.0=py_0 291 | - terminado=0.8.2=py37_0 292 | - testpath=0.4.2=py37_0 293 | - thinc=7.3.0=py37hc9558a2_0 294 | - tk=8.6.8=hbc83047_0 295 | - toolz=0.10.0=py_0 296 | - tornado=6.0.3=py37h7b6447c_0 297 | - traitlets=4.3.3=py37_0 298 | - unicodecsv=0.14.1=py37_0 299 | - unixodbc=2.3.7=h14c3975_0 300 | - urllib3=1.24.2=py37_0 301 | - wasabi=0.4.0=py_0 302 | - wcwidth=0.1.7=py37_0 303 | - webencodings=0.5.1=py37_1 304 | - werkzeug=0.16.0=py_0 305 | - wheel=0.33.6=py37_0 306 | - widgetsnbextension=3.5.1=py37_0 307 | - wrapt=1.11.2=py37h7b6447c_0 308 | - wurlitzer=1.0.3=py37_0 309 | - xlrd=1.2.0=py37_0 310 | - xlsxwriter=1.2.1=py_0 311 | - xlwt=1.3.0=py37_0 312 | - xz=5.2.4=h14c3975_4 313 | - yaml=0.1.7=had09818_2 314 | - zeromq=4.3.1=he6710b0_3 315 | - zict=1.0.0=py_0 316 | - zipp=0.6.0=py_0 317 | - zlib=1.2.11=h7b6447c_3 318 | - zstd=1.3.7=h0b5b093_0 319 | - pip: 320 | - 
en-core-web-sm-mirror==2.2.5 321 | - frozendict==1.2 322 | - immutablecollections==0.9.0 323 | - sacremoses==0.0.38 324 | - sentencepiece==0.1.85 325 | - tokenizers==0.0.11 326 | - torchvision==0.2.2 327 | - tqdm==4.19.9 328 | - transformers==2.3.0 329 | - typing-extensions==3.7.4.1 330 | - vistautils==0.17.0 331 | prefix: /nas/home/mingyuma/miniconda3/envs/event-pipeline 332 | 333 | -------------------------------------------------------------------------------- /component/component_envs/req_better.txt: -------------------------------------------------------------------------------- 1 | allennlp==0.9.0 2 | pytorch-crf==0.7.2 3 | pytorch-nlp==0.5.0 4 | seqeval==0.0.12 5 | sklearn==0.0 6 | tensorboardX==2.0 7 | torch==1.4.0 8 | transformers==2.4.1 -------------------------------------------------------------------------------- /component/component_envs/req_biomed.txt: -------------------------------------------------------------------------------- 1 | attrs==19.3.0 2 | awscli==1.18.84 3 | backcall==0.2.0 4 | bleach==3.1.5 5 | blis==0.4.1 6 | boto3==1.14.7 7 | botocore==1.17.7 8 | catalogue==1.0.0 9 | certifi==2020.4.5.2 10 | chardet==3.0.4 11 | click==7.1.2 12 | colorama==0.4.3 13 | conllu==3.0 14 | cycler==0.10.0 15 | cymem==2.0.3 16 | decorator==4.4.2 17 | defusedxml==0.6.0 18 | docutils==0.15.2 19 | en-ner-jnlpba-md @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz 20 | entrypoints==0.3 21 | filelock==3.0.12 22 | googledrivedownloader==0.4 23 | h5py==2.10.0 24 | idna==2.9 25 | imageio==2.8.0 26 | importlib-metadata==1.6.1 27 | ipykernel==5.3.0 28 | ipython==7.15.0 29 | ipython-genutils==0.2.0 30 | ipywidgets==7.5.1 31 | isodate==0.6.0 32 | jedi==0.17.1 33 | Jinja2==2.11.2 34 | jmespath==0.10.0 35 | joblib==0.14.1 36 | jsonschema==3.2.0 37 | jupyter==1.0.0 38 | jupyter-client==6.1.3 39 | jupyter-console==6.1.0 40 | jupyter-core==4.6.3 41 | kiwisolver==1.2.0 42 | llvmlite==0.33.0 43 | MarkupSafe==1.1.1 44 | 
class NumpyEncoder(json.JSONEncoder):
    """ Custom encoder for numpy data types """

    def default(self, obj):
        """Convert numpy scalars and arrays into JSON-serializable values.

        The checks use numpy's abstract scalar base classes instead of
        enumerating concrete types: the aliases ``np.float_`` and
        ``np.complex_`` no longer exist in NumPy 2.0, and the base classes
        cover every integer / float / complex width on any NumPy version.

        :param obj: object the standard encoder could not serialize
        :return: ``int``, ``float``, ``bool``, ``list``, a
            ``{'real', 'imag'}`` dict for complex scalars, or ``None`` for
            ``np.void`` values
        :raises TypeError: for any other unsupported type (raised by the
            base class)
        """
        if isinstance(obj, np.bool_):
            return bool(obj)

        if isinstance(obj, np.integer):
            return int(obj)

        if isinstance(obj, np.floating):
            return float(obj)

        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        if isinstance(obj, np.void):
            return None

        return json.JSONEncoder.default(self, obj)
default='../../raw_text/test.0622_pipelined.json') 71 | p.add_argument('-save_path_json', type=str, default='../../raw_text/test.0622_pipelined_coref.json') 72 | args = p.parse_args() 73 | 74 | data = json.load(open(args.data)) 75 | # load AllenNLP predictor 76 | predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz") 77 | 78 | docs = data['result_list'] 79 | 80 | for doc_num, doc in enumerate(docs): 81 | # ensemble the document from sentences 82 | doc_text_list = [] 83 | doc_text_len = [0] 84 | for sen in doc: 85 | print(sen['tokens']) 86 | doc_text_list.append(sen['sentence']) 87 | doc_text_len.append(len(sen['tokens'])) 88 | doc_text = ' '.join(doc_text_list) 89 | for i, num in enumerate(doc_text_len): 90 | if i >= 1: 91 | doc_text_len[i] += doc_text_len[i - 1] 92 | sens_idx_beg = doc_text_len[:-1] 93 | sens_idx_end = doc_text_len[1:] 94 | # sens_idx_beg saves the beginning token idx of each sentence in the doc 95 | # sens_idx_end saves the ending token idx of each sentence in the doc 96 | print(doc_text) 97 | print(sens_idx_beg) 98 | print(sens_idx_end) 99 | 100 | # get coreference result for the document 101 | coref_pred = get_coference(doc_text) 102 | print(coref_pred) 103 | 104 | # save coref result to json 105 | for i_cluster, cluster in enumerate(coref_pred['clusters']): 106 | print('------') 107 | print(cluster) 108 | for mention in cluster: 109 | print(mention) 110 | # identify which sentence that this mention belongs to 111 | sen_nums = [i for i, beg in enumerate(sens_idx_beg) if mention[0] >= beg and mention[0] < sens_idx_end[i]] 112 | for sen_num in sen_nums: 113 | mention_idx_in_this_sen = [i - sens_idx_beg[sen_num] for i in mention] 114 | events_of_this_sen = data['result_list'][doc_num][sen_num]['events'] 115 | for i_e, e in enumerate(events_of_this_sen): 116 | for i_arg, arg_obj in enumerate(e['arguments']): 117 | if arg_obj['start_token'] == mention_idx_in_this_sen[0] and 
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Integers, floats and booleans become their Python counterparts,
        complex scalars become ``{'real': ..., 'imag': ...}``, arrays become
        nested lists and ``np.void`` becomes ``None``. Any other type is
        handed to the base class, which raises ``TypeError``.
        """
        # The abstract scalar base classes match every concrete width and
        # keep working on NumPy >= 2.0, where the ``np.float_`` and
        # ``np.complex_`` aliases no longer exist.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.void):
            return None
        return json.JSONEncoder.default(self, obj)
49 | for each_top_span in clusters_top_span: 50 | span_text = document[each_top_span[0]:(each_top_span[1]+1)] 51 | clusters_top_span_text.append(span_text) 52 | pred['clusters_top_span_text'] = clusters_top_span_text 53 | 54 | return pred 55 | 56 | def save(args, result_json): 57 | # result_json = { 58 | # 'error_list': not_done_list, 59 | # 'result_list': result_list 60 | # } 61 | with open(args.save_path_json, 'w', encoding='utf-8') as f: 62 | # Use NumpyEncoder to convert numpy data to list 63 | # Previous error: Object of type int64 is not JSON serializable 64 | json.dump(result_json, f, indent=4, ensure_ascii=False, 65 | cls=NumpyEncoder) 66 | print ('Saved') 67 | 68 | def save_txt(args, docs_list): 69 | with open(args.save_path, 'w', encoding='utf-8') as f: 70 | f.write('\n'.join(docs_list)) 71 | 72 | if __name__ == '__main__': 73 | p = argparse.ArgumentParser() 74 | p.add_argument('-data', type=str, default='../../raw_text/test.0622.txt') 75 | p.add_argument('-save_path', type=str, default='../../raw_text/test.0622_coref_replaced.txt') 76 | args = p.parse_args() 77 | 78 | with (open(args.data, "r")) as f: 79 | docs = [line.rstrip() for line in f] 80 | # load AllenNLP predictor 81 | predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz") 82 | 83 | for doc_num, doc_text in enumerate(docs): 84 | # get coreference result for the document 85 | coref_pred = get_coference(doc_text) 86 | print('*' * 20) 87 | print(coref_pred) 88 | doc_tokens = coref_pred['document'] 89 | 90 | # replace coref mention to main mention 91 | for i_cluster, cluster in enumerate(coref_pred['clusters']): 92 | print('------') 93 | print(cluster) 94 | for mention in cluster: 95 | print("--> mention: %s" % mention) 96 | # replace each token in the mention range to empty 97 | for i in range(mention[0], mention[1]+1): 98 | doc_tokens[i] = '' 99 | # replace the first token with the main mention 100 | doc_tokens[mention[0]] = 
def save(args, result_list, not_done_list):
    """Persist the pipeline output as a pickle and as a JSON report.

    Args:
        args: Namespace providing ``save_path`` (pickle file) and
            ``save_path_json`` (JSON file).
        result_list: Results collected so far.
        not_done_list: Indices of the inputs whose analysis raised.
    """
    def _to_builtin(obj):
        # NumPy scalars and arrays expose ``tolist()``, which yields plain
        # Python values. Without this hook ``json.dump`` fails with
        # "Object of type int64 is not JSON serializable".
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        raise TypeError('Object of type %s is not JSON serializable'
                        % type(obj).__name__)

    with open(args.save_path, 'wb') as f:
        pickle.dump(result_list, f)

    result_json = {
        'error_list': not_done_list,
        'result_list': result_list
    }
    with open(args.save_path_json, 'w', encoding='utf-8') as f:
        json.dump(result_json, f, indent=4, default=_to_builtin)
    print('Saved')
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Integers, floats and booleans become their Python counterparts,
        complex scalars become ``{'real': ..., 'imag': ...}``, arrays become
        nested lists and ``np.void`` becomes ``None``. Any other type is
        handed to the base class, which raises ``TypeError``.
        """
        # Checking the abstract scalar hierarchy avoids enumerating widths
        # and does not touch ``np.float_`` / ``np.complex_``, which were
        # removed in NumPy 2.0.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.void):
            return None
        return json.JSONEncoder.default(self, obj)
if __name__ == '__main__':
    # Command-line entry point: run the event pipeline on every sentence of
    # the input and save the results as a pickle and as JSON.
    p = argparse.ArgumentParser()
    p.add_argument('-data', type=str, default='../../raw_text/julsepscan.txt')
    p.add_argument('-save_path', type=str, default='../../raw_text/julsepscan_pipelined.pkl')
    p.add_argument('-save_path_json', type=str, default='../../raw_text/julsepscan_pipelined.json')
    p.add_argument('-negation_detection', action='store_true', default=True,
                   help='Whether detection negation cue and scope resolution')
    # ``store_true`` with ``default=True`` can never yield False, so provide
    # an explicit opt-out that writes to the same destination.
    p.add_argument('-no_negation_detection', dest='negation_detection',
                   action='store_false',
                   help='Disable negation cue detection and scope resolution')
    args = p.parse_args()

    eventAPIs = EventAPIs(negation_detection=args.negation_detection)
    print ('Loaded class')
    not_done_list = []

    extension = args.data.split('.')[-1]
    if extension == 'pkl':
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(args.data, "rb") as f:
            data = pickle.load(f)
    elif extension == 'txt':
        with open(args.data, "r") as f:
            linelist = [line.rstrip() for line in f]
        # Convert raw text to one list of sentences per input line. Blank
        # lines keep an empty list so line numbers stay aligned with the file.
        data = [tokenize.sent_tokenize(line) if line != '' else []
                for line in linelist]
        print ('Total sentences: ', sum(len(sen_list) for sen_list in data))
    else:
        # Previously ``data`` stayed unbound and the loop below raised
        # NameError; fail early with a usable message instead.
        p.error("unsupported -data extension '%s' (expected .pkl or .txt)"
                % extension)

    print ('Total lines: ', len(data))
    result_list = []
    for i_line, sen_list in enumerate(data):
        result_list_this_line = []
        for i_sen, text in enumerate(sen_list):
            print ('='*40, 'line num: ', i_line, "; sen num: ", i_sen)
            params_this = {
                'text': text,
                'domain': 'news'
            }
            try:
                combined_result = eventAPIs.analyze(params_this)
                combined_result['line_num'] = i_line
                combined_result['sen_num'] = i_sen
                combined_result['sentence'] = text
                result_list_this_line.append(combined_result)
            except Exception as e:
                # Best effort: record the failing sentence and keep going so
                # one bad input does not abort the whole run.
                print('?'*60)
                print('Error for this text: ', text)
                print(str(e))
                not_done_list.append([i_line, i_sen])
        result_list.append(result_list_this_line)
        # Checkpoint periodically so partial results survive a crash.
        if i_line % 20 == 0:
            save(args, result_list, not_done_list)

    save(args, result_list, not_done_list)
    print ('Not successfuly text:')
    print (not_done_list)
20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /project/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/project/project/__init__.py -------------------------------------------------------------------------------- /project/project/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for project project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.11.29. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.11/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '5)se$(1!6n4apye2dal)f*=h3t%c7ep8e7)ikmb6%cux15tb0j' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 
26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = ["8dc63685fcb1.ngrok.io", "127.0.0.1", "localhost"] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | ] 41 | 42 | MIDDLEWARE = [ 43 | 'django.middleware.security.SecurityMiddleware', 44 | 'django.contrib.sessions.middleware.SessionMiddleware', 45 | 'django.middleware.common.CommonMiddleware', 46 | 'django.middleware.csrf.CsrfViewMiddleware', 47 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 48 | 'django.contrib.messages.middleware.MessageMiddleware', 49 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 50 | ] 51 | 52 | ROOT_URLCONF = 'project.urls' 53 | 54 | TEMPLATES = [ 55 | { 56 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 57 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 58 | , 59 | 'APP_DIRS': True, 60 | 'OPTIONS': { 61 | 'context_processors': [ 62 | 'django.template.context_processors.debug', 63 | 'django.template.context_processors.request', 64 | 'django.contrib.auth.context_processors.auth', 65 | 'django.contrib.messages.context_processors.messages', 66 | ], 67 | }, 68 | }, 69 | ] 70 | 71 | WSGI_APPLICATION = 'project.wsgi.application' 72 | 73 | 74 | # Database 75 | # https://docs.djangoproject.com/en/1.11/ref/settings/#databases 76 | 77 | DATABASES = { 78 | 'default': { 79 | 'ENGINE': 'django.db.backends.sqlite3', 80 | 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 81 | } 82 | } 83 | 84 | 85 | # Password validation 86 | # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators 87 | 88 | AUTH_PASSWORD_VALIDATORS = [ 89 | { 90 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 91 | }, 92 | { 93 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 94 | }, 95 | { 96 | 'NAME': 
'django.contrib.auth.password_validation.CommonPasswordValidator', 97 | }, 98 | { 99 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 100 | }, 101 | ] 102 | 103 | 104 | # Internationalization 105 | # https://docs.djangoproject.com/en/1.11/topics/i18n/ 106 | 107 | LANGUAGE_CODE = 'en-us' 108 | 109 | TIME_ZONE = 'UTC' 110 | 111 | USE_I18N = True 112 | 113 | USE_L10N = True 114 | 115 | USE_TZ = True 116 | 117 | 118 | # Static files (CSS, JavaScript, Images) 119 | # https://docs.djangoproject.com/en/1.11/howto/static-files/ 120 | 121 | STATIC_URL = '/static/' 122 | # this is the static files folder name which you created in django project root folder. This is different from above STATIC_URL. 123 | STATICFILES_DIRS = [ 124 | os.path.join(BASE_DIR, 'statics'), 125 | ] 126 | 127 | # hold our site’s static assets. This will allow Nginx to serve these directly, which will have a positive impact on performance 128 | STATIC_ROOT = os.path.join(BASE_DIR, "static/") 129 | -------------------------------------------------------------------------------- /project/project/urls.py: -------------------------------------------------------------------------------- 1 | """project URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.11/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. 
Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from django.contrib import admin 18 | from project import views 19 | 20 | urlpatterns = [ 21 | url(r'^$', views.index, name='index'), 22 | url(r'^admin/', admin.site.urls), 23 | url('analyze_text/', views.analyze_text, name='analyze_text'), 24 | ] 25 | -------------------------------------------------------------------------------- /project/project/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for project project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.11/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "project.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /project/statics/css/index.css: -------------------------------------------------------------------------------- 1 | .body{ 2 | font-family: 'Roboto Mono', monospace!important 3 | } 4 | .container { 5 | width: 100%; 6 | height: 850px; 7 | margin: auto; 8 | position: center; 9 | padding: 2px; 10 | } 11 | 12 | .left { 13 | position: center; 14 | padding-top: 10%; 15 | width: 50%; 16 | height: 100%; 17 | float: left; 18 | border-right: 2px solid olivedrab; 19 | } 20 | 21 | .h2{ 22 | font-family: 'Roboto Mono', monospace!important; 23 | padding-top: 2px; 24 | } 25 | 26 | .h3{ 27 | font-family: 'Roboto Mono', monospace!important; 28 | } 29 | 30 | .introduction{ 31 | height: 30%; 32 | margin: auto; 33 | padding: auto; 34 | border-bottom: 1px dashed olivedrab; 35 | 36 | } 37 | 38 | .sample{ 39 | padding-top: 10%; 40 | height: 20%; 41 | /*border-bottom: 1px dashed olivedrab;*/ 42 | 43 | } 44 | 
.input{ 45 | padding-top: 10px; 46 | height: 50%; 47 | } 48 | 49 | 50 | .right { 51 | margin-left: 50%; 52 | height: 100%; 53 | /*background: black;*/ 54 | } 55 | 56 | .annotation{ 57 | /*padding-top: 20px;*/ 58 | height: 48.2%; 59 | /*background: blueviolet;*/ 60 | /*border-bottom: 1px dashed olivedrab;*/ 61 | /*margin: auto;*/ 62 | } 63 | 64 | .temporal{ 65 | height: 60%; 66 | /*background: chocolate;*/ 67 | margin: auto; 68 | } 69 | 70 | .node{} 71 | 72 | .link{ 73 | stroke: #999; stroke-opacity: .6; stroke-width: 1px; 74 | } 75 | 76 | .svg{ 77 | height: 50%; 78 | width: 100%; 79 | } 80 | 81 | .span{ 82 | font-size: 0.8em; 83 | font-weight: bold; 84 | line-height: 1; 85 | border-radius: 0.35em; 86 | text-transform: uppercase; 87 | vertical-align: middle; 88 | margin-left: 0.5rem 89 | } 90 | 91 | .mark{ 92 | padding: 0.45em 0.6em; 93 | margin: 0 0.25em; 94 | line-height: 1; 95 | border-radius: 0.35em; 96 | box-decoration-break: clone; 97 | -webkit-box-decoration-break: clone 98 | } 99 | 100 | div.tooltip { 101 | position: absolute; 102 | text-align: center; 103 | width: auto; 104 | height: auto; 105 | padding: 2px; 106 | font: 12px sans-serif; 107 | /*background: lightsteelblue;*/ 108 | background: #f4db4b; 109 | border: 0px; 110 | border-radius: 8px; 111 | pointer-events: none; 112 | } 113 | 114 | /* Start by setting display:none to make this hidden. 115 | Then we position it in relation to the viewport window 116 | with position:fixed. Width, height, top and left speak 117 | for themselves. 
Background we set to 80% white with 118 | our animation centered, and no-repeating */ 119 | .modal { 120 | display: none; 121 | position: fixed; 122 | z-index: 1000; 123 | top: 0; 124 | left: 0; 125 | height: 100%; 126 | width: 100%; 127 | background: rgba( 255, 255, 255, .8 ) 128 | url('loading.gif') 129 | 50% 50% 130 | no-repeat; 131 | } 132 | 133 | /* When the body has the loading class, we turn 134 | the scrollbar off with overflow:hidden */ 135 | body.loading .modal { 136 | overflow: hidden; 137 | } 138 | 139 | /* Anytime the body has the loading class, our 140 | modal element will be visible */ 141 | body.loading .modal { 142 | display: block; 143 | } -------------------------------------------------------------------------------- /project/statics/css/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlusLabNLP/EventPlus/f3c42ab392bf0b7698575c6557f012df27415a9c/project/statics/css/loading.gif -------------------------------------------------------------------------------- /project/statics/js/Tracking.js: -------------------------------------------------------------------------------- 1 | // potential tracker from Google Analytics -------------------------------------------------------------------------------- /project/statics/js/annotation.js: -------------------------------------------------------------------------------- 1 | const intro = introJs(); 2 | intro.setOptions({ 3 | exitOnOverlayClick:false, 4 | steps: [ 5 | {intro: "Welcome to EventPlus! Let\'s take a tour!"}, 6 | { 7 | element: "#btn-feature", 8 | intro: "Please take a look at features in EventPlus" 9 | }, 10 | { 11 | element: "#topic", 12 | intro: "Select the domain of your input, EventPlus support news domain and biomedical domain!" 13 | }, 14 | { 15 | element: "#examples", 16 | intro: "Select a sentence or input your sentences below." 
17 | }, 18 | { 19 | element: "#analyze-text-btn", 20 | intro: "Click on analyze text button!" 21 | }, 22 | { 23 | element: "#show_annotation", 24 | intro: "We visualize entities in your input and their NER labels, they will be candidate arguments for your event!" 25 | }, 26 | { 27 | element: "#displayEvents", 28 | intro: "Here are all events that we extracted! Please click on any of them to see their corresponding arguments!" 29 | }, 30 | { 31 | element: "#show_annotation", 32 | intro: "We visualize your event and its corresponding arguments here!" 33 | }, 34 | { 35 | element: "#graph", 36 | intro: "Temporal relations between events if there are any and duration of events as node labels" 37 | } 38 | ] 39 | }); 40 | intro.start(); 41 | 42 | var return_value; 43 | var labels; 44 | var tokens; 45 | var mark_default = "padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone"; 46 | var span_default = "font-style: italic; background: #f4db4b; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem"; 47 | var ner_default = "font-style: italic; background: #f4c2c2; font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem"; 48 | var default_annotation; 49 | String.prototype.template = function() { 50 | var args = arguments; 51 | return this.replace(/\{(\d+)\}/g,function(m,i){return args[i];}); 52 | }; 53 | 54 | 55 | $( function(){ 56 | $( document ).on('click', '#analyze-text-btn', function(e){ 57 | var input_text = $('#analyze-text-input').val().toString(); 58 | console.log('input_text:', input_text); 59 | var topic = $('#topic').val().toString(); 60 | ajaxAnalyzeText(input_text, topic); 61 | }); 62 | 63 | }); 64 | 65 | function ajaxAnalyzeText (input_text, topic){ 66 | console.log("ajax start"); 67 | $body 
= $("body"); 68 | $body.addClass("loading"); 69 | $.ajax({ 70 | url: '/analyze_text/', 71 | type: 'post', 72 | async: true, 73 | dataType: 'json', 74 | data: { 75 | text: input_text, 76 | domain: topic, 77 | }, 78 | beforeSend: function (xhr, settings) { 79 | if (!csrfSafeMethod(settings.type) && !this.crossDomain) { 80 | xhr.setRequestHeader("X-CSRFToken", csrftoken); 81 | } 82 | }, 83 | success: function (data) { 84 | console.log("ajax end"); 85 | $body.removeClass("loading"); 86 | return_value = data; 87 | $("#graph").empty(); 88 | draw_graph(return_value); 89 | tokens = return_value.tokens; 90 | labels = return_value.labels; 91 | var annotation = ""; 92 | var event_display = ''; 93 | // what if we delete the maintained visitedList to get rid of the nested problem. 94 | // var visitedList = []; 95 | // find all start indexes for ners and all start indexes for triggers 96 | var ner = []; 97 | var triggers = new Set(); 98 | for (i = 0; i < labels.length; i++) { 99 | if (labels[i].role === "trigger") { 100 | triggers.add(labels[i].start) 101 | } 102 | if ("ner" in labels[i]) { 103 | ner[labels[i].start] = labels[i] 104 | } 105 | } 106 | var visitListNer = []; 107 | for (let i = 0; i < tokens.length; i++) { 108 | // console.log(i.toString() + ": " + visitedList); 109 | // if (visitedList.includes(i)) { 110 | // continue; 111 | // } 112 | if (triggers.has(i)) { 113 | labels.forEach(function (item, index) { 114 | if ((item.start === i) && (item.role === "trigger")) { 115 | var text = " "; 116 | for (index = i; index < item.end; index++) { 117 | // this is correct 118 | var visitedList = []; 119 | if (visitedList.includes(index)) { 120 | continue; 121 | } 122 | text += tokens.slice(index, index + 1); 123 | text += " "; 124 | visitedList.push(index); 125 | } 126 | var mark_style = "background:" + item.color + ";" + mark_default; 127 | event_display += "{2}".template(mark_style, item.event, text); 128 | } 129 | }); 130 | } 131 | 132 | if (i in ner) { 133 | var label = 
ner[i] 134 | var text = " "; 135 | for (index = i; index < label.end; index++) { 136 | if(visitListNer.includes(index)) { 137 | continue; 138 | } 139 | text += tokens.slice(index, index + 1); 140 | text += " "; 141 | visitListNer.push(index) 142 | } 143 | ner_label = label.ner; 144 | annotation += "{0}{2} ".template(text, ner_default, ner_label) 145 | } 146 | if (!(i in ner) && !(i in triggers) && !(visitListNer.includes(i))) { 147 | annotation += tokens[i]; 148 | annotation += " "; 149 | } 150 | } 151 | $("#show_annotation").html(annotation) 152 | // give a default annotation as the annotation when it first loads 153 | default_annotation = annotation; 154 | $("#displayEvents").html(event_display) 155 | } 156 | }); 157 | } 158 | 159 | function event_click(obj) { 160 | var clicks = $(this).data('clicks'); 161 | if (clicks) { 162 | // odd clicks 163 | // console.log("odd clicks") 164 | onlyPlotThis(obj.getAttribute("eventId")); 165 | } else { 166 | // even clicks 167 | // console.log("even clicks") 168 | $("#show_annotation").html(default_annotation) 169 | 170 | } 171 | $(this).data("clicks", !clicks); 172 | } 173 | 174 | function onlyPlotThis(eventId) { 175 | 176 | var annotation = ""; 177 | var visitedList = []; 178 | var event_display = ""; 179 | var this_event = {}; 180 | // find everything corresponding to this event and mark it as a dictionary 181 | labels.forEach(function (item, index) { 182 | if (item.event == eventId) { 183 | this_event[item.start] = item 184 | } 185 | }); 186 | 187 | for (let i=0; i < tokens.length; i++) { 188 | // console.log(i.toString() + ": " + visitedList); 189 | if (visitedList.includes(i)) { 190 | continue; 191 | } 192 | if (i in this_event && !(visitedList.includes(i))) { 193 | var label = this_event[i]; 194 | var text = " "; 195 | for(index = i; index < label.end; index++) { 196 | text += tokens.slice(index, index + 1); 197 | text += " "; 198 | visitedList.push(index); 199 | } 200 | if ("event" in label && label.event == eventId) { 
201 | if (label.role === "trigger") { 202 | // this labeled item is a trigger 203 | var mark_style = "background:" + label.color + ";" + mark_default; 204 | annotation += "{1}{3}".template(mark_style, text, span_default, label.label); 205 | event_display += "{2}".template(mark_style, label.event, text); 206 | } else if (label.role === "argument") { 207 | // this label item is an argument 208 | var mark_style = "background:" + label.color + ";" + mark_default; 209 | if("ner" in label) { 210 | mark_style += "text-decoration-line: underline;"; 211 | ner_label = label.ner; 212 | annotation += "{1}{3}{5}".template(mark_style, text, span_default, label.label, ner_default, ner_label); 213 | } else { 214 | annotation += "{1}{3}".template(mark_style, text, span_default, label.label); 215 | } 216 | } 217 | } else { 218 | annotation += text; 219 | annotation += " "; 220 | } 221 | } else { 222 | annotation += tokens[i]; 223 | annotation += " "; 224 | } 225 | } 226 | $("#show_annotation").html(annotation) 227 | 228 | } 229 | 230 | // var labels = return_value.labels 231 | 232 | // for(let i = 0; i < cherry_obj.tokens.length; i++){ 233 | // 234 | // } 235 | // 236 | // for(let i = 0; i < cherry_obj.events.length; i++) { 237 | // var mark_style = "background:" + "#42a4f0;" + mark_default 238 | // $("#displayEvents").html("{1}".template(mark_style, cherry_obj.events[i].triggers.text)) 239 | // } 240 | 241 | 242 | -------------------------------------------------------------------------------- /project/statics/js/main.js: -------------------------------------------------------------------------------- 1 | 2 | // two select box 3 | var news_array = [ 4 | "The other customers fled, and the police said it did not appear that anyone else was injured", 5 | "A powerful ice storm continues to maintain its grip. 
Yesterday New York governor George Pataki toured five counties that have been declared under a state of emergency", 6 | "Despite the recent possibility of military conflict with Iraq, oil prices have been falling, that's because of a worldwide glut of oil" 7 | ]; 8 | 9 | var bio_array = [ 10 | "We have found that the HTLV-1 transactivator protein, tax, acts as a costimulatory signal for GM-CSF and IL-2 gene transcription, in that it can cooperate with TCR signals to mediate high level gene expression.", 11 | "We show that ligand-induced homodimerization of chimeric surface receptors consisting of the extracellular and transmembrane domains of the erythropoietin receptor and of the intracellular domain of IL-4Ralpha induces Janus kinase 1 (Jak1) activation, STAT6 activation, and Cepsilon germline transcripts in human B cell line BJAB." 12 | ]; 13 | 14 | String.prototype.template = function() { 15 | var args = arguments; 16 | return this.replace(/\{(\d+)\}/g,function(m,i){return args[i];}); 17 | }; 18 | 19 | function htmlEntities(str) { 20 | return String(str).replace(/&/g, '&').replace(//g, '>').replace(/"/g, '"').replace(/'/g, '''); 21 | } 22 | 23 | function select_topic() { 24 | var val = document.getElementById("topic").value; 25 | var btnelem = document.getElementById("analyze-text-btn"); 26 | 27 | if (val == "news") { 28 | btnelem.disabled = false; 29 | console.log(btnelem.disabled); 30 | var news = ""; 31 | news += ""; 32 | for(let i = 0; i < news_array.length; i++) { 33 | news += "" 34 | } 35 | // console.log("news", news) 36 | $("#examples").html(news); 37 | } 38 | else if (val == "bio") { 39 | btnelem.disabled = false; 40 | var bio = ""; 41 | bio += ""; 42 | for(let i = 0; i < bio_array.length; i++) { 43 | bio += "" 44 | } 45 | $("#examples").html(bio); 46 | } 47 | } 48 | 49 | function give_examples() { 50 | var e2 = document.getElementById("examples"); 51 | document.getElementById("analyze-text-input").value = e2.value 52 | } 53 | 
-------------------------------------------------------------------------------- /project/statics/js/security.js: -------------------------------------------------------------------------------- 1 | function csrfSafeMethod(method) 2 | { 3 | // these HTTP methods do not require CSRF protection 4 | return (/^(GET|HEAD|OPTIONS|TRACE)$/.test(method)); 5 | } 6 | 7 | function getCookie(name) { 8 | var cookieValue = null; 9 | if (document.cookie && document.cookie !== '') { 10 | var cookies = document.cookie.split(';'); 11 | for (var i = 0; i < cookies.length; i++) { 12 | var cookie = jQuery.trim(cookies[i]); 13 | // Does this cookie string begin with the name we want? 14 | if (cookie.substring(0, name.length + 1) === (name + '=')) { 15 | cookieValue = decodeURIComponent(cookie.substring(name.length + 1)); 16 | break; 17 | } 18 | } 19 | } 20 | return cookieValue; 21 | } 22 | 23 | var csrftoken = getCookie('csrftoken'); -------------------------------------------------------------------------------- /project/statics/js/temporal.js: -------------------------------------------------------------------------------- 1 | function draw_graph(return_value) { 2 | console.log(return_value.graph); 3 | var json_obj = return_value.graph; 4 | 5 | var svg = d3.select("svg"), 6 | width = +svg.attr("width"), 7 | height = +svg.attr("height"), 8 | node, 9 | link; 10 | 11 | svg.append('defs').append('marker') 12 | .attrs({'id':'arrowhead', 13 | 'viewBox':'-0 -5 10 10', 14 | 'refX':13, 15 | 'refY':0, 16 | 'orient':'auto', 17 | 'markerWidth':13, 18 | 'markerHeight':13, 19 | 'xoverflow':'visible'}) 20 | .append('svg:path') 21 | .attr('d', 'M 0,-5 L 10 ,0 L 0,5') 22 | .attr('fill', '#999') 23 | .style('stroke','none'); 24 | 25 | var simulation = d3.forceSimulation() 26 | .force("link", d3.forceLink().id(function (d) {return d.id;}).distance(100).strength(1)) 27 | .force("charge", d3.forceManyBody()) 28 | .force("center", d3.forceCenter(width / 2, height / 2)); 29 | 30 | // d3.json(json_obj, 
function (error, graph) { 31 | // if (error) throw error; 32 | // update(graph.links, graph.nodes); 33 | // }) 34 | update(json_obj.links, json_obj.nodes) 35 | 36 | // Define the div for the tooltip 37 | var div = d3.select("body").append("div") 38 | .attr("class", "tooltip") 39 | .style("opacity", 0); 40 | 41 | 42 | function update(links, nodes) { 43 | link = svg.selectAll(".link") 44 | .data(links) 45 | .enter() 46 | .append("line") 47 | .attr("class", "link") 48 | .attr('marker-end','url(#arrowhead)') 49 | 50 | link.append("title") 51 | .text(function (d) {return d.type;}); 52 | 53 | edgepaths = svg.selectAll(".edgepath") 54 | .data(links) 55 | .enter() 56 | .append('path') 57 | .attrs({ 58 | 'class': 'edgepath', 59 | 'fill-opacity': 0, 60 | 'stroke-opacity': 0, 61 | 'id': function (d, i) {return 'edgepath' + i} 62 | }) 63 | .style("pointer-events", "none"); 64 | 65 | edgelabels = svg.selectAll(".edgelabel") 66 | .data(links) 67 | .enter() 68 | .append('text') 69 | .style("pointer-events", "none") 70 | .attrs({ 71 | 'class': 'edgelabel', 72 | 'id': function (d, i) {return 'edgelabel' + i}, 73 | 'font-size': 10, 74 | 'fill': '#aaa' 75 | }); 76 | 77 | edgelabels.append('textPath') 78 | .attr('xlink:href', function (d, i) {return '#edgepath' + i}) 79 | .style("text-anchor", "middle") 80 | .style("pointer-events", "none") 81 | .attr("startOffset", "50%") 82 | .text(function (d) {return d.type}); 83 | 84 | node = svg.selectAll(".node") 85 | .data(nodes) 86 | .enter() 87 | .append("g") 88 | .attr("class", "node") 89 | .call(d3.drag() 90 | .on("start", dragstarted) 91 | .on("drag", dragged) 92 | //.on("end", dragended) 93 | ); 94 | 95 | node.append("circle") 96 | .attr("r", 12) 97 | // .style("fill", function (d, i) {return colors(i);}) 98 | .style("fill", function (d) {return d.color;}) 99 | .on("mouseover", function(d) { 100 | div.transition() 101 | .duration(200) 102 | .style("opacity", .9); 103 | div .html(d.type ) 104 | .style("left", (d3.event.pageX) + "px") 105 
| .style("top", (d3.event.pageY - 28) + "px"); 106 | }) 107 | .on("mouseout", function(d) { 108 | div.transition() 109 | .duration(500) 110 | .style("opacity", 0); 111 | }) 112 | 113 | // node.append("title") 114 | // .text(function (d) {return d.id;}); 115 | 116 | node.append("text") 117 | .attr("dy", -1) 118 | .text(function (d) {return d.name+":"+d.label;}); 119 | 120 | simulation 121 | .nodes(nodes) 122 | .on("tick", ticked); 123 | 124 | simulation.force("link") 125 | .links(links); 126 | } 127 | 128 | function ticked() { 129 | link 130 | .attr("x1", function (d) {return d.source.x;}) 131 | .attr("y1", function (d) {return d.source.y;}) 132 | .attr("x2", function (d) {return d.target.x;}) 133 | .attr("y2", function (d) {return d.target.y;}); 134 | 135 | node 136 | .attr("transform", function (d) {return "translate(" + d.x + ", " + d.y + ")";}); 137 | 138 | edgepaths.attr('d', function (d) { 139 | return 'M ' + d.source.x + ' ' + d.source.y + ' L ' + d.target.x + ' ' + d.target.y; 140 | }); 141 | 142 | edgelabels.attr('transform', function (d) { 143 | if (d.target.x < d.source.x) { 144 | var bbox = this.getBBox(); 145 | 146 | rx = bbox.x + bbox.width / 2; 147 | ry = bbox.y + bbox.height / 2; 148 | return 'rotate(180 ' + rx + ' ' + ry + ')'; 149 | } 150 | else { 151 | return 'rotate(0)'; 152 | } 153 | }); 154 | } 155 | 156 | function dragstarted(d) { 157 | if (!d3.event.active) simulation.alphaTarget(0.3).restart() 158 | d.fx = d.x; 159 | d.fy = d.y; 160 | } 161 | 162 | function dragged(d) { 163 | d.fx = d3.event.x; 164 | d.fy = d3.event.y; 165 | } 166 | 167 | 168 | } 169 | 170 | json_obj = { 171 | "nodes": [ 172 | { 173 | "name": "Peter", 174 | "label": "Person", 175 | "id": 1 176 | }, 177 | { 178 | "name": "Michael", 179 | "label": "Person", 180 | "id": 2 181 | }, 182 | { 183 | "name": "Neo4j", 184 | "label": "Database", 185 | "id": 3 186 | }, 187 | { 188 | "name": "Graph Database", 189 | "label": "Database", 190 | "id": 4 191 | } 192 | ], 193 | "links": [ 194 
| { 195 | "source": 1, 196 | "target": 2, 197 | "type": "KNOWS", 198 | }, 199 | { 200 | "source": 1, 201 | "target": 3, 202 | "type": "FOUNDED" 203 | }, 204 | { 205 | "source": 2, 206 | "target": 3, 207 | "type": "WORKS_ON" 208 | }, 209 | { 210 | "source": 3, 211 | "target": 4, 212 | "type": "IS_A" 213 | } 214 | ] 215 | } 216 | 217 | -------------------------------------------------------------------------------- /project/templates/index.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 | 5 | 6 | 7 | Event Detection Pipeline 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 37 |
38 |
39 | 40 |
41 |

Description

42 |
43 | Powered by state-of-the-art event-related knowledge extraction models, EventPlus extracts and integrates event triggers, corresponding arguments and roles, event durations, temporal relations between events, and more. 44 | Please click on the "Feature &Task Help" button on the top right to learn how to interpret the results! 45 | 46 |
47 | 48 |
49 |
50 |
51 |
52 | 53 | 58 |

59 |
60 |
61 | 62 |
63 | 71 |
72 |
73 |
74 | 75 |
76 |

Text Input

77 |
78 | 79 | 80 |
81 |
82 |
83 | 84 |
85 |
86 |
87 | 88 |
89 |
90 |

   Annotation

91 |
92 |
93 |
94 |
95 |
96 |
97 | 98 |
99 |

   Temporal Relation

100 |
101 |
102 |
103 |
104 | 105 | 106 | 168 | 169 | 170 | 171 | 172 |
173 |

174 |
175 | USC ISI all rights reserved. Event Detection Pipeline 176 |

177 |
178 | 179 | 180 | 181 | 182 | --------------------------------------------------------------------------------