├── .gitignore ├── DATA └── wiki25.jkaln ├── LICENSE ├── README.md ├── configs ├── amr2.0-structured-bart-large-joint-voc-neur-al.sh ├── amr2.0-structured-bart-large-joint-voc.sh ├── amr2.0-structured-bart-large-neur-al-importance-sampling5.sh ├── amr2.0-structured-bart-large-neur-al-sampling5.sh ├── amr2.0-structured-bart-large-neur-al.sh ├── amr2.0-structured-bart-large.sh ├── amr2joint_ontowiki2_g2g-structured-bart-large.sh ├── amr3.0-structured-bart-large-doc-sliding-ws300x200.sh ├── amr3.0-structured-bart-large-doc-truncate-sliding-finetune-ws200x100.sh ├── amr3.0-structured-bart-large-doc-truncate-sliding-ws300x200.sh ├── amr3.0-structured-bart-large-doc-truncate.sh ├── amr3.0-structured-bart-large-doc_MODE-doc+sen-truncate-sliding-ws200x100.sh ├── amr3.0-structured-bart-large-doc_MODE-doc+sen-truncate.sh ├── amr3.0-structured-bart-large-joint-voc-neur-al.sh ├── amr3.0-structured-bart-large-joint-voc.sh ├── amr3.0-structured-bart-large-neur-al-sampling5.sh ├── amr3.0-structured-bart-large-neur-al.sh ├── amr3.0-structured-bart-large.sh ├── amr3joint_ontowiki2_g2g-structured-bart-large.sh ├── both_doc+sen.sh ├── both_doc+sen_packed.sh ├── both_doc+sen_trainsliding_ws400x100.sh ├── doc-finetune-from-conll-good-ws300x200-lr00005.sh ├── gold_doc+sen.sh ├── little_prince-structured-bart-base-neur-al.sh ├── wiki25-structured-bart-base-joint-voc.sh ├── wiki25-structured-bart-base-neur-al-importance-sampling.sh ├── wiki25-structured-bart-base-neur-al-mini.sh ├── wiki25-structured-bart-base-neur-al-sampling5.sh ├── wiki25-structured-bart-base-neur-al.sh └── wiki25-structured-bart-base.sh ├── docker └── Dockerfile ├── pyproject.toml ├── run ├── align.sh ├── amr_actions.sh ├── lsf │ ├── README.md │ ├── align.sh │ ├── final_test.sh │ ├── parse.sh │ ├── run_experiment.sh │ └── run_model_eval.sh ├── parse.sh ├── preprocess.sh ├── run_experiment.sh ├── run_model_eval.sh ├── run_model_eval_sliding.sh ├── status.py ├── status.sh ├── test.sh ├── train.sh └── train_aligner.sh ├── scripts ├── Blinker.py ├── README.md ├── add_wiki.py ├── amr_latex.py ├── convert_jamr_alignments_to_offsets.py ├── doc-amr │ ├── docamr_utils.py │ ├── get_doc_amr_from_sen.py │ ├── pack_amrs.py │ ├── remove_amrs.py │ └── remove_sen.py ├── export_alignment_model.sh ├── export_model.sh ├── install_satori.sh ├── jamr2isi.py ├── mbse.py ├── merge_files.py ├── parse.sh ├── play.py ├── plot_amr.py ├── plot_amr_latex.py ├── plot_results.py ├── read_propbank.py ├── remove_optimizer_state.py ├── remove_wiki.py ├── retyper.py ├── sanity_check_amr.py ├── smatch_aligner.py ├── split_amrs.py ├── tokenize_amr.py ├── triple_stats.py └── vimdiff_amr_files.py ├── service ├── amr.proto ├── amr2.proto ├── amr_client.py ├── amr_server.py └── wordvec.proto ├── setup.py ├── src ├── fairseq_ext │ ├── __init__.py │ ├── amr_reform │ │ ├── __init__.py │ │ └── o10_action_reformer_subtok.py │ ├── amr_spec │ │ ├── __init__.py │ │ ├── action_info.py │ │ ├── action_info_bartsv.py │ │ ├── action_info_binarize.py │ │ ├── action_info_binarize_bartsv.py │ │ ├── action_info_binarize_graphmp.py │ │ ├── action_info_binarize_graphmp_amr1.py │ │ ├── action_info_graphmp.py │ │ ├── action_info_graphmp_amr1.py │ │ ├── old_action_info.py │ │ └── old_action_info_binarize.py │ ├── average_checkpoints.py │ ├── binarize.py │ ├── criterions │ │ ├── __init__.py │ │ ├── label_smoothed_cross_entropy_pointer.py │ │ └── label_smoothed_cross_entropy_pointer_alignment.py │ ├── data │ │ ├── __init__.py │ │ ├── amr_action_pointer_bartsv_dataset.py │ │ ├── 
amr_action_pointer_dataset.py │ │ ├── amr_action_pointer_goldamr_dataset.py │ │ ├── amr_action_pointer_graphmp_dataset.py │ │ ├── amr_bpe.py │ │ ├── data_utils.py │ │ ├── indexed_dataset.py │ │ └── language_pair_dataset.py │ ├── extract_bart │ │ ├── __init__.py │ │ ├── binarize_encodings.py │ │ ├── composite_embeddings.py │ │ ├── mapavg_embeddings.py │ │ └── sentence_encoding.py │ ├── generate.py │ ├── generate_sliding.py │ ├── models │ │ ├── __init__.py │ │ ├── attention_masks.py │ │ ├── graph_attention_masks.py │ │ ├── graphmp_attention_masks.py │ │ ├── transformer_tgt_pointer.py │ │ ├── transformer_tgt_pointer_bart.py │ │ ├── transformer_tgt_pointer_bart_sattn.py │ │ ├── transformer_tgt_pointer_bartsv.py │ │ ├── transformer_tgt_pointer_bartsv_sattn.py │ │ ├── transformer_tgt_pointer_graph.py │ │ └── transformer_tgt_pointer_graphmp.py │ ├── modules │ │ ├── __init__.py │ │ ├── factored_embeddings.py │ │ ├── multihead_attention.py │ │ ├── multihead_attention_old.py │ │ ├── transformer_layer.py │ │ └── transformer_layer_old.py │ ├── options.py │ ├── options_train.py │ ├── preprocess.py │ ├── preprocess_bart.py │ ├── preprocess_bartsv.py │ ├── preprocess_graphmp.py │ ├── roberta │ │ ├── __init__.py │ │ ├── binarize_embeddings.py │ │ ├── pretrained_embeddings.py │ │ └── pretrained_embeddings_bert.py │ ├── sequence_generator.py │ ├── sequence_generator_bartsv.py │ ├── sequence_generator_graph.py │ ├── sequence_generator_graphmp.py │ ├── tasks │ │ ├── __init__.py │ │ ├── amr_action_pointer.py │ │ ├── amr_action_pointer_bart.py │ │ ├── amr_action_pointer_bart_dyo.py │ │ ├── amr_action_pointer_bartsv.py │ │ ├── amr_action_pointer_graphmp.py │ │ └── amr_action_pointer_graphmp_amr1.py │ ├── tests │ │ ├── test_action_info_graphmp_tofile.py │ │ ├── test_action_info_tofile.py │ │ ├── test_action_info_tolist.py │ │ ├── test_amr_action_bpe.py │ │ ├── test_amr_action_unk.py │ │ ├── test_composite_embeddings.py │ │ ├── test_composite_embeddings_mapping.py │ │ └── test_factored_embeddings.py │ ├── tokenizer.py │ ├── train.py │ ├── utils.py │ ├── utils_font.py │ └── utils_import.py ├── ibm_neural_aligner │ ├── README.md │ ├── __init__.py │ ├── align_leamr.py │ ├── align_utils.py │ ├── alignment_decoder.py │ ├── amr_utils.py │ ├── ccc.launch_many_jobs.py │ ├── ccc.summarize.py │ ├── dummy_align.py │ ├── evaluation.py │ ├── formatter.py │ ├── gcn.py │ ├── gypsum │ │ ├── setup_amr2.sh │ │ ├── setup_amr3.sh │ │ ├── sweep.0.py │ │ ├── sweep.4a.py │ │ ├── templates.py │ │ ├── view_sweep.2.py │ │ └── view_sweep.py │ ├── install.sh │ ├── leamr_align.py │ ├── lexicon.py │ ├── main.py │ ├── make_splits.py │ ├── metric_utils.py │ ├── pretrained_embeddings.py │ ├── pretrained_embeddings.sh │ ├── resolve_manual_alignments.py │ ├── run_detailed_eval.py │ ├── run_eval.py │ ├── run_model_selection.py │ ├── run_sampler.py │ ├── standalone_elmo.py │ ├── tokenize_amr.py │ ├── transformer_lm.py │ ├── tree_lstm.py │ ├── tree_rnn.py │ ├── view_manual_alignments.py │ ├── vocab.py │ └── vocab_definitions.py └── transition_amr_parser │ ├── __init__.py │ ├── action_pointer │ ├── __init__.py │ ├── amr_parser.py │ ├── o8_data_oracle.py │ ├── o8_fake_parse.py │ ├── o8_state_machine.py │ ├── o8_state_machine_amr1.py │ ├── o8_state_machine_reformer.py │ ├── o8_state_machine_reformer_amr1.py │ ├── parse.py │ └── roberta_utils.py │ ├── add_id_to_amr.py │ ├── add_sentence_amrs_to_file.py │ ├── amr.py │ ├── amr_aligner.py │ ├── amr_constituents.py │ ├── amr_latex.py │ ├── amr_machine.py │ ├── clbar.py │ ├── force_overlap_actions.py │ ├── 
gold_subgraph_align.py │ ├── io.py │ ├── make_sliding_splits.py │ ├── merge_sliding_splits.py │ ├── parse.py │ └── plots.py └── tests ├── align_mode.py ├── align_mode.sh ├── all.sh ├── amr_io.py ├── amr_io.sh ├── correctly_installed.py ├── correctly_installed.sh ├── create_wiki25_mockup.sh ├── download_little_prince.sh ├── fairseq_data_iterator.py ├── fairseq_data_iterator.sh ├── minimal_test.sh ├── minimal_test_lsf.sh ├── neural_aligner.sh ├── oracles ├── amr_o10.sh └── amr_o10_doc.sh ├── smatch.sh ├── standalone-doc.sh ├── standalone.sh └── tokenizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # These should not be commited 2 | # environment setter. May contain keys and other THIS CAN NOT BE COMMITED 3 | set_environment.sh 4 | 5 | # git 6 | *.orig 7 | .mailmap 8 | 9 | # data 10 | media/ 11 | PROGRESS* 12 | DATA* 13 | *.zip 14 | *.json 15 | *.npy 16 | # *.npy.* 17 | 18 | # run scripts logs 19 | logs* 20 | *.log 21 | 22 | # external tools 23 | amr-evaluation/ 24 | git-filter-repo/ 25 | 26 | # virtual environments 27 | .python-version 28 | venv*/ 29 | cenv*/ 30 | 31 | # debug 32 | *.lprof 33 | debug* 34 | tmp* 35 | TMP* 36 | 37 | # other 38 | __pycache__/ 39 | *.ipynb_checkpoints/ 40 | *.ipynb 41 | transition_amr_parser.egg-info/ 42 | 43 | # python package 44 | dist/ 45 | 46 | # cluster tools 47 | jbsub_logs/ 48 | .jbsub_logs/ 49 | 50 | # vim 51 | .vim/ 52 | ctags 53 | *.swp 54 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.access.redhat.com/ubi8/ubi:latest as rhel-base 2 | 3 | RUN dnf update -y \ 4 | && dnf install -y --disableplugin=subscription-manager \ 5 | python38 \ 6 | python38-setuptools \ 7 | python38-wheel \ 8 | python38-pip \ 9 | python38-devel \ 10 | make \ 11 | git \ 12 | glibc-langpack-en \ 13 | curl \ 14 | gcc \ 15 | gcc-c++ \ 16 | unzip \ 17 | && dnf autoremove -y \ 18 | && dnf clean all \ 19 | && pip3 install --upgrade --no-cache-dir pip 20 | 21 | ADD https://github.com/ibmruntimes/semeru8-binaries/releases/download/jdk8u302-b08_openj9-0.27.0/ibm-semeru-open-8-jdk-1.8.0.302.b08-1.x86_64.rpm . 22 | RUN dnf install -y ibm-semeru-open-8-jdk-1.8.0.302.b08-1.x86_64.rpm 23 | 24 | #RUN locale-gen en_US.UTF-8 && /usr/sbin/update-locale LANG=en_US.UTF-8 25 | ENV LANG en_US.UTF-8 26 | ENV LANGUAGE en_US:en 27 | ENV LC_ALL en_US.UTF-8 28 | # Model Location 29 | 30 | ENV MODEL_PATH "DATA/gofa20220412/models/exp_cofill_o10_act-states_bart.base/_act-pos_vmask1_shiftpos1_ptr-lay6-h1_cam-layall-h2-abuf_dec-sep-emb-sha0_bart-init-dec-emb__fp16-_lr0.0001-mt1024x8-wm4000-dp0.2/ep15-seed44/checkpoint_top3-avg.pt" 31 | 32 | # GRPC Port (so that it can be set during run time) 33 | ENV GRPC_PORT "50051" 34 | 35 | # Set cache paths 36 | ENV CACHE_DIR "DATA" 37 | ENV ROBERTA_CACHE_PATH ${CACHE_DIR}/bart.base 38 | ENV PYTHONPATH /amr_parser/pip_modules 39 | ## Install grpc for python3 40 | 41 | FROM rhel-base as rhel-stage1 42 | ARG ARTIFACTORY_USERNAME 43 | ARG ARTIFACTORY_API_KEY 44 | ENV ARTIFACTORY_USERNAME=$ARTIFACTORY_USERNAME 45 | ENV ARTIFACTORY_API_KEY=$ARTIFACTORY_API_KEY 46 | 47 | # ADD . 
/amr_parser/ 48 | WORKDIR /amr_parser 49 | 50 | COPY LICENSE README.md setup.py requirements.txt /amr_parser/ 51 | COPY DATA/ /amr_parser/DATA 52 | COPY preprocess/ /amr_parser/preprocess 53 | COPY scripts/ /amr_parser/scripts/ 54 | COPY tests/ /amr_parser/tests 55 | COPY configs/ /amr_parser/configs/ 56 | COPY fairseq_ext/ /amr_parser/fairseq_ext/ 57 | COPY service/ /amr_parser/service 58 | COPY transition_amr_parser/ /amr_parser/transition_amr_parser/ 59 | 60 | RUN python3 -m pip install -t ${PYTHONPATH} --upgrade pip \ 61 | && python3 -m pip install -t ${PYTHONPATH} protobuf grpcio grpcio-tools grpcio-health-checking \ 62 | && python3 -m pip install -t ${PYTHONPATH} statsd 63 | 64 | # Copy code 65 | # ADD . /amr_parser 66 | 67 | #RUN scripts/update_config.sh \ 68 | # --artifactory_username $ARTIFACTORY_USERNAME \ 69 | # --artifactory_api_key $ARTIFACTORY_API_KEY \ 70 | # --encode_username 71 | #RUN pip install -t ${PYTHONPATH} 'pyizumo[dp]'==0.1.5 watson-sire==1.0.18 requests 72 | 73 | ARG TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX 7.5+PTX" 74 | # Install the packages 75 | RUN python3 -m pip install -t ${PYTHONPATH} . 76 | #RUN pip3 install -t ${PYTHONPATH} -r requirements.txt 77 | RUN rm -rf ${PYTHONPATH}/dataclasses* 78 | RUN python3 -m pip install -t ${PYTHONPATH} torch-scatter==1.3.2 79 | 80 | # Compile the protos 81 | RUN python3 -m grpc_tools.protoc -I./service/ --python_out=./service/ --grpc_python_out=./service/ ./service/wordvec.proto 82 | RUN python3 -m grpc_tools.protoc -I./service/ --python_out=./service/ --grpc_python_out=./service/ ./service/amr2.proto 83 | # RUN chown worker:worker /amr_parser 84 | RUN rm -rf *.zip 85 | RUN chmod -R 777 /amr_parser 86 | 87 | FROM golang:1.17 AS grpcurl_build 88 | RUN go install github.com/fullstorydev/grpcurl/cmd/grpcurl@latest 89 | 90 | FROM rhel-base as amr-final 91 | COPY --from=rhel-stage1 /amr_parser/ /amr_parser/ 92 | COPY --from=grpcurl_build /go/bin/grpcurl /usr/local/bin/grpcurl 93 | # start the server 94 | ENV PYTHONPATH "/amr_parser:/amr_parser/server:/amr_parser/pip_modules" 95 | 96 | WORKDIR /amr_parser 97 | RUN ls -l /amr_parser/DATA 98 | RUN ls -l /amr_parser/DATA/bart.base 99 | #RUN mkdir -p /.cache && chmod -R 777 /.cache 100 | #RUN python3 service/amr_test.py -m ${MODEL_PATH} -c ${ROBERTA_CACHE_PATH} 101 | CMD python3 -u service/amr_server.py --in-model ${MODEL_PATH} --roberta-cache-path ${ROBERTA_CACHE_PATH} --port ${GRPC_PORT} 102 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "transition_neural_parser" 7 | version = '0.5.4' 8 | authors = [ 9 | { name="Ramon", email="ramon.astudillo@ibm.com" }, 10 | { name="Young-Suk", email="ysuklee@us.ibm.com" }, 11 | { name="Tahira", email="tnaseem@us.ibm.com" }, 12 | { name="Sadhana", email="sadhana.kumaravel1@ibm.com" }, 13 | { name="GX", email="GX.Xu@ibm.com" }, 14 | { name="Hans", email="raduf@us.ibm.com" }, 15 | { name="Salim", email="roukos@us.ibm.com" }, 16 | ] 17 | description = "The package for transition based nueral AMR parser" 18 | readme = "README.md" 19 | requires-python = ">=3.7" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Operating System :: POSIX :: Linux", 24 | ] 25 | 26 | [tool.poetry.platforms] 27 | os = "linux, darwin" 28 | 
29 | [project.urls] 30 | homepage = "https://github.com/IBM/transition-amr-parser" 31 | tracker = "https://github.com/IBM/transition-amr-parser" -------------------------------------------------------------------------------- /run/align.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | set -o errexit 3 | . set_environment.sh 4 | HELP="$0 " 5 | [ -z $1 ] && echo "$HELP" && exit 1 6 | [ -z $2 ] && echo "$HELP" && exit 1 7 | [ -z $3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | in_amr=$2 10 | out_amr=$3 11 | set -o nounset 12 | 13 | amr-parse --in-checkpoint $checkpoint --in-amr $in_amr --out-amr $out_amr --batch-size 512 --roberta-batch-size 512 14 | -------------------------------------------------------------------------------- /run/lsf/README.md: -------------------------------------------------------------------------------- 1 | 2 | This code is intended to train models from scratch on the CCC cluster but can 3 | be repurposed for other task managers e.g. slurm. You can do a mini run to 4 | check how this all works under 5 | 6 | bash tests/minimal_test_lsf.sh 7 | 8 | First of all make sure you have installed according to README.md. Be sure to 9 | activate your environment in `set_environment.sh` since this is called by the 10 | different scripts 11 | 12 | Then ensure you have unzipped the data from its location, you will need at least 13 | 14 | 1. the corpus you want to train for e.g. AMr2.0 (optionally already aligned) 15 | 16 | 2. the entity linking cache for that corpus 17 | 18 | once you have unzipped these items we are ready to go. The code is though to be 19 | latched from a **login node** not a compute node. You will need some app to 20 | have a pervasive session on that login node (this is a good idea in general) 21 | like tmux (recommended) or screen. From one of those do e.g. 22 | 23 | bash run/lsf/run_experiment.sh configs/amr2.0-structured-bart-large-neur-al.sh 24 | 25 | this will launch all the needed jobs in a dependent fashion so that one is run 26 | after another (seeds will be ran in parallel). It will also display the status 27 | of the training. The script will hold until the first checkpoint is created to 28 | launch the evaluation jobs. This is why this command line call needs to be kept 29 | alive, after that it is no longer necessary. 30 | 31 | At any point you can do 32 | 33 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh 34 | 35 | to check the status of that experiment. Once results start appearing, you can use 36 | 37 | bash run/status.sh --configs configs/amr2.0-structured-bart-large-neur-al.sh --results 38 | 39 | to check progress. To compare models and get details of loss and Smatch, you 40 | can plot a png and bring it locally with scp with 41 | 42 | python scripts/plot_results.py --in-configs configs/amr2.0-structured-bart-large-neur-al.sh --title my-training --out-png my-training.png 43 | 44 | each step of the experiment has its own folder and it is completed it should 45 | have a `.done` file. If you delete this the stage will be redone (not the 46 | neural aligner has multiple of these files). The final model should be found under e.g. 47 | 48 | DATA/AMR2.0/models/amr2.0-structured-bart-large-neur-al/ 49 | 50 | We try to avoid running on the tests set to prevent corpus overfitting, this 51 | can be done with 52 | 53 | bash run/lsf/final_test.sh configs/amr2.0-structured-bart-large-neur-al.sh 54 | 55 | It will ask you to confirm. 
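Before launching the final test you can also double-check by hand which stages of an experiment already finished: every completed stage folder contains a `.done` marker (the neural aligner keeps several of these, as noted above). Below is a minimal sketch of such a check — it is not part of the repository (`run/status.py` is the real tool) and the `DATA/AMR2.0` path is only an example:

```
# minimal sketch, NOT run/status.py: list stage folders that contain a .done marker
import sys
from pathlib import Path

def report_done_markers(data_root="DATA"):
    root = Path(data_root)
    # ".done*" also catches the aligner's extra markers such as .done.train
    done_dirs = sorted({p.parent for p in root.rglob(".done*")})
    for folder in done_dirs:
        print(f"[done] {folder}")
    if not done_dirs:
        print(f"no completed stages found under {root}")

if __name__ == "__main__":
    # e.g. python check_done.py DATA/AMR2.0
    report_done_markers(sys.argv[1] if len(sys.argv) > 1 else "DATA")
```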
56 | 57 | Once training is done you can save space by calling 58 | 59 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --final-remove 60 | 61 | This will remove the optimizer from configs `DECODING_CHECKPOINT` and delete 62 | all other. Save copies if you want further train later. 63 | 64 | to save the minimal files needed for a model into a zip do 65 | 66 | bash scripts/export_model.sh configs/amr2.0-structured-bart-large-neur-al.sh 67 | 68 | ## Things that can go wrong 69 | 70 | Code is built to be able to resume if it stops, just do 71 | 72 | bash run/lsf/run_experiment.sh configs/amr2.0-structured-bart-large-neur-al.sh 73 | 74 | But it should not die, so if it did it is important to find the reason first 75 | before resuming. 76 | 77 | The most common problem is that you hit your space quota and code dies halfway 78 | while writing a checkpoint. You need to know how to check your quota to avoid 79 | this. Also the jobs doing evaluation also take care of removing checkpoints. If 80 | these die then your space can finish quickly. This should not happen and it is 81 | best to find the reason why this happened before relaunching evaluation. You 82 | can do this with 83 | 84 | bash run/lsf/run_model_eval.sh configs/amr2.0-structured-bart-large-neur-al.sh 85 | 86 | If you hit your quota, you need to fix that first, then you will also have to 87 | find and delete corrupted checkpoints. For this you can use 88 | 89 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --remove-corrupted-checkpoints 90 | 91 | the code automatically calls 92 | 93 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --link-best --remove 94 | 95 | to find the best checkpoint and remove checkpoints not in the top n-best, but 96 | it may come handy to run this yourself at some point. It is already a bad 97 | state of affairs if some checkpoint got deleted without being evaluated, but 98 | you can always ignore this by adding `--ignore-missing-checkpoints` 99 | 100 | ## Parsing Large Corpora 101 | 102 | calling this script on a login node 103 | 104 | ``` 105 | bash run/lsf/parse.sh [-s chunk size] 106 | ``` 107 | 108 | will split your data into chunks of `` and launch a paralell job for each. Results for each chunk are stored in 109 | 110 | ``` 111 | .split_ 112 | ``` 113 | 114 | once all jobs are completed to recompose, just do 115 | 116 | ``` 117 | cat .split_* > 118 | ``` 119 | -------------------------------------------------------------------------------- /run/lsf/align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | . set_environment.sh 5 | set -o nounset 6 | 7 | # Argument handling 8 | # First argument must be checkpoint 9 | HELP="\nbash $0 [-s ]\n" 10 | [ -z "$1" ] && echo -e "$HELP" && exit 1 11 | [ -z "$2" ] && echo -e "$HELP" && exit 1 12 | [ -z "$3" ] && echo -e "$HELP" && exit 1 13 | first_path=$(echo $1 | sed 's@:.*@@g') 14 | [ ! 
-f "$first_path" ] && "Missing $1" && exit 1 15 | checkpoint=$1 16 | in_amr=$2 17 | out_amr=$3 18 | # process the rest with argument parser 19 | max_split_size=2000 20 | shift 21 | shift 22 | shift 23 | while [ "$#" -gt 0 ]; do 24 | case "$1" in 25 | -s) max_split_size="$2"; shift 2;; 26 | *) echo "unrecognized argument: $1"; exit 1;; 27 | esac 28 | done 29 | 30 | # splits folder 31 | splits_folder=${out_amr}.${max_split_size}splits/ 32 | mkdir -p $splits_folder 33 | 34 | # Split files 35 | split_files=$( 36 | python scripts/split_amrs.py \ 37 | $in_amr $max_split_size ${splits_folder}/in_split 38 | ) 39 | 40 | # Launch multiple decodings jobs 41 | for split in $split_files;do 42 | 43 | echo "bash run/align.sh $checkpoint $split ${split}.amr" 44 | 45 | if [ ! -f "${split}.amr" ];then 46 | 47 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 48 | -name $(basename $split)-$$ \ 49 | -out ${splits_folder}/align-%J-$$.stdout \ 50 | -err ${splits_folder}/align-%J-$$.stderr \ 51 | /bin/bash run/align.sh $checkpoint $split ${split}.amr 52 | 53 | fi 54 | 55 | done 56 | -------------------------------------------------------------------------------- /run/lsf/final_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\ne.g. bash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | if [ -z "$2" ];then 10 | # identify experiment by the repository tag 11 | jbsub_basename="$(basename $config | sed 's@\.sh$@@')" 12 | else 13 | # identify experiment by given tag 14 | jbsub_basename=$2 15 | fi 16 | # set environment (needed for the python code below) 17 | # NOTE: Old set_environment.sh forbids launching in login node. 18 | . set_environment.sh 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # MANUAL OVERRIDE !! 27 | # BEAM_SIZE=1 28 | # DECODING_CHECKPOINT=checkpoint_wiki.smatch_best1.pt 29 | 30 | # Running test announcement 31 | printf "\n\033[93mWARNING\033[0m: Everytime you look at the test set, your corpus dies a little (by corpus overfitting)\n\n" 32 | echo -e " \nbash run/ad_test.sh ${MODEL_FOLDER}seed${SEEDS}/$DECODING_CHECKPOINT -b $BEAM_SIZE -s test\n" 33 | read -p "Do you wish to continue? 
Y/[N]" answer 34 | [ "$answer" != "Y" ] && exit 1 35 | 36 | # Exit if we launch this directly from a computing node 37 | if [[ "$HOSTNAME" =~ dccpc.* ]] || [[ "$HOSTNAME" =~ dccx[cn].* ]] || [[ "$HOSTNAME" =~ cccx[cn].* ]];then 38 | echo -e "\n$0 must be launched from a login node (submits its own jbsub calls)\n" 39 | exit 1 40 | fi 41 | 42 | for seed in $SEEDS;do 43 | 44 | # define seed and working dir 45 | checkpoints_dir="${MODEL_FOLDER}seed${seed}/" 46 | 47 | # test all available checkpoints and link the best model on dev too 48 | jbsub_tag="fdec-${jbsub_basename}-s${seed}-$$" 49 | jbsub -cores 1+1 -mem 150g -q x86_6h -require v100 \ 50 | -name "$jbsub_tag" \ 51 | -out $checkpoints_dir/${jbsub_tag}-%J.stdout \ 52 | -err $checkpoints_dir/${jbsub_tag}-%J.stderr \ 53 | /bin/bash run/test.sh ${checkpoints_dir}/$DECODING_CHECKPOINT \ 54 | -b $BEAM_SIZE \ 55 | -s test 56 | 57 | done 58 | -------------------------------------------------------------------------------- /run/lsf/parse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | # First argument must be checkpoint 7 | HELP="\nbash $0 [-s ] [--tokenize]\n" 8 | [ -z "$1" ] && echo -e "$HELP" && exit 1 9 | [ -z "$2" ] && echo -e "$HELP" && exit 1 10 | first_path=$(echo $1 | sed 's@:.*@@g') 11 | [ ! -f "$first_path" ] && "Missing $1" && exit 1 12 | checkpoint=$1 13 | tokenized_sentences=$2 14 | # process the rest with argument parser 15 | tokenize="" 16 | max_split_size=2000 17 | shift 18 | shift 19 | while [ "$#" -gt 0 ]; do 20 | case "$1" in 21 | --tokenize) tokenize="--tokenize"; shift 1;; 22 | -s) max_split_size="$2"; shift 2;; 23 | *) echo "unrecognized argument: $1"; exit 1;; 24 | esac 25 | done 26 | 27 | set -o nounset 28 | 29 | # Split files 30 | split -l $max_split_size $tokenized_sentences ${tokenized_sentences}.split_ 31 | 32 | # Launch multiple decodings jobs 33 | for split in $(ls ${tokenized_sentences}.split_*);do 34 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 35 | -name $(basename $split) \ 36 | -out $(dirname $split)%J.stdout \ 37 | -err $(dirname $split)/%J.stderr \ 38 | /bin/bash run/parse.sh $checkpoint $split ${split}.amr $tokenize 39 | done 40 | -------------------------------------------------------------------------------- /run/lsf/run_model_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\ne.g. bash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | if [ -z "$2" ];then 10 | # identify experiment by the repository tag 11 | jbsub_basename="$(basename $config | sed 's@\.sh$@@')" 12 | else 13 | # identify experiment by given tag 14 | jbsub_basename=$2 15 | fi 16 | # set environment (needed for the python code below) 17 | # NOTE: Old set_environment.sh forbids launching in login node. 18 | . set_environment.sh 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! 
-d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # wait until first checkpoint is available for any of the seeds. 39 | # Clean-up checkpoints and inform of status in the meanwhile 40 | python run/status.py -c $config \ 41 | --wait-checkpoint-ready-to-eval --clear 42 | 43 | for seed in $SEEDS;do 44 | 45 | checkpoints_dir="${MODEL_FOLDER}seed${seed}/" 46 | 47 | # test all available checkpoints and link the best model on dev too 48 | jbsub_tag="tdec-${jbsub_basename}-s${seed}-$$" 49 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 50 | -name "$jbsub_tag" \ 51 | -out $checkpoints_dir/${jbsub_tag}-%J.stdout \ 52 | -err $checkpoints_dir/${jbsub_tag}-%J.stderr \ 53 | /bin/bash run/run_model_eval.sh $config "$seed" 54 | 55 | done 56 | 57 | # wait until final models has been evaluated 58 | # NOTE checkpoints are cleaned-up by run_model_eval.sh 59 | python run/status.py -c $config --wait-finished --clear 60 | -------------------------------------------------------------------------------- /run/parse.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | set -o errexit 3 | . set_environment.sh 4 | HELP="$0 [--tokenize]" 5 | [ -z $1 ] && echo "$HELP" && exit 1 6 | [ -z $2 ] && echo "$HELP" && exit 1 7 | [ -z $3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | tokenized_sentences=$2 10 | out_amr=$3 11 | 12 | tokenize="" 13 | shift 3 14 | while [ "$#" -gt 0 ]; do 15 | case "$1" in 16 | --tokenize) tokenize="--tokenize"; shift 1;; 17 | *) echo "unrecognized argument: $1"; exit 1;; 18 | esac 19 | done 20 | 21 | amr-parse \ 22 | --in-checkpoint $checkpoint \ 23 | --in-tokenized-sentences $tokenized_sentences \ 24 | --out-amr $out_amr \ 25 | $tokenize 26 | -------------------------------------------------------------------------------- /run/run_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\nbash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | [ ! -f "$config" ] && "Missing $config" && exit 1 10 | 11 | # activate virtualenenv and set other variables 12 | . set_environment.sh 13 | 14 | set -o nounset 15 | 16 | # random seed 17 | seed=42 18 | # decode in paralel to training. ATTENTION: you will need to GPUS for this 19 | on_the_fly_decoding=false 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | # not using neural aligner but no alignments provided 38 | if [ "$align_tag" != "ibm_neural_aligner" ] && [ ! 
-f $ALIGNED_FOLDER/.done ];then 39 | echo -e "\nYou need to provide $align_tag alignments\n" 40 | exit 1 41 | fi 42 | 43 | # This will store the final model 44 | mkdir -p ${MODEL_FOLDER}seed${seed} 45 | # Copy the config and soft-link it with an easy to find name 46 | cp $config ${MODEL_FOLDER}seed${seed}/ 47 | rm -f ${MODEL_FOLDER}seed${seed}/config.sh 48 | ln -s $(basename $config) ${MODEL_FOLDER}seed${seed}/config.sh 49 | 50 | # Add a tag with the commit(s) used to train this model. 51 | if [ "$(git status --porcelain | grep -v '^??')" == "" ];then 52 | # no uncommited changes 53 | touch "${MODEL_FOLDER}seed${seed}/$(git log --format=format:"%h" -1)" 54 | else 55 | # uncommited changes 56 | touch "${MODEL_FOLDER}seed${seed}/$(git log --format=format:"%h" -1)+" 57 | fi 58 | 59 | echo "[Aligning AMR:]" 60 | mkdir -p $ALIGNED_FOLDER 61 | bash run/train_aligner.sh $config 62 | 63 | echo "[Building oracle actions:]" 64 | mkdir -p $ORACLE_FOLDER 65 | # TODO: replace by task agnostic oracle creation 66 | bash run/amr_actions.sh $config 67 | 68 | echo "[Preprocessing data:]" 69 | mkdir -p $DATA_FOLDER 70 | bash run/preprocess.sh $config 71 | 72 | [ "$on_the_fly_decoding" = true ] \ 73 | && echo "[Decoding and computing smatch (on the fly):]" \ 74 | && bash run/run_model_eval.sh $config $seed & 75 | 76 | echo "[Training:]" 77 | bash run/train.sh $config $seed 78 | 79 | [ "$on_the_fly_decoding" = false ] \ 80 | && echo "[Decoding and computing smatch:]" \ 81 | && bash run/run_model_eval.sh $config $seed 82 | -------------------------------------------------------------------------------- /run/run_model_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # folder of the model seed 39 | checkpoints_folder=${MODEL_FOLDER}seed${seed}/ 40 | 41 | # Evaluate all required checkpoints with EVAL_METRIC 42 | if [ ! 
-f "$checkpoints_folder/epoch_tests/.done" ];then 43 | 44 | mkdir -p "$checkpoints_folder/epoch_tests/" 45 | 46 | # Note this removes models and links best models on the fly 47 | while [ "$(python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove)" != "" ];do 48 | 49 | # get existing checkpoints 50 | ready_checkpoints=$(python run/status.py -c $config --seed $seed --list-checkpoints-ready-to-eval) 51 | 52 | # if there are no checkpoints at this moment, wait and restart loop 53 | if [ "$ready_checkpoints" == "" ];then 54 | printf "\r$$ is waiting for checkpoints of ${config}:$seed" 55 | sleep 1m 56 | continue 57 | fi 58 | echo "" 59 | 60 | # run test for these checkpoints 61 | for checkpoint in $ready_checkpoints;do 62 | results_prefix=$checkpoints_folder/epoch_tests/dec-$(basename $checkpoint .pt) 63 | bash run/test.sh $checkpoint -o $results_prefix 64 | 65 | # clean this checkpoint. This can be helpful if we started a job 66 | # with lots of pending checkpoints to evaluate 67 | python run/status.py -c $config --seed $seed --link-best --remove 68 | done 69 | done 70 | touch $checkpoints_folder/epoch_tests/.done 71 | 72 | else 73 | 74 | printf "[\033[92m done \033[0m] $checkpoints_folder/epoch_tests/.done\n" 75 | 76 | fi 77 | 78 | # This should not be needed, but its a sanity check 79 | python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove 80 | 81 | # 3 checkpoint average 82 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt ]]; then 83 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt " 84 | exit 1 85 | fi 86 | 87 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt ]]; then 88 | python src/fairseq_ext/average_checkpoints.py \ 89 | --input \ 90 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 91 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 92 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 93 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt 94 | fi 95 | 96 | 97 | # 5 checkpoint average 98 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt ]]; then 99 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt " 100 | exit 1 101 | fi 102 | 103 | 104 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt ]]; then 105 | python src/fairseq_ext/average_checkpoints.py \ 106 | --input \ 107 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 108 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 109 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 110 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best4.pt \ 111 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt \ 112 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt 113 | fi 114 | 115 | # Final run 116 | [ ! 
-f "$checkpoints_folder/$DECODING_CHECKPOINT" ] \ 117 | && echo -e "Missing $checkpoints_folder/$DECODING_CHECKPOINT" \ 118 | && exit 1 119 | mkdir -p $checkpoints_folder/beam${BEAM_SIZE}/ 120 | bash run/test.sh $checkpoints_folder/$DECODING_CHECKPOINT -b $BEAM_SIZE 121 | -------------------------------------------------------------------------------- /run/run_model_eval_sliding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # folder of the model seed 39 | checkpoints_folder=${MODEL_FOLDER}seed${seed}/ 40 | 41 | # Evaluate all required checkpoints with EVAL_METRIC 42 | if [ ! -f "$checkpoints_folder/epoch_tests/.done" ];then 43 | 44 | mkdir -p "$checkpoints_folder/epoch_tests/" 45 | 46 | # Note this removes models and links best models on the fly 47 | while [ "$(python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove)" != "" ];do 48 | 49 | # get existing checkpoints 50 | ready_checkpoints=$(python run/status.py -c $config --seed $seed --list-checkpoints-ready-to-eval) 51 | 52 | # if there are no checkpoints at this moment, wait and restart loop 53 | if [ "$ready_checkpoints" == "" ];then 54 | printf "\r$$ is waiting for checkpoints of ${config}:$seed" 55 | sleep 1m 56 | continue 57 | fi 58 | echo "" 59 | 60 | # run test for these checkpoints 61 | for checkpoint in $ready_checkpoints;do 62 | results_prefix=$checkpoints_folder/epoch_tests/dec-$(basename $checkpoint .pt) 63 | bash run/test_sliding.sh $checkpoint -o $results_prefix 64 | 65 | # clean this checkpoint. This can be helpful if we started a job 66 | # with lots of pending checkpoints to evaluate 67 | python run/status.py -c $config --seed $seed --link-best --remove 68 | done 69 | done 70 | touch $checkpoints_folder/epoch_tests/.done 71 | 72 | else 73 | 74 | printf "[\033[92m done \033[0m] $checkpoints_folder/epoch_tests/.done\n" 75 | 76 | fi 77 | 78 | # This should not be needed, but its a sanity check 79 | python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove 80 | 81 | # 3 checkpoint average 82 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt ]]; then 83 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt " 84 | exit 1 85 | fi 86 | 87 | if [[ ! 
-f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt ]]; then 88 | python src/fairseq_ext/average_checkpoints.py \ 89 | --input \ 90 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 91 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 92 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 93 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt 94 | fi 95 | 96 | 97 | # 5 checkpoint average 98 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt ]]; then 99 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt " 100 | exit 1 101 | fi 102 | 103 | 104 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt ]]; then 105 | python src/fairseq_ext/average_checkpoints.py \ 106 | --input \ 107 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 108 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 109 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 110 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best4.pt \ 111 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt \ 112 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt 113 | fi 114 | 115 | # Final run 116 | [ ! -f "$checkpoints_folder/$DECODING_CHECKPOINT" ] \ 117 | && echo -e "Missing $checkpoints_folder/$DECODING_CHECKPOINT" \ 118 | && exit 1 119 | mkdir -p $checkpoints_folder/beam${BEAM_SIZE}/ 120 | bash run/test_sliding.sh $checkpoints_folder/$DECODING_CHECKPOINT -b $BEAM_SIZE 121 | -------------------------------------------------------------------------------- /run/status.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | python run/status.py $@ 7 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Plot AMRs 2 | 3 | To plot in LaTex using tikz, you can use 4 | 5 | ``` 6 | python scripts/plot_amr.py --in-amr DATA/wiki25.jkaln --out-amr tmp.tex 7 | ``` 8 | 9 | Use `--indices` to select AMRs by the order they appear in the file. See 10 | `--has-*` flags to select by graph properties 11 | 12 | To plot using matplotlib (for e.g. notebooks) you can use `AMR.plot()` in the 13 | AMR class 14 | 15 | ## JAMR to ISI notaion 16 | 17 | To convert an AMR file aligned using JAMR (+Kevin) aligner into ISI alignments format. 18 | 19 | ``` 20 | python scripts/jamr2isi.py --in-amr --out-amr 21 | ``` 22 | 23 | ## Understanding the Oracle 24 | 25 | An oracle is a module that given a sentence and its AMR annotation (aligned, 26 | right now) provides a sequence of actions, that played on a state machine 27 | produce back the AMR. 
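Before looking at the real thing, a toy sketch may help fix the contract: the oracle turns a gold graph into a flat action sequence, and replaying those actions on the state machine must rebuild the same graph. The two actions below are invented purely for illustration and are not Oracle10's actual action set:

```
# toy oracle / state-machine round trip (invented actions, NOT Oracle10)
def toy_oracle(nodes, edges):
    actions = [f"NODE {nid} {concept}" for nid, concept in nodes.items()]
    actions += [f"EDGE {src} {label} {tgt}" for src, label, tgt in edges]
    return actions

def toy_machine(actions):
    nodes, edges = {}, []
    for action in actions:
        kind, *rest = action.split()
        if kind == "NODE":
            nodes[rest[0]] = rest[1]
        else:
            edges.append((rest[0], rest[1], rest[2]))
    return nodes, edges

gold_nodes = {"w": "want-01", "b": "boy"}
gold_edges = [("w", ":ARG0", "b")]
assert toy_machine(toy_oracle(gold_nodes, gold_edges)) == (gold_nodes, gold_edges)
```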
Current AMR oracle aka Oracle10 can be explored in 28 | isolation running 29 | 30 | ``` 31 | bash tests/oracles/amr_o10.sh DATA/wiki25.jkaln 32 | ``` 33 | 34 | ## Sanity check AMR 35 | 36 | You can check any AMR against any propbank frames and their rules 37 | 38 | Extract all frames in separate `xml` format file into one single `json` format 39 | ``` 40 | python scripts/read_propbank.py /path/to/amr_2.0/data/frames/xml/ DATA/probank_amr2.0.json 41 | ``` 42 | 43 | Run sanity check, for example 44 | ``` 45 | python scripts/sanity_check_amr.py /path/to/amr2.0/train.txt DATA/probank_amr2.0.json 46 | 47 | 36522 sentences 152897 predicates 48 | 401 role not in propbank 49 | 322 predicate not in propbank 50 | 25 missing required role 51 | ``` 52 | 53 | ## Paired Boostrap Significance Test for Smatch 54 | 55 | The following script implements the paired boostrap significance test after 56 | 57 | @Book{Nor89, 58 | author = {E. W. Noreen}, 59 | title = {Computer-Intensive Methods for Testing Hypotheses}, 60 | publisher = {John Wiley Sons}, 61 | year = {1989}, 62 | } 63 | 64 | . See also [this paper](https://aclanthology.org/W05-0908). To use you can call 65 | 66 | ```bash 67 | python scripts/smatch_aligner.py \ 68 | --in-reference-amr /path/to/gold.amr \ 69 | --in-amrs \ 70 | /path/to/predicted1.amr \ 71 | /path/to/predicted2.amr \ 72 | ... 73 | /path/to/predictedN.amr \ 74 | --amr-labels \ 75 | label1 \ 76 | label2 \ 77 | ... 78 | labelN \ 79 | --bootstrap-test 80 | ``` 81 | 82 | for each pair of predicted amr files, it tests the hypothesis that the 83 | prediction with largest Smatch is significantly greater than the smaller one. 84 | Use `--bootstrap-test-restarts` to set the number of samples (default `10,000`, 85 | note this has little effect on speed). Use `--out-boostrap-png 86 | /path/to/file.png` to save the distribution of score differences for each pair. 87 | Script calls the original `smatch` python module. In order to export components 88 | it needs the main branch after version `1.0.4`. 89 | 90 | ## Maximum Bayesian Smatch Ensemble (MBSE) 91 | 92 | The following script implements MBSE-A as in [(Lee et al 2022)](https://arxiv.org/abs/2112.07790), just do 93 | 94 | ``` 95 | python scripts/mbse.py \ 96 | --in-amrs \ 97 | /path/to/predicted1.amr \ 98 | /path/to/predicted2.amr \ 99 | ... 
100 | /path/to/predictedN.amr \ 101 | --out-amr /path/to/ensemble.amr 102 | ``` 103 | -------------------------------------------------------------------------------- /scripts/add_wiki.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | famr, fwiki, wiki_folder = sys.argv[1:] 8 | 9 | famr = open(famr) 10 | fwiki = open(fwiki) 11 | ftrn = open(f"{wiki_folder}/trn.wikis") 12 | wiki_ht = {} 13 | for line in ftrn: 14 | if len(line.strip().split('\t')) != 2: 15 | continue 16 | (n,w) = line.strip().split('\t') 17 | wiki_ht[n] = w 18 | 19 | # FIXME: Temporary fix for AMR3.0 20 | amr3_file = f"{wiki_folder}/amr3trn.wikis" 21 | if os.path.isfile(amr3_file): 22 | ftrn3 = open(amr3_file) 23 | for line in ftrn3: 24 | if len(line.strip().split('\t')) != 2: 25 | continue 26 | (n,w) = line.strip().split('\t') 27 | if n[-1:]==' ': 28 | n = n[:-1] 29 | if n not in wiki_ht: 30 | wiki_ht[n] = w 31 | 32 | all_wikis = [] 33 | wikis = [] 34 | for line in fwiki: 35 | if line.strip() == "": 36 | all_wikis.append(wikis) 37 | wikis = [] 38 | else: 39 | wikis.append(line.strip().split()) 40 | 41 | lc = 0 42 | while True: 43 | line = famr.readline() 44 | if not line: 45 | break 46 | line = line.rstrip() 47 | if line.strip()=="" : 48 | lc += 1 49 | if ":name" in line: 50 | 51 | #get name 52 | namelines = [] 53 | nextline = famr.readline() 54 | namelines.append(nextline.rstrip()) 55 | tok = "" 56 | while "op" in nextline: 57 | tok += nextline[nextline.find("\"")+1:nextline.rfind("\"")] + " " 58 | if ")" in nextline: 59 | break 60 | nextline = famr.readline() 61 | namelines.append(nextline.rstrip()) 62 | tok = tok.strip() 63 | 64 | #get wiki of the name 65 | #print tok 66 | if tok in wiki_ht: 67 | wiki = wiki_ht[tok] 68 | line = line.replace(":name",":wiki " + wiki + "\t:name") 69 | else: 70 | if tok != "": 71 | for i in range(len(all_wikis[lc])): 72 | if tok.split()[0] in all_wikis[lc][i][0] :# or (all_wikis[lc][i][1] != '-' and all_wikis[lc][i][1] == tok) or tok in all_wikis[lc][i][0] : 73 | wiki = all_wikis[lc][i][1] 74 | if wiki != '-': 75 | wiki = "\""+wiki+"\"" 76 | line = line.replace(":name",":wiki " + wiki + "\t:name") 77 | break 78 | 79 | print(line) 80 | print("\n".join(namelines)) 81 | else: 82 | print(line) 83 | -------------------------------------------------------------------------------- /scripts/amr_latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import collections 3 | import sys 4 | import re 5 | import string 6 | import os 7 | 8 | def replace_symbols(line): 9 | 10 | line = line.replace("\\","") 11 | line = line.replace("$","\$") 12 | line = line.replace("#","*") 13 | line = line.replace("&","and") 14 | line = line.replace("%","\%") 15 | line = line.replace("_","-") 16 | line = line.replace("^","hat") 17 | line = line.replace("(","LBR") 18 | line = line.replace(")","RBR") 19 | line = line.replace("{","LBR") 20 | line = line.replace("}","RBR") 21 | 22 | return line 23 | 24 | def get_node_depth(amr): 25 | 26 | node_TO_edges = collections.defaultdict(list) 27 | for e in amr.edges: 28 | s, y, t = e 29 | node_TO_edges[s].append(e) 30 | 31 | new_edges = [] 32 | 33 | seen = set() 34 | seen.add(amr.root) 35 | 36 | node_TO_lvl = {} 37 | node_TO_lvl[amr.root] = 0 38 | 39 | def helper(root, prefix='0'): 40 | if root not in node_TO_edges: 41 | return 42 | 43 | for i, e in enumerate(node_TO_edges[root]): 44 | s, y, t = e 45 | assert s == 
root 46 | if t in seen: 47 | continue 48 | seen.add(t) 49 | new_prefix = '{}.{}'.format(prefix, i) 50 | node_TO_lvl[t] = new_prefix.count('.') 51 | 52 | helper(t, prefix=new_prefix) 53 | 54 | helper(amr.root) 55 | 56 | return node_TO_lvl 57 | 58 | def get_tikz_latex(amr, tokens, nodes, edges, alignments): 59 | 60 | for i in range(len(tokens)): 61 | tokens[i] = replace_symbols(tokens[i]) 62 | for node in nodes: 63 | nodes[node] = replace_symbols(nodes[node]) 64 | 65 | latex_str = "" 66 | 67 | latex_str += "\\begin{center}\n\\begin{tikzpicture}[scale=1.5]\n" 68 | for i in range(0,len(tokens)): 69 | word = tokens[i] 70 | latex_str += "\\draw(" + str(float(i)*0.8) + ",0) node {" + word[0:10] + "};\n" 71 | 72 | children = {} 73 | for node in nodes: 74 | children[node] = [] 75 | for edge in edges: 76 | if edge[0] == node: 77 | children[node].append(edge[2]) 78 | 79 | node_keys = nodes.keys() 80 | node_TO_lvl = get_node_depth(amr) 81 | levels = {} 82 | for node in nodes: 83 | lvl = node_TO_lvl[node] 84 | if lvl not in levels: 85 | levels[lvl] = [] 86 | levels[lvl].append(node) 87 | max_lvl = max(levels.keys()) 88 | 89 | node_names = {} 90 | for lvl in levels: 91 | y = 0.5 + (max_lvl - lvl) * 1.5 92 | for node in levels[lvl]: 93 | x=-0.8 94 | if node in alignments: 95 | x = float(alignments[node])*0.8 96 | node_names[node] = node.replace(".","_").replace("#", "X") 97 | latex_str += "\\node [draw,rounded corners] (" + str(node_names[node]) + ") at (" + str(x) + "," + str(y) + ") {" + nodes[node] + "};\n" 98 | ''' 99 | plotted = [] 100 | previous_plotted = [] 101 | level = 0 102 | while len(plotted) != len(nodes): 103 | for node in nodes: 104 | if node not in plotted and (len(children[node]) == 0 or all(child in previous_plotted for child in children[node])): 105 | #plot this nodes 106 | x=-0.8 107 | if node in alignments: 108 | x = float(alignments[node])*0.8 109 | y = 0.5 + level * 1.5 110 | node_names[node] = node.replace(".","_") 111 | latex_str += "\\node [draw,rounded corners] (" + str(node_names[node]) + ") at (" + str(x) + "," + str(y) + ") {" + nodes[node] + "};\n" 112 | plotted.append(node) 113 | level += 1 114 | previous_plotted = plotted 115 | ''' 116 | 117 | for edge in edges: 118 | latex_str += "\\draw [-latex,thick] (" + node_names[edge[0]] + ") -- node {\\footnotesize " + replace_symbols(edge[1]) + "} (" + node_names[edge[2]] + ");\n" 119 | 120 | latex_str += "\\end{tikzpicture}\n\\end{center}\n" 121 | 122 | return latex_str 123 | -------------------------------------------------------------------------------- /scripts/doc-amr/pack_amrs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from amr_io import read_amr 7 | from ipdb import set_trace 8 | from docamr_utils import get_sen_ends 9 | 10 | def connect_sen_amrs(amr): 11 | 12 | if len(amr.roots) <= 1: 13 | return 14 | 15 | node_id = amr.add_node("document") 16 | amr.root = str(node_id) 17 | for (i,root) in enumerate(amr.roots): 18 | amr.edges.append((amr.root, ":snt"+str(i+1), root)) 19 | 20 | 21 | def make_packed_amrs(amrs, max_tok=400, randomize=True): 22 | packed_amrs = [] 23 | 24 | keys = [k for k in amrs.keys()] 25 | 26 | indices = np.array(range(len(amrs))) 27 | if randomize: 28 | indices = np.random.permutation(len(amrs)) 29 | 30 | amr = copy.deepcopy(amrs[keys[indices[0]]]) 31 | for idx in indices[1:]: 32 | next_amr = amrs[keys[idx]] 33 | if len(amr.tokens) + len(next_amr.tokens) <= max_tok: 34 | 
amr = amr + copy.deepcopy(next_amr) 35 | else: 36 | connect_sen_amrs(amr) 37 | get_sen_ends(amr) 38 | packed_amrs.append(amr) 39 | amr = copy.deepcopy(next_amr) 40 | 41 | connect_sen_amrs(amr) 42 | get_sen_ends(amr) 43 | packed_amrs.append(amr) 44 | 45 | return packed_amrs 46 | 47 | 48 | def main(args): 49 | 50 | assert args.out_amr 51 | assert args.in_amr 52 | 53 | amrs = read_amr(args.in_amr) 54 | 55 | with open(args.out_amr, 'w') as fid: 56 | packed = make_packed_amrs(amrs) 57 | for amr in packed: 58 | fid.write(amr.__str__()) 59 | 60 | def argument_parser(): 61 | 62 | parser = argparse.ArgumentParser(description='Read AMRs and Corefs and put them together', \ 63 | formatter_class=argparse.RawTextHelpFormatter) 64 | parser.add_argument( 65 | "--in-amr", 66 | help="path to AMR3 annoratations", 67 | type=str 68 | ) 69 | parser.add_argument( 70 | "--out-amr", 71 | help="Output file containing AMR in penman format", 72 | type=str, 73 | ) 74 | args = parser.parse_args() 75 | return args 76 | 77 | 78 | if __name__ == '__main__': 79 | main(argument_parser()) 80 | -------------------------------------------------------------------------------- /scripts/doc-amr/remove_amrs.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | regex = r"--avoid-indices ([\d\s]+)" 6 | 7 | 8 | def main(args): 9 | 10 | tqdm_amrs_str = read_blocks(args.in_amr) 11 | indices = re.findall(regex,args.arg_str) 12 | avoid_indices = indices[0].split() 13 | avoid_indices = [int(i) for i in avoid_indices] 14 | 15 | with open(args.out_amr, 'w') as fid: 16 | for idx, penman_str in enumerate(tqdm_amrs_str): 17 | if not idx in avoid_indices: 18 | fid.write(penman_str+'\n') 19 | 20 | 21 | 22 | 23 | if __name__ == '__main__': 24 | parser = ArgumentParser() 25 | parser.add_argument( 26 | "--in-amr", 27 | help="In file containing AMR in penman format", 28 | type=str 29 | ) 30 | parser.add_argument( 31 | "--arg-str", 32 | help="the arg string containing the indices needed to be removed", 33 | type=str 34 | ) 35 | 36 | parser.add_argument( 37 | "--out-amr", 38 | help="out amr after removal of avois indices", 39 | type=str, 40 | ) 41 | args = parser.parse_args() 42 | main(args) 43 | 44 | 45 | -------------------------------------------------------------------------------- /scripts/doc-amr/remove_sen.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | regex = r"--avoid-indices ([\d\s]+)" 6 | 7 | 8 | def main(args): 9 | 10 | lines = open(args.in_file).readlines() 11 | indices = re.findall(regex,args.arg_str) 12 | avoid_indices = indices[0].split() 13 | avoid_indices = [int(i) for i in avoid_indices] 14 | 15 | with open(args.out_file, 'w') as fid: 16 | for idx, line in enumerate(lines): 17 | if not idx in avoid_indices: 18 | fid.write(line) 19 | 20 | 21 | 22 | 23 | if __name__ == '__main__': 24 | parser = ArgumentParser() 25 | parser.add_argument( 26 | "--in-file", 27 | help="In file containing sen", 28 | type=str 29 | ) 30 | parser.add_argument( 31 | "--arg-str", 32 | help="the arg string containing the indices needed to be removed", 33 | type=str 34 | ) 35 | 36 | parser.add_argument( 37 | "--out-file", 38 | help="out file after removal of avoids indices", 39 | type=str, 40 | ) 41 | args = parser.parse_args() 42 | main(args) 43 | 44 | 45 | 
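A note on the two removal scripts above: `--arg-str` is not a list of indices itself; both `remove_amrs.py` and `remove_sen.py` regex-extract an embedded `--avoid-indices` list out of that single string argument. A minimal sketch of that parsing step, using the same pattern as the scripts (the index values are made-up examples):

```
import re

# same pattern as scripts/doc-amr/remove_amrs.py and remove_sen.py
regex = r"--avoid-indices ([\d\s]+)"

arg_str = "--avoid-indices 3 7 12"   # made-up example value for --arg-str
avoid_indices = [int(i) for i in re.findall(regex, arg_str)[0].split()]
print(avoid_indices)  # [3, 7, 12]
```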
-------------------------------------------------------------------------------- /scripts/export_alignment_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | 13 | # activate virtualenenv and set other variables 14 | . set_environment.sh 15 | 16 | set -o nounset 17 | 18 | # Load config 19 | echo "[Configuration file:]" 20 | echo $config 21 | . $config 22 | 23 | [ ! -f DATA/$TASK_TAG/aligned/ibm_neural_aligner/.done ] \ 24 | && printf "\nIs aligner training complete?\n" \ 25 | && exit 1 26 | 27 | zip -r ${TASK_TAG}_ibm_neural_aligner.zip \ 28 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/log/model.latest.pt \ 29 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/log/flags.json \ 30 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/vocab.text.txt \ 31 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/vocab.amr.txt \ 32 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/.done.train 33 | 34 | echo "Created ${TASK_TAG}_ibm_neural_aligner.zip" 35 | -------------------------------------------------------------------------------- /scripts/export_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | model_folder=${MODEL_FOLDER}seed${seed}/ 27 | model_name=$config_name 28 | 29 | # needed files 30 | checkpoint=$model_folder/$DECODING_CHECKPOINT 31 | 32 | echo "$checkpoint" 33 | 34 | [ ! -f "$checkpoint" ] && echo "Is $config training complete?" && exit 1 35 | 36 | echo "This will remove optimizer from ${checkpoint}." 37 | read -p "Do you wish to continue? Y/[N]" answer 38 | [ "$answer" != "Y" ] && exit 1 39 | 40 | # remove optimizer from checkpoint 41 | python scripts/remove_optimizer_state.py $checkpoint $checkpoint 42 | # zip all 43 | if [ -f "$model_folder/actions.vocab.nodes" ];then 44 | zip -r ${model_name}-seed${seed}.zip \ 45 | $checkpoint \ 46 | $model_folder/config.sh \ 47 | $model_folder/dict.actions_nopos.txt \ 48 | $model_folder/actions.vocab.nodes \ 49 | $model_folder/actions.vocab.others \ 50 | $model_folder/dict.en.txt \ 51 | $model_folder/machine_config.json 52 | else 53 | zip -r ${model_name}-seed${seed}.zip \ 54 | $checkpoint \ 55 | $model_folder/config.sh \ 56 | $model_folder/dict.actions_nopos.txt \ 57 | $model_folder/dict.en.txt \ 58 | $model_folder/machine_config.json 59 | fi 60 | -------------------------------------------------------------------------------- /scripts/install_satori.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | 4 | # activate conda 5 | # FIXME: Replace this with your conda 6 | eval "$(/nobackup/users/ramast/miniconda3/bin/conda shell.bash hook)" 7 | # Create local env if missing 8 | [ ! 
-d cenv_ppc ] && conda create -y -p ./cenv_ppc 9 | echo "conda activate ./cenv_ppc" 10 | conda activate ./cenv_ppc 11 | 12 | # accept POWER AI license 13 | export IBM_POWERAI_LICENSE_ACCEPT=yes 14 | 15 | # this may not be needed 16 | export PYTHONPATH=. 17 | 18 | set -o nounset 19 | 20 | # install python version to be used 21 | conda install -y pytorch==1.4.0 -c pytorch -c powerai 22 | 23 | # fairseq 24 | [ ! -d fairseq ] && git clone https://github.com/pytorch/fairseq.git 25 | cd fairseq 26 | git checkout v0.10.2 27 | pip install --editable . 28 | cd .. 29 | 30 | # smatch 31 | [ ! -d smatch ] && git clone https://github.com/snowblink14/smatch.git 32 | cd smatch 33 | git checkout v1.0.4 34 | pip install . 35 | cd .. 36 | 37 | # repo instal proper 38 | pip install --editable . 39 | 40 | # TODO: Install pytorch scatter 41 | 42 | # Tried to use this, but gcc is not available to load 43 | 44 | # module load gcc 45 | # pip install torch-scatter --no-cache-dir 46 | 47 | # This is what I did IBM's CCC PPC machines. Bottom line we need a GCC higher 48 | # than the one available by default 49 | 50 | # # install pytorch scatter 51 | # rm -Rf pytorch_scatter.ppc 52 | # git clone https://github.com/rusty1s/pytorch_scatter.git pytorch_scatter.ppc 53 | # cd pytorch_scatter.ppc 54 | # git checkout 1.3.2 55 | # Ensure modern GCC 56 | #export GCC_DIR=/opt/share/gcc-5.4.0/ppc64le/ 57 | #export PATH=/opt/share/cuda-9.0/ppc64le/bin:$GCC_DIR/bin:$PATH 58 | #export LD_LIBRARY_PATH=$GCC_DIR/lib:$LD_LIBRARY_PATH 59 | #export LD_LIBRARY_PATH=$GCC_DIR/lib64:$LD_LIBRARY_PATH 60 | #python setup.py develop 61 | #cd .. 62 | 63 | # check all ok 64 | python tests/correctly_installed.py 65 | -------------------------------------------------------------------------------- /scripts/merge_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def merge_dir(dir, outfile): 5 | 6 | # collect amrs 7 | amrs = [] 8 | for filename in sorted(os.listdir(dir)): 9 | if not filename.startswith("amr"): 10 | continue 11 | with open(os.path.join(dir, filename), encoding='utf-8') as f: 12 | print(filename) 13 | for i,line in enumerate(f): 14 | if i in [0, 1]: 15 | continue 16 | if line.startswith('# ::align'): 17 | continue 18 | amrs.append(line) 19 | amrs.append('\n') 20 | 21 | # normalization 22 | amrs = ''.join(amrs) 23 | amrs = amrs.replace('\r','') 24 | amrs = amrs.replace('\n\n\n','\n\n') 25 | amrs = amrs.replace('\u0092',"'") 26 | amrs = amrs.replace('\u0085'," ") 27 | 28 | # write data 29 | with open(outfile,'w+', encoding='utf-8') as f: 30 | f.write(amrs) 31 | print(amrs.count('# ::snt')) 32 | 33 | if __name__ == '__main__': 34 | input_dir, output_dir = sys.argv[1:] 35 | os.makedirs(output_dir, exist_ok=True) 36 | merge_dir(f'{input_dir}/training/', f'{output_dir}/train.txt') 37 | merge_dir(f'{input_dir}/dev/', f'{output_dir}/dev.txt') 38 | merge_dir(f'{input_dir}/test/', f'{output_dir}/test.txt') 39 | -------------------------------------------------------------------------------- /scripts/parse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o pipefail 3 | set -o errexit 4 | # load local variables used below 5 | . 
set_environment.sh 6 | HELP="$0 " 7 | [ "$#" -lt 3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | input_file=$2 10 | output_amr=$3 11 | set -o nounset 12 | 13 | amr-parse \ 14 | --in-checkpoint $checkpoint \ 15 | --in-tokenized-sentences $input_file \ 16 | --out-amr $output_amr \ 17 | --roberta-cache-path DATA/bart.large \ 18 | --batch-size 128 \ 19 | --roberta-batch-size 1 20 | -------------------------------------------------------------------------------- /scripts/plot_amr.py: -------------------------------------------------------------------------------- 1 | # https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.patches.FancyBboxPatch.html#matplotlib.patches.FancyBboxPatch 2 | # https://matplotlib.org/3.1.1/tutorials/text/annotations.html#placing-artist-at-the-anchored-location-of-the-axes 3 | # FIXME: Separate rendering and node position calculation 4 | # FIXME: Variable names messy 5 | from random import shuffle 6 | import argparse 7 | from transition_amr_parser.io import read_amr 8 | from transition_amr_parser.amr_latex import ( 9 | get_tikz_latex, 10 | save_graphs_to_tex, 11 | ) 12 | 13 | 14 | def argument_parser(): 15 | 16 | parser = argparse.ArgumentParser(description='AMR alignment plotter') 17 | # Single input parameters 18 | parser.add_argument( 19 | "--in-amr", 20 | help="AMR 2.0+ annotation file to be splitted", 21 | type=str, 22 | required=True 23 | ) 24 | parser.add_argument( 25 | "--shuffle", 26 | help="randomize input AMRs", 27 | action='store_true' 28 | ) 29 | parser.add_argument( 30 | "--jamr", 31 | help="Read from JAMR annotations", 32 | action='store_true' 33 | ) 34 | parser.add_argument( 35 | "--out-tex", 36 | help="output", 37 | type=str, 38 | required=True 39 | ) 40 | # latex / tikz variables 41 | parser.add_argument("--scale", type=float, default=1.0) 42 | parser.add_argument("--x-warp", type=float, default=1.0) 43 | parser.add_argument("--y-warp", type=float, default=1.0) 44 | # 45 | parser.add_argument( 46 | "--max-graphs", 47 | help="Will stop after plotting this amount", 48 | default=100, 49 | type=int, 50 | ) 51 | parser.add_argument( 52 | "--indices", nargs='+', 53 | help="Position on the AMR file of sentences to plot" 54 | ) 55 | parser.add_argument( 56 | "--has-nodes", nargs='+', 57 | help="filter for AMRs that have those nodes" 58 | ) 59 | parser.add_argument( 60 | "--has-repeated-nodes", 61 | help="filter for AMRs that have more than one node of same name", 62 | action='store_true' 63 | ) 64 | parser.add_argument( 65 | "--has-repeated-tokens", 66 | help="filter for AMRs that have more than one node of same name", 67 | action='store_true' 68 | ) 69 | parser.add_argument( 70 | "--has-edges", nargs='+', 71 | help="filter for AMRs that have those nodes" 72 | ) 73 | args = parser.parse_args() 74 | return args 75 | 76 | 77 | def fix_ner_alignments(amr): 78 | 79 | # fix alignments 80 | for (src, edge, trg) in amr.edges: 81 | if edge == ':name' and amr.nodes[trg] == 'name': 82 | ops = sorted(amr.children(trg), key=lambda x: [1])[::-1] 83 | if ( 84 | len(amr.alignments[trg]) > 1 85 | and len(amr.alignments[trg]) == len(ops) 86 | ): 87 | for idx, (nid, _) in enumerate(ops): 88 | amr.alignments[nid] = [amr.alignments[trg][idx]] 89 | 90 | return amr 91 | 92 | 93 | def skip_amr(amr, args): 94 | return ( 95 | args.has_nodes 96 | and not set(args.has_nodes) <= set(amr.nodes.values()) 97 | ) or ( 98 | args.has_edges 99 | and not set(args.has_edges) <= set([x[1][1:] for x in amr.edges]) 100 | ) or ( 101 | args.has_repeated_nodes 102 | and len(set(amr.nodes.values())) 
== len(amr.nodes.values()) 103 | ) or ( 104 | args.has_repeated_tokens 105 | and len(set(amr.tokens)) == len(amr.tokens) 106 | ) 107 | 108 | 109 | def main(args): 110 | 111 | # argument handling 112 | amrs = read_amr(args.in_amr, jamr=args.jamr) 113 | 114 | print(f'Read {args.in_amr}') 115 | num_amrs = len(amrs) 116 | if args.indices: 117 | indices = list(map(int, args.indices)) 118 | else: 119 | indices = list(range(num_amrs)) 120 | # write into file 121 | tex_file = args.out_tex 122 | if args.shuffle: 123 | shuffle(indices) 124 | 125 | # get one sample 126 | amr_strs = [] 127 | for index in indices: 128 | 129 | amr = amrs[index] 130 | 131 | # Fix NER 132 | amr = fix_ner_alignments(amr) 133 | 134 | # Remove ROOT 135 | if amr.tokens[-1] == '': 136 | amr.tokens = amr.tokens[:-1] 137 | 138 | if len(amr_strs) >= args.max_graphs: 139 | # too many graphs 140 | break 141 | 142 | # skip amr not meeting criteria 143 | if skip_amr(amr, args) or amr.edges == []: 144 | continue 145 | 146 | src, _, trg = amr.edges[0] 147 | 148 | # get latex string 149 | amr_str = get_tikz_latex( 150 | amr, 151 | # color_by_id={'a': 'red'}, 152 | # color_by_id_pair={(src, trg): 'red'}, 153 | scale=args.scale, 154 | x_warp=args.x_warp, 155 | y_warp=args.y_warp 156 | ) 157 | 158 | # plot 159 | amr_strs.append(amr_str) 160 | 161 | # open on the fly 162 | save_graphs_to_tex(tex_file, amr_str, plot_cmd='open') 163 | 164 | response = input('Quit [N/y]?') 165 | if response == 'y': 166 | break 167 | 168 | # write all graphs to a single tex 169 | print(f'Wrote {len(amr_strs)} amrs into {tex_file}') 170 | save_graphs_to_tex(tex_file, '\n'.join(amr_strs)) 171 | 172 | 173 | if __name__ == '__main__': 174 | # argument handling 175 | main(argument_parser()) 176 | -------------------------------------------------------------------------------- /scripts/plot_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | import re 4 | # pip install python-dateutil 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from transition_amr_parser.io import read_experiment 8 | # from ipdb import set_trace 9 | 10 | 11 | def get_vectors(items, label, admit_none=False): 12 | 13 | def x_key(item): 14 | return int(item['epoch']) 15 | 16 | def y_reduce(items): 17 | if admit_none: 18 | vy = [float(x[label]) for x in items if x[label] is not None] 19 | else: 20 | vy = [float(x[label]) for x in items] 21 | return np.mean(vy) 22 | 23 | def get_y_std(items): 24 | if admit_none: 25 | vy = [float(x[label]) for x in items if x[label] is not None] 26 | else: 27 | vy = [float(x[label]) for x in items] 28 | return np.std(vy) 29 | 30 | # Cluster x-axis 31 | x_clusters = defaultdict(list) 32 | for item in items: 33 | x_clusters[x_key(item)].append(item) 34 | # get xy vectors 35 | x = np.array(sorted(x_clusters.keys())) 36 | y = np.array([y_reduce(x_clusters[x_i]) for x_i in x]) 37 | y_std = np.array([get_y_std(x_clusters[x_i]) for x_i in x]) 38 | 39 | return x, y, y_std 40 | 41 | 42 | def get_score_from_log(file_path, score_name): 43 | 44 | smatch_results_re = re.compile(r'^F-score: ([0-9\.]+)') 45 | 46 | results = [None] 47 | 48 | if 'smatch' in score_name: 49 | regex = smatch_results_re 50 | else: 51 | raise Exception(f'Unknown score type {score_name}') 52 | 53 | with open(file_path) as fid: 54 | for line in fid: 55 | if regex.match(line): 56 | results = regex.match(line).groups() 57 | results = [100*float(x) for x in results] 58 | break 59 | 60 | return 
results 61 | 62 | 63 | def matplotlib_render(plotting_data, out_png, title): 64 | 65 | # plot in matplotlib 66 | plt.figure(figsize=(10, 10)) 67 | # axis with extra space for legend 68 | ax = plt.axes([0.1, 0.1, 0.8, 0.7]) 69 | # second axis for Smatch 70 | ax_smatch = ax.twinx() 71 | colors = ['b', 'r', 'g', 'm', 'y'] 72 | tags = sorted(plotting_data.keys()) 73 | handles = [] 74 | for i in range(len(tags)): 75 | 76 | color = colors[i % len(colors)] 77 | 78 | # train loss 79 | x, y, y_std = plotting_data[tags[i]]['train'] 80 | h = ax.plot(x, y, color)[0] 81 | # ax.fill_between(x, y - y_std, y + y_std, alpha=0.3) 82 | # h = ax.fill_between(x, y - y_std, y + y_std, color=color2, alpha=0.3) 83 | handles.append(h) 84 | 85 | # valid loss 86 | # x, y, _ = plotting_data[tags[i]]['valid'] 87 | # ax.plot(x, y, '--' + color) 88 | 89 | # dev decoding score 90 | x, y, y_std = plotting_data[tags[i]]['valid-dec'] 91 | ax_smatch.plot(x, y, color) 92 | ax_smatch.fill_between(x, y - y_std, y + y_std, alpha=0.3) 93 | ax_smatch.set(ylim=(80, 85)) 94 | 95 | ax.set_xlabel('epoch') 96 | ax.set_ylabel('loss') 97 | ax_smatch.set_ylabel('Smatch') 98 | 99 | plt.legend(handles, tags, bbox_to_anchor=(0, 1, 1, 0)) 100 | if title: 101 | plt.title(title) 102 | if out_png: 103 | print(f'wrote {out_png}') 104 | plt.savefig(out_png) 105 | else: 106 | plt.show() 107 | 108 | 109 | def main(args): 110 | 111 | data = [] 112 | for config in args.in_configs: 113 | data.extend(read_experiment(config)) 114 | 115 | # Cluster by experiment 116 | experiments = defaultdict(list) 117 | for item in data: 118 | experiments[item['experiment_key']].append(item) 119 | 120 | # For each experiment collect separate data for train, valid and score 121 | # aggregate stats for multiple seeds and produce vectors for later 122 | # plotting 123 | plotting_data = defaultdict(dict) 124 | for exp_tag, exp_data in experiments.items(): 125 | etime = np.median([ 126 | x['epoch_time'] for x in exp_data if x['epoch_time']]) / (60**2) 127 | print(f'Collecting data for {exp_tag} ({etime:.2f} h/epoch)') 128 | for sset in ['train', 'valid']: 129 | valid_data = [x for x in exp_data if x['set'] == sset] 130 | plotting_data[exp_tag][sset] = \ 131 | get_vectors(valid_data, f'{sset}_loss') 132 | sset = 'valid-dec' 133 | score_data = [x for x in exp_data if x['set'] == sset] 134 | plotting_data[exp_tag][sset] = \ 135 | get_vectors(score_data, 'score', admit_none=True) 136 | 137 | # Render picture in matplotlib 138 | matplotlib_render(plotting_data, args.out_png, args.title) 139 | 140 | 141 | def argument_parser(): 142 | 143 | parser = argparse.ArgumentParser(description='AMR results plotter') 144 | # Single input parameters 145 | parser.add_argument( 146 | 'in_configs', 147 | nargs='+', 148 | help="One or more config fils", 149 | type=str, 150 | ) 151 | parser.add_argument( 152 | '--title', 153 | help="Title of plot" 154 | ) 155 | 156 | parser.add_argument( 157 | '-o', '--out-png', 158 | help="Save into a file instead of plotting" 159 | ) 160 | args = parser.parse_args() 161 | return args 162 | 163 | 164 | if __name__ == '__main__': 165 | main(argument_parser()) 166 | -------------------------------------------------------------------------------- /scripts/read_propbank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import json 4 | from tqdm import tqdm 5 | from transition_amr_parser.io import read_frame 6 | 7 | if __name__ == '__main__': 8 | 9 | # Argument handling 10 | in_propank_folder, 
out_json = sys.argv[1:] 11 | 12 | # Read propbank into dict 13 | propbank = {} 14 | num_files = 0 15 | for xml_file in tqdm(glob.glob(f'{in_propank_folder}/*.xml')): 16 | propbank.update(read_frame(xml_file)) 17 | num_files += 1 18 | if not num_files: 19 | print('No XML files found!') 20 | exit(1) 21 | 22 | num_preds = len(propbank) 23 | num_examples = sum([len(x['examples']) for x in propbank.values()]) 24 | print(f'{num_files} files {num_preds} predicates {num_examples} examples read') 25 | 26 | # Write it into json 27 | with open(out_json, 'w') as fid: 28 | fid.write(json.dumps(propbank)) 29 | -------------------------------------------------------------------------------- /scripts/remove_optimizer_state.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from fairseq_ext.utils import remove_optimizer_state 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | if len(sys.argv[1:]) == 1: 8 | checkpoint_path = sys.argv[1] 9 | out_checkpoint_path = checkpoint_path 10 | elif len(sys.argv[1:]) == 2: 11 | checkpoint_path, out_checkpoint_path = sys.argv[1:] 12 | 13 | remove_optimizer_state(checkpoint_path, out_checkpoint_path) 14 | -------------------------------------------------------------------------------- /scripts/remove_wiki.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | # argument handling 8 | amr_file, new_amr_file = sys.argv[1:] 9 | 10 | with open(amr_file, encoding='utf-8') as fid: 11 | amrs = fid.read() 12 | amrs = re.sub(':wiki ".+?"( )?','', amrs) 13 | amrs = re.sub(':wiki -( )?','', amrs) 14 | l = amrs.count('# ::snt') 15 | with open(new_amr_file, 'w+', encoding='utf-8') as f: 16 | f.write(amrs) 17 | print(new_amr_file) 18 | print(l) 19 | -------------------------------------------------------------------------------- /scripts/sanity_check_amr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import json 4 | from transition_amr_parser.io import read_amr 5 | from collections import defaultdict 6 | from tqdm import tqdm 7 | 8 | 9 | def get_propbank_name(amr_pred): 10 | items = amr_pred.split('-') 11 | prop_pred = '-'.join(items[:-1]) + '.' 
+ items[-1] 12 | if prop_pred.endswith('.91') or prop_pred in ['have-half-life.01']: 13 | pass 14 | else: 15 | prop_pred = prop_pred.replace('-', '_') 16 | return prop_pred 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | # Argument handling 22 | in_amr, in_propbank_json = sys.argv[1:] 23 | 24 | amrs = read_amr(in_amr) 25 | with open(in_propbank_json) as fid: 26 | propbank = json.loads(fid.read()) 27 | 28 | pred_regex = re.compile('.+-[0-9]+$') 29 | 30 | amr_alerts = defaultdict(list) 31 | sid = 0 32 | num_preds = 0 33 | for amr in tqdm(amrs): 34 | predicate_ids = [ 35 | k for k, v in amr.nodes.items() if pred_regex.match(v) 36 | ] 37 | num_preds += len(predicate_ids) 38 | for pred_id in predicate_ids: 39 | pred = get_propbank_name(amr.nodes[pred_id]) 40 | if pred not in propbank: 41 | amr_alerts['predicate not in propbank'].append( 42 | (sid, pred_id, pred) 43 | ) 44 | else: 45 | probank_roles = propbank[pred]['roles'] 46 | # TODO: Identify obligatory args 47 | required_roles = set() 48 | required_location = set() 49 | for k, v in probank_roles.items(): 50 | if '(must be specified)' in v['descr']: 51 | required_roles |= set([k]) 52 | elif 'must' in v['descr']: 53 | # FIXME: not used right now 54 | required_location = set([k]) 55 | 56 | # Get roles 57 | roles = [ 58 | trip[1][1:].replace('-of', '') 59 | for trip in amr.edges 60 | if trip[0] == pred_id and trip[1].startswith(':ARG') 61 | ] 62 | # Check no required missing 63 | missing_roles = required_roles - set(roles) 64 | if missing_roles: 65 | amr_alerts['missing required role'].append( 66 | (sid, pred_id, pred, " ".join(list(missing_roles))) 67 | ) 68 | # Check no forbiden used 69 | forbidden_roles = set(roles) - set(probank_roles.keys()) 70 | if forbidden_roles: 71 | amr_alerts['role not in propbank'].append( 72 | (sid, pred_id, pred, " ".join(list(forbidden_roles))) 73 | ) 74 | sid += 1 75 | 76 | print(f'{sid+1} sentences {num_preds} predicates') 77 | for name, alerts in amr_alerts.items(): 78 | if alerts: 79 | print(f'{len(alerts)} {name}') 80 | -------------------------------------------------------------------------------- /scripts/split_amrs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | import sys 4 | import os 5 | import penman 6 | from transition_amr_parser.io import read_blocks 7 | from ipdb import set_trace 8 | 9 | 10 | def main(): 11 | 12 | in_amr, max_split_size, output_basename = sys.argv[1:] 13 | dirname = os.path.dirname(output_basename) 14 | os.makedirs(dirname, exist_ok=True) 15 | 16 | amrs = read_blocks(in_amr, return_tqdm=False) 17 | max_split_size = int(max_split_size) 18 | 19 | num_amrs = len(amrs) 20 | indices = list(range(num_amrs)) 21 | chunk_indices = [ 22 | indices[i:i + max_split_size] 23 | for i in range(0, num_amrs, max_split_size) 24 | ] 25 | 26 | for chunk_n, indices in enumerate(tqdm(chunk_indices)): 27 | split_file = f'{output_basename}.{chunk_n}' 28 | with open(split_file, 'w') as fid: 29 | for i in indices: 30 | fid.write(f'{amrs[i]}\n') 31 | print(split_file) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /scripts/tokenize_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transition_amr_parser.amr import protected_tokenizer 3 | 4 | 5 | def parse_arguments(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--in-amr", type=str, help="AMR 
file to be tokenized", 8 | required=True) 9 | parser.add_argument("--simple", help="Use bare minimum tokenization", 10 | action='store_true') 11 | return parser.parse_args() 12 | 13 | 14 | def main(args): 15 | 16 | # read 17 | raw_amr = [] 18 | with open(args.in_amr) as fid: 19 | for line in fid: 20 | raw_amr.append(line.rstrip()) 21 | 22 | # append tok line, ignoring previously existing ones 23 | existing_tokenization = False 24 | out_raw_amr = [] 25 | for line in raw_amr: 26 | if line.strip().startswith('# ::snt'): 27 | out_raw_amr.append(line) 28 | # get tokens and also append 29 | sentence = line.split('# ::snt')[-1].strip() 30 | tokens, _ = protected_tokenizer(sentence, args.simple) 31 | tokens_str = ' '.join(tokens) 32 | out_raw_amr.append(f'# ::tok {tokens_str}') 33 | elif line.strip().startswith('# ::tok'): 34 | # ignore existing tokens 35 | existing_tokenization = True 36 | else: 37 | out_raw_amr.append(line) 38 | 39 | if existing_tokenization: 40 | print( 41 | f'\033[93mWARNING\033[0m:' 42 | f' Overwrote existing tokenization in {args.in_amr}' 43 | ) 44 | 45 | # write 46 | with open(args.in_amr, 'w') as fid: 47 | for line in out_raw_amr: 48 | fid.write(f'{line}\n') 49 | 50 | 51 | if __name__ == '__main__': 52 | main(parse_arguments()) 53 | -------------------------------------------------------------------------------- /scripts/vimdiff_amr_files.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import subprocess 3 | 4 | 5 | def get_one_amr(fid): 6 | amr = [] 7 | line = fid.readline() 8 | while line.strip(): 9 | amr.append(line) 10 | line = fid.readline() 11 | return amr 12 | 13 | 14 | def write(file_name, content): 15 | with open(file_name, 'w') as fid: 16 | fid.write(content) 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | amr1_file, amr2_file = sys.argv[1:] 22 | 23 | different_amrs = [] 24 | num_amrs = 0 25 | with open(amr1_file) as fid1, open(amr2_file) as fid2: 26 | while True: 27 | amr1 = get_one_amr(fid1) 28 | amr2 = get_one_amr(fid2) 29 | penman1 = ''.join([x for x in amr1 if x[0] != '#']) 30 | penman2 = ''.join([x for x in amr2 if x[0] != '#']) 31 | if penman1 != penman2: 32 | different_amrs.append((num_amrs, penman1, penman2)) 33 | num_amrs += 1 34 | print(f'\r{num_amrs}', end='') 35 | if amr1 == [] and amr2 == []: 36 | break 37 | 38 | print(f'\n{len(different_amrs)}/{num_amrs} different AMRs') 39 | 40 | for n, p1, p2 in different_amrs: 41 | input(f'\nPress any key to compare sentence {n}') 42 | write('tmp1', p1) 43 | write('tmp2', p2) 44 | subprocess.call(['vimdiff', 'tmp1', 'tmp2']) 45 | -------------------------------------------------------------------------------- /service/amr.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | import "wordvec.proto"; 3 | service AMRServer { 4 | rpc process(AMRInput) returns(AMRResponse) {} 5 | }; 6 | /** 7 | * This contains information about a sentence, used as input by the parser 8 | */ 9 | message AMRInput{ 10 | message WordInfo { 11 | string token=1; 12 | string lemma=2; 13 | } 14 | repeated WordInfo word_infos=1; 15 | WordVectors word_vectors=2; 16 | bool doc_mode=3; 17 | }; 18 | /** 19 | * The parser produces a single string with the amr parse of the sentence. 
20 | */ 21 | message AMRResponse { 22 | string amr_parse=1; 23 | } -------------------------------------------------------------------------------- /service/amr2.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | service AMRBatchServer { 3 | rpc process(AMRBatchInput) returns(AMRBatchResponse) {} 4 | }; 5 | /** 6 | * This contains information about a batch of sentences, used as input by the parser 7 | */ 8 | message AMRBatchInput{ 9 | message Sentence { 10 | repeated string tokens=1; 11 | } 12 | repeated Sentence sentences=1; 13 | bool doc_mode=2; 14 | }; 15 | /** 16 | * The parser produces a list of strings with the amr parse of the sentences. 17 | */ 18 | message AMRBatchResponse { 19 | repeated string amr_parse=1; 20 | } 21 | -------------------------------------------------------------------------------- /service/amr_client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import grpc 4 | import torch 5 | import json 6 | import amr_pb2 7 | import amr_pb2_grpc 8 | import argparse 9 | 10 | def argument_parser(): 11 | parser = argparse.ArgumentParser(description='AMR parser') 12 | parser.add_argument( 13 | "--port", 14 | help="GRPC port", 15 | type=str 16 | ) 17 | args = parser.parse_args() 18 | 19 | # Sanity checks 20 | assert args.port 21 | 22 | return args 23 | 24 | def get_input_from_sentence(sentence,mode): 25 | tokens = sentence.split() 26 | input_tokens = [] 27 | for token in tokens: 28 | input_tokens.append(amr_pb2.AMRInput.WordInfo(token=token)) 29 | 30 | if mode.lower()=='doc' or mode.lower()=='document': 31 | doc_mode = True 32 | else: 33 | doc_mode = False 34 | return amr_pb2.AMRInput(word_infos=input_tokens,doc_mode=doc_mode) 35 | 36 | def run(): 37 | # NOTE(gRPC Python Team): .close() is possible on a channel and should be 38 | # used in circumstances in which the with statement does not fit the needs 39 | # of the code. 40 | # Argument handling 41 | args = argument_parser() 42 | channel = grpc.insecure_channel('localhost:' + args.port) 43 | stub = amr_pb2_grpc.AMRServerStub(channel) 44 | sentence = input("Enter the sentence: ") 45 | mode = input("Enter the mode: ") 46 | amr_input = get_input_from_sentence(sentence,mode) 47 | response = stub.process(amr_input) 48 | print("AMR parse received: \n" + response.amr_parse) 49 | 50 | if __name__ == '__main__': 51 | logging.basicConfig() 52 | run() -------------------------------------------------------------------------------- /service/wordvec.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | service WordVectorsServer { 3 | rpc vectorize(VectorizeRequest) returns(WordVectors) {} 4 | } 5 | enum WordVectorType { 6 | BERT_LARGE_EP5=0; 7 | } 8 | /** 9 | * Tokenized sentence 10 | */ 11 | message VectorizeRequest { 12 | WordVectorType type=1; 13 | repeated string tokens=2; 14 | } 15 | /** 16 | * Concatenated vector for the entire sentence. 17 | * 'size' must match the number of tokens. 
18 | */ 19 | message WordVectors { 20 | WordVectorType type=1; 21 | int32 size=3; 22 | int32 dimension=2; 23 | // Concatenated full vectors 24 | repeated float data=4; 25 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | VERSION = '0.5.4' 5 | 6 | install_requires = [ 7 | "torch==1.13.1", 8 | 'numpy<=1.23.5', 9 | 'ipython<=8.12', # python 3.8 vs ipython 8.13 incompatibility 10 | 'tqdm>=4.55.0', 11 | 'packaging>=20.8', 12 | 'requests>=2.25.1', 13 | # for data (ELMO embeddings) 14 | 'h5py>=3.0.0', 15 | 'python-dateutil>=2.8.1', 16 | # for scoringy 17 | 'penman>=1.1.0', 18 | # needs tools to be importable > 1.0.4. As of now, no official release 19 | 'smatch', 20 | # for debugging 21 | 'ipdb', 22 | 'line_profiler>=4.0.2', 23 | 'pyinstrument>=4.4.0', 24 | # for aws download 25 | 'boto3>=1.26.1', 26 | 'progressbar', 27 | ] 28 | 29 | # platform dependent fairseq version 30 | if sys.platform == 'darwin': 31 | install_requires.append("fairseq==0.10.0") 32 | else: 33 | install_requires.append("fairseq==0.10.2") 34 | 35 | if __name__ == '__main__': 36 | setup( 37 | name='transition_amr_parser', 38 | version=VERSION, 39 | description="Trasition-based neural parser", 40 | package_dir={"": "src"}, 41 | # packages=['fairseq_ext', 'transition_amr_parser'], 42 | # packages=['neural_parser'], 43 | packages=find_packages("src", exclude=('cenv_*', 'configs', 'tests', 'DATA','dist','docker','run','scripts','service','*egg-info')), 44 | package_data={'': ['*.txt', '*.md', '*.opt', '*.cu', '*.cpp']}, 45 | entry_points={ 46 | 'console_scripts': [ 47 | 'amr-parse = transition_amr_parser.parse:main', 48 | 'amr-machine = transition_amr_parser.amr_machine:main', 49 | ] 50 | }, 51 | py_modules=['fairseq_ext', 'transition_amr_parser',"ibm_neural_aligner"], 52 | install_requires=install_requires, 53 | classifiers=[ 54 | "Programming Language :: Python :: 3.8", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 57 | "Natural Language :: English", 58 | ], 59 | ) 60 | 61 | -------------------------------------------------------------------------------- /src/fairseq_ext/__init__.py: -------------------------------------------------------------------------------- 1 | # to register all the user defined modules to fairseq 2 | import fairseq_ext.criterions # noqa 3 | import fairseq_ext.models # noqa 4 | import fairseq_ext.tasks # noqa 5 | -------------------------------------------------------------------------------- /src/fairseq_ext/amr_reform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/amr_reform/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/amr_spec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/amr_spec/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/binarize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from 
fairseq.data.indexed_dataset import __best_fitting_dtype, MMapIndexedDatasetBuilder, IndexedDatasetBuilder 4 | from fairseq.tokenizer import tokenize_line 5 | 6 | 7 | # TODO move this file into data folder 8 | def make_builder(out_file, impl, vocab_size=None, dtype=None): 9 | if impl == 'mmap': 10 | if dtype is None: 11 | dtype = __best_fitting_dtype(vocab_size) 12 | return MMapIndexedDatasetBuilder(out_file, dtype=dtype) 13 | else: 14 | return IndexedDatasetBuilder(out_file) 15 | 16 | 17 | def binarize_file(input_file, out_file_pref, impl, dtype=np.int64, tokenize=tokenize_line): 18 | out_file = out_file_pref + '.bin' 19 | index_file = out_file_pref + '.idx' 20 | ds = make_builder(out_file, impl=impl, dtype=dtype) 21 | with open(input_file, 'r') as f: 22 | for line in f: 23 | if line.strip(): 24 | line = tokenize_line(line) 25 | line = list(map(int, line)) 26 | line = torch.tensor(line) 27 | ds.add_item(line) 28 | else: 29 | raise Exception('empty line') 30 | 31 | ds.finalize(index_file) 32 | 33 | return 34 | -------------------------------------------------------------------------------- /src/fairseq_ext/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | submodule_name = os.path.split(os.path.abspath(os.path.dirname(__file__)))[1] 8 | 9 | 10 | # automatically import any Python files in the criterions/ directory 11 | # this is necessary for fairseq to register the user defined criterions 12 | for file in os.listdir(os.path.dirname(__file__)): 13 | if file.endswith('.py') and not file.startswith('_'): 14 | module = file[:file.find('.py')] 15 | importlib.import_module(user_module_name + '.' + submodule_name + '.' 
+ module) 16 | -------------------------------------------------------------------------------- /src/fairseq_ext/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/data/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/extract_bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/extract_bart/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | submodule_name = os.path.split(os.path.abspath(os.path.dirname(__file__)))[1] 8 | 9 | 10 | # automatically import any Python files in the models/ directory 11 | # this is necessary for fairseq to register the user defined models 12 | models_dir = os.path.dirname(__file__) 13 | for file in os.listdir(models_dir): 14 | path = os.path.join(models_dir, file) 15 | if (file.endswith('.py') or os.path.isdir(path)) and not file.startswith('_'): 16 | module = file[:file.find('.py')] if file.endswith('.py') else file 17 | importlib.import_module(user_module_name + '.' + submodule_name + '.' + module) 18 | -------------------------------------------------------------------------------- /src/fairseq_ext/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/modules/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/roberta/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/roberta/binarize_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import shutil 4 | import time 5 | 6 | from ..data import indexed_dataset 7 | from ..utils import time_since 8 | 9 | 10 | def dataset_dest_prefix(args, output_prefix, lang): 11 | base = "{}/{}".format(args.embdir, output_prefix) 12 | lang_part = ( 13 | ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) if lang is not None else "" 14 | ) 15 | return "{}{}".format(base, lang_part) 16 | 17 | 18 | def dataset_dest_file(args, output_prefix, lang, extension): 19 | base = dataset_dest_prefix(args, output_prefix, lang) 20 | return "{}.{}".format(base, extension) 21 | 22 | 23 | def get_scatter_indices(word2piece, reverse=False): 24 | if reverse: 25 | indices = range(len(word2piece))[::-1] 26 | else: 27 | indices = range(len(word2piece)) 28 | # we will need as well the wordpiece to word indices 29 | wp_indices = [ 30 | [index] * (len(span) if isinstance(span, list) else 1) 
31 | for index, span in zip(indices, word2piece) 32 | ] 33 | wp_indices = [x for span in wp_indices for x in span] 34 | return torch.tensor(wp_indices) 35 | 36 | 37 | def make_binary_bert_features(args, input_prefix, output_prefix, tokenize): 38 | 39 | # Load pretrained embeddings extractor 40 | if args.pretrained_embed.startswith('roberta'): 41 | from .pretrained_embeddings import PretrainedEmbeddings 42 | 43 | pretrained_embeddings = PretrainedEmbeddings( 44 | args.pretrained_embed, 45 | args.bert_layers 46 | ) 47 | elif args.pretrained_embed.startswith('bert'): 48 | from .pretrained_embeddings_bert import PretrainedEmbeddings 49 | 50 | pretrained_embeddings = PretrainedEmbeddings( 51 | args.pretrained_embed, 52 | args.bert_layers 53 | ) 54 | else: 55 | raise ValueError('arg.pretrained_embed should be either roberta.* or bert-*') 56 | 57 | # will store pre-extracted BERT layer 58 | indexed_data = indexed_dataset.make_builder( 59 | dataset_dest_file(args, output_prefix, 'en.bert', "bin"), 60 | impl=args.dataset_impl, 61 | dtype=np.float32 62 | ) 63 | 64 | # will store wordpieces and wordpiece to word mapping 65 | indexed_wordpieces = indexed_dataset.make_builder( 66 | dataset_dest_file(args, output_prefix, 'en.wordpieces', "bin"), 67 | impl=args.dataset_impl, 68 | ) 69 | 70 | indexed_wp2w = indexed_dataset.make_builder( 71 | dataset_dest_file(args, output_prefix, 'en.wp2w', "bin"), 72 | impl=args.dataset_impl, 73 | ) 74 | 75 | num_sents = 0 76 | input_file = input_prefix + '.en' 77 | 78 | start = time.time() 79 | with open(input_file, 'r') as fid: 80 | for sentence in fid: 81 | 82 | # we only have tokenized data so we feed whitespace separated 83 | # tokens 84 | sentence = " ".join(tokenize(str(sentence).rstrip())) 85 | 86 | # extract embeddings, average them per token and return 87 | # wordpieces anyway 88 | word_features, worpieces_roberta, word2piece = \ 89 | pretrained_embeddings.extract(sentence) 90 | 91 | # note that data needs to be stored as a 1d array. 
Also check 92 | # that number nof woprds matches with embedding size 93 | assert word_features.shape[1] == len(sentence.split()) 94 | indexed_data.add_item(word_features.cpu().view(-1)) 95 | 96 | # just store the wordpiece indices, ignore BOS/EOS tokens 97 | indexed_wordpieces.add_item(worpieces_roberta) 98 | indexed_wp2w.add_item( 99 | get_scatter_indices(word2piece, reverse=True) 100 | ) 101 | 102 | # udpate number of sents 103 | num_sents += 1 104 | if not num_sents % 100: 105 | print("\r%d sentences (time: %s)" % (num_sents, time_since(start)), end='') 106 | print("") 107 | 108 | # close indexed data files 109 | indexed_data.finalize( 110 | dataset_dest_file(args, output_prefix, 'en.bert', "idx") 111 | ) 112 | 113 | indexed_wordpieces.finalize( 114 | dataset_dest_file(args, output_prefix, 'en.wordpieces', "idx") 115 | ) 116 | indexed_wp2w.finalize( 117 | dataset_dest_file(args, output_prefix, 'en.wp2w', "idx") 118 | ) 119 | 120 | # copy the source sentence file to go together with the embeddings 121 | shutil.copyfile(input_file, dataset_dest_prefix(args, output_prefix, 'en')) 122 | 123 | 124 | def make_roberta_embeddings(args, tokenize=None): 125 | ''' 126 | Makes BERT features for source words 127 | ''' 128 | 129 | assert tokenize 130 | 131 | if args.trainpref: 132 | make_binary_bert_features(args, args.trainpref, "train", tokenize) 133 | 134 | if args.validpref: 135 | for k, validpref in enumerate(args.validpref.split(",")): 136 | outprefix = "valid{}".format(k) if k > 0 else "valid" 137 | make_binary_bert_features(args, validpref, outprefix, tokenize) 138 | 139 | if args.testpref: 140 | for k, testpref in enumerate(args.testpref.split(",")): 141 | outprefix = "test{}".format(k) if k > 0 else "test" 142 | make_binary_bert_features(args, testpref, outprefix, tokenize) 143 | -------------------------------------------------------------------------------- /src/fairseq_ext/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | 8 | 9 | # automatically import any Python files in the tasks/ directory 10 | # this is necessary for fairseq to register the user defined tasks 11 | for file in os.listdir(os.path.dirname(__file__)): 12 | if file.endswith('.py') and not file.startswith('_'): 13 | task_name = file[:file.find('.py')] 14 | importlib.import_module(user_module_name + '.tasks.' 
+ task_name) 15 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_graphmp_tofile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from fairseq.data import Dictionary 5 | 6 | from fairseq_ext.amr_spec.action_info_binarize_graphmp import (binarize_actstates_tofile, 7 | binarize_actstates_tofile_workers, 8 | load_actstates_fromfile) 9 | 10 | # import sys 11 | # import importlib 12 | # sys.path.insert(0, '..') 13 | # importlib.import_module('fairseq_ext') 14 | # sys.path.pop(0) 15 | 16 | 17 | if __name__ == '__main__': 18 | if len(sys.argv) > 1: 19 | num_workers = int(sys.argv[1]) 20 | else: 21 | num_workers = 1 22 | 23 | # split = 'dev' 24 | split = 'train' 25 | 26 | en_file = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/oracle/{split}.en' 27 | actions_file = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/oracle/{split}.actions' 28 | actions_dict = Dictionary.load( 29 | '/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/processed/dict.actions_nopos.txt' 30 | ) 31 | out_file_pref = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/tmp/{split}.en-actions.actions' 32 | 33 | os.makedirs(os.path.dirname(out_file_pref), exist_ok=True) 34 | 35 | # res = binarize_actstates_tofile(en_file, actions_file, out_file_pref, actions_dict=actions_dict) 36 | res = binarize_actstates_tofile_workers(en_file, actions_file, out_file_pref, actions_dict=actions_dict, 37 | num_workers=num_workers) 38 | print( 39 | "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( 40 | 'actions', 41 | actions_file, 42 | res['nseq'], 43 | res['ntok'], 44 | 100 * res['nunk'] / res['ntok'], 45 | actions_dict.unk_word, 46 | ) 47 | ) 48 | 49 | os.system(f'ls -lh {os.path.dirname(out_file_pref)}') 50 | 51 | tgt_actstates = load_actstates_fromfile(out_file_pref, actions_dict) 52 | 53 | breakpoint() 54 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_tofile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from fairseq.data import Dictionary 5 | 6 | from fairseq_ext.amr_spec.action_info_binarize import (binarize_actstates_tofile, 7 | binarize_actstates_tofile_workers, 8 | load_actstates_fromfile) 9 | 10 | # import sys 11 | # import importlib 12 | # sys.path.insert(0, '..') 13 | # importlib.import_module('fairseq_ext') 14 | # sys.path.pop(0) 15 | 16 | 17 | if __name__ == '__main__': 18 | if len(sys.argv) > 1: 19 | num_workers = int(sys.argv[1]) 20 | else: 21 | num_workers = 1 22 | 23 | split = 'dev' 24 | split = 'train' 25 | 26 | en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.en' 27 | actions_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.actions' 28 | actions_dict = Dictionary.load( 29 | '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/databin/dict.actions_nopos.txt' 30 | ) 31 | out_file_pref = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/tmp/{split}.en-actions.actions' 32 | 33 | # en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/oracle/{split}.en' 34 | # actions_file = 
f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/oracle/{split}.actions' 35 | # actions_dict = Dictionary.load( 36 | # '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/processed/dict.actions_nopos.txt' 37 | # ) 38 | # out_file_pref = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/processed/{split}.en-actions.actions' 39 | 40 | os.makedirs(os.path.dirname(out_file_pref), exist_ok=True) 41 | 42 | # binarize_actstates_tofile(en_file, actions_file, out_file_pref, actions_dict=actions_dict) 43 | binarize_actstates_tofile_workers(en_file, actions_file, out_file_pref, actions_dict=actions_dict, 44 | num_workers=num_workers) 45 | 46 | os.system(f'ls -lh {os.path.dirname(out_file_pref)}') 47 | 48 | tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors, \ 49 | tgt_actedge_masks, tgt_actedge_cur_nodes, tgt_actedge_pre_nodes, tgt_actedge_directions = \ 50 | load_actstates_fromfile(out_file_pref, actions_dict) 51 | 52 | import pdb; pdb.set_trace() 53 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_tolist.py: -------------------------------------------------------------------------------- 1 | from fairseq.data import Dictionary 2 | 3 | import sys 4 | # import importlib 5 | # sys.path.insert(0, '..') 6 | # importlib.import_module('fairseq_ext') 7 | # sys.path.pop(0) 8 | from fairseq_ext.amr_spec.action_info_binarize import binarize_actstates_tolist, binarize_actstates_tolist_workers 9 | 10 | 11 | if __name__ == '__main__': 12 | if len(sys.argv) > 1: 13 | num_workers = int(sys.argv[1]) 14 | else: 15 | num_workers = 1 16 | 17 | split = 'dev' 18 | # split = 'train' 19 | 20 | en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.en' 21 | actions_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.actions' 22 | actions_dict = Dictionary.load( 23 | '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/databin/dict.actions_nopos.txt' 24 | ) 25 | 26 | # tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors = binarize_actstates_tolist(en_file, actions_file, 27 | # actions_dict=actions_dict) 28 | # TODO not working for num_workers > 1 29 | tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors, \ 30 | tgt_actedge_masks, tgt_actedge_cur_nodes, tgt_actedge_pre_nodes, tgt_actedge_directions = \ 31 | binarize_actstates_tolist_workers(en_file, actions_file, actions_dict=actions_dict, num_workers=num_workers) 32 | 33 | import pdb 34 | pdb.set_trace() 35 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_amr_action_bpe.py: -------------------------------------------------------------------------------- 1 | from fairseq_ext.data.amr_bpe import AMRActionBPEEncoder, AMRActionBartDictionary 2 | 3 | 4 | if __name__ == '__main__': 5 | # file paths 6 | encoder_json_path = 'DATA/gpt2_bpe/encoder.json' 7 | vocab_bpe_path = 'DATA/gpt2_bpe/vocab.bpe' 8 | dict_txt_path = 'DATA/gpt2_bpe/dict.txt' 9 | node_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.nodes' 10 | others_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.others' 11 | 12 | # build the bpe encoder 13 | act_bpe = AMRActionBPEEncoder.build_bpe_encoder(encoder_json_path, # or None to use cached 14 | vocab_bpe_path, # or None to use cached 15 
| # add new symbols 16 | node_freq_min=5, 17 | node_file_path=node_file_path, 18 | others_file_path=others_file_path 19 | ) 20 | 21 | actions = 'SHIFT SHIFT clear-06 ROOT SHIFT SHIFT thing prepare-01 >RA(:ARG1-of) SHIFT prior-to >RA(:time) ' \ 22 | 'SHIFT SHIFT SHIFT COPY >RA(:op1) SHIFT SHIFT - SHIFT construct-01 >LA(:polarity) >LA(:ARG1) >RA(:ARG1) ' \ 23 | 'SHIFT SHIFT SHIFT base-02 >RA(:ARG1-of) SHIFT SHIFT SHIFT COPY SHIFT COPY >LA(:mod) SHIFT simulate-01 >LA(:ARG1) >RA(:ARG2) SHIFT SHIFT' 24 | bpe_token_ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_bpe.encode_actions(actions) 25 | 26 | breakpoint() 27 | 28 | # build the action dictionary 29 | act_dict = AMRActionBartDictionary(dict_txt_path, # or None to use cached 30 | node_freq_min=5, 31 | node_file_path=node_file_path, 32 | others_file_path=others_file_path) 33 | 34 | ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_dict.encode_actions(actions) 35 | 36 | breakpoint() 37 | 38 | print(act_dict.decode_actions(ids)) 39 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_amr_action_unk.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from tqdm import tqdm 4 | 5 | from fairseq_ext.data.amr_bpe import AMRActionBPEEncoder, AMRActionBartDictionary 6 | from fairseq_ext.amr_reform.o10_action_reformer_subtok import AMRActionReformerSubtok 7 | from transition_amr_parser.amr_machine import AMRStateMachine 8 | 9 | 10 | if __name__ == '__main__': 11 | # file paths 12 | encoder_json_path = 'DATA/gpt2_bpe/encoder.json' 13 | vocab_bpe_path = 'DATA/gpt2_bpe/vocab.bpe' 14 | dict_txt_path = 'DATA/gpt2_bpe/dict.txt' 15 | node_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.nodes' 16 | others_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.others' 17 | 18 | split = 'train' 19 | en_file = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/{split}.tokens' 20 | actions_file = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/{split}.actions' 21 | machine_config = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/machine_config.json' 22 | 23 | # # build the bpe encoder 24 | # act_bpe = AMRActionBPEEncoder.build_bpe_encoder(encoder_json_path, # or None to use cached 25 | # vocab_bpe_path, # or None to use cached 26 | # # add new symbols 27 | # node_freq_min=5, 28 | # node_file_path=node_file_path, 29 | # others_file_path=others_file_path 30 | # ) 31 | 32 | # actions = 'SHIFT SHIFT clear-06 ROOT SHIFT SHIFT thing prepare-01 >RA(:ARG1-of) SHIFT prior-to >RA(:time) ' \ 33 | # 'SHIFT SHIFT SHIFT COPY >RA(:op1) SHIFT SHIFT - SHIFT construct-01 >LA(:polarity) >LA(:ARG1) >RA(:ARG1) ' \ 34 | # 'SHIFT SHIFT SHIFT base-02 >RA(:ARG1-of) SHIFT SHIFT SHIFT COPY SHIFT COPY >LA(:mod) SHIFT simulate-01 >LA(:ARG1) >RA(:ARG2) SHIFT SHIFT' 35 | # bpe_token_ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_bpe.encode_actions(actions) 36 | 37 | # breakpoint() 38 | 39 | # build the action dictionary 40 | act_dict = AMRActionBartDictionary(dict_txt_path, # or None to use cached 41 | node_freq_min=5, 42 | node_file_path=node_file_path, 43 | others_file_path=others_file_path) 44 | 45 | # ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_dict.encode_actions(actions) 46 | 47 | # breakpoint() 48 | 49 | # print(act_dict.decode_actions(ids)) 50 | 51 | # check for unk symbol in the 
data 52 | machine = AMRStateMachine.from_config(machine_config) 53 | 54 | replaced = Counter() 55 | current_unk = [] 56 | 57 | def replaced_consumer(word, idx): 58 | if idx == act_dict.unk_index and word != act_dict.unk_word: 59 | replaced.update([word]) 60 | current_unk.append(word) 61 | 62 | with open(en_file, 'r') as f, open(actions_file, 'r') as g: 63 | for tokens, actions in tqdm(zip(f, g)): 64 | if tokens.strip(): 65 | tokens = tokens.strip().split('\t') 66 | actions = actions.strip().split('\t') 67 | 68 | if actions[-1] != 'CLOSE': 69 | actions = actions.copy() 70 | actions.append('CLOSE') 71 | 72 | actions_states = AMRActionReformerSubtok.reform_actions_and_get_states(tokens, actions, 73 | act_dict, machine) 74 | v = actions_states['actions_nopos_out'] 75 | 76 | current_unk = [] 77 | 78 | ids = act_dict.encode_line( 79 | line=[act if act != 'CLOSE' else act_dict.eos_word for act in v], 80 | line_tokenizer=lambda x: x, # already tokenized 81 | add_if_not_exist=False, 82 | consumer=replaced_consumer, 83 | append_eos=False, 84 | reverse_order=False 85 | ) 86 | 87 | if current_unk: 88 | print(replaced) 89 | print(current_unk) 90 | print(actions) 91 | breakpoint() 92 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_composite_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairseq.data import Dictionary 3 | from fairseq_ext.extract_bart.composite_embeddings import CompositeEmbeddingBART 4 | 5 | 6 | if __name__ == '__main__': 7 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart/EXP/data/graphmp-swaparc-ptrlast_o8.3_act-states/processed/dict.actions_nopos.txt' 8 | vocab = Dictionary.load(vocab_path) 9 | 10 | bart = torch.hub.load('pytorch/fairseq', 'bart.base') 11 | 12 | cemb = CompositeEmbeddingBART(bart, bart.model.decoder.embed_tokens, vocab) 13 | 14 | indices = torch.tensor([[1, 3, 8], [10, 5000, 666]]) 15 | 16 | indices = indices.cuda() 17 | cemb.to('cuda') 18 | 19 | embeddings = cemb(indices, update=True) 20 | 21 | breakpoint() 22 | 23 | # test backprop 24 | optimizer = torch.optim.SGD(cemb.parameters(), lr=1) 25 | for i in range(2): 26 | print() 27 | optimizer.zero_grad() 28 | print(cemb.base_embeddings.weight[:1].sum()) 29 | print(cemb.base_embeddings.weight[:2]) 30 | ll = cemb(torch.tensor([0, 1, 2]).cuda(), update=True).sum() * 10 31 | ll.backward() 32 | print(cemb.base_embeddings.weight.grad) 33 | optimizer.step() 34 | print(cemb.base_embeddings.weight[:1].sum()) 35 | print(cemb.base_embeddings.weight[:2]) 36 | print() 37 | 38 | breakpoint() 39 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_composite_embeddings_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | import torch 5 | from fairseq.data import Dictionary 6 | from fairseq_ext.extract_bart.composite_embeddings import CompositeEmbeddingBART, transform_action_symbol 7 | 8 | 9 | if __name__ == '__main__': 10 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/EXP/data/o10_act-states/processed/dict.actions_nopos.txt' 11 | vocab = Dictionary.load(vocab_path) 12 | 13 | bart = torch.hub.load('pytorch/fairseq', 'bart.base') 14 | 15 | cemb = CompositeEmbeddingBART(bart, bart.model.decoder.embed_tokens, vocab) 16 | 17 | trans_actions = [] 18 | for sym in tqdm(vocab.symbols): 19 | new_sym = 
transform_action_symbol(sym) # str 20 | splitted = cemb.sub_tokens(new_sym) # list 21 | # trans_actions.append((new_sym, splitted)) 22 | trans_actions.append(new_sym + ' --> ' + '|' + '|'.join(splitted) + '|' + '\n') 23 | 24 | tmp_dir = 'fairseq_ext/tests_data' 25 | os.makedirs(tmp_dir, exist_ok=True) 26 | with open(os.path.join(tmp_dir, 'dict.actions_nopos.bartmap.txt'), 'w') as f: 27 | f.writelines(trans_actions) 28 | 29 | breakpoint() 30 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_factored_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairseq.data import Dictionary 3 | from fairseq_ext.modules.factored_embeddings import FactoredEmbeddings 4 | 5 | 6 | if __name__ == '__main__': 7 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-o8/EXP/data/graphmp-swaparc-ptrlast_o8.3_act-states/processed/dict.actions_nopos.txt' 8 | embed_dim = 256 9 | vocab = Dictionary.load(vocab_path) 10 | femb = FactoredEmbeddings(vocab, embed_dim) 11 | 12 | indices = torch.tensor([[1, 3, 8], [10, 5000, 666]]).cuda() 13 | femb.to('cuda') 14 | 15 | embeddings = femb(indices) 16 | 17 | breakpoint() 18 | -------------------------------------------------------------------------------- /src/fairseq_ext/tokenizer.py: -------------------------------------------------------------------------------- 1 | def tokenize_line_tab(line): 2 | line = line.strip() 3 | return line.split('\t') 4 | -------------------------------------------------------------------------------- /src/fairseq_ext/utils_font.py: -------------------------------------------------------------------------------- 1 | # FONT_COLORORS 2 | FONT_COLOR = { 3 | 'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 4 | 'magenta': 35, 'cyan': 36, 'light gray': 37, 'dark gray': 90, 5 | 'light red': 91, 'light green': 92, 'light yellow': 93, 6 | 'light blue': 94, 'light magenta': 95, 'light cyan': 96, 'white': 97 7 | } 8 | 9 | # BG FONT_COLORORS 10 | BACKGROUND_COLOR = { 11 | 'black': 40, 'red': 41, 'green': 42, 'yellow': 43, 'blue': 44, 12 | 'magenta': 45, 'cyan': 46, 'light gray': 47, 'dark gray': 100, 13 | 'light red': 101, 'light green': 102, 'light yellow': 103, 14 | 'light blue': 104, 'light magenta': 105, 'light cyan': 106, 15 | 'white': 107 16 | } 17 | 18 | 19 | def white_background(string): 20 | return "\033[107m%s\033[0m" % string 21 | 22 | 23 | def red_background(string): 24 | return "\033[101m%s\033[0m" % string 25 | 26 | 27 | def black_font(string): 28 | return "\033[30m%s\033[0m" % string 29 | 30 | 31 | def yellow_font(string): 32 | return "\033[93m%s\033[0m" % string 33 | 34 | 35 | def stack_style(string): 36 | return black_font(white_background(string)) 37 | 38 | 39 | def ordered_exit(signum, frame): 40 | print("\nStopped by user\n") 41 | exit(0) 42 | -------------------------------------------------------------------------------- /src/fairseq_ext/utils_import.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | 5 | 6 | # ========== adapted from 7 | # https://github.com/pytorch/fairseq/blob/83e615d66905b8ca7483122a37da1a85f13f4b8e/fairseq/utils.py#L431 8 | # to avoid error in our setup 9 | # ========== 10 | def import_user_module(args): 11 | module_path = getattr(args, "user_dir", None) 12 | if module_path is not None: 13 | module_path = os.path.abspath(args.user_dir) 14 | if not os.path.exists(module_path): 
15 | fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir) 16 | if os.path.exists(fairseq_rel_path): 17 | module_path = fairseq_rel_path 18 | else: 19 | fairseq_rel_path = os.path.join( 20 | os.path.dirname(__file__), "..", args.user_dir 21 | ) 22 | if os.path.exists(fairseq_rel_path): 23 | module_path = fairseq_rel_path 24 | else: 25 | raise FileNotFoundError(module_path) 26 | 27 | # ensure that user modules are only imported once 28 | import_user_module.memo = getattr(import_user_module, "memo", set()) 29 | if module_path not in import_user_module.memo: 30 | import_user_module.memo.add(module_path) 31 | 32 | module_parent, module_name = os.path.split(module_path) 33 | if module_name not in sys.modules: 34 | sys.path.insert(0, module_parent) 35 | importlib.import_module(module_name) 36 | # else: 37 | # raise ImportError( 38 | # "Failed to import --user-dir={} because the corresponding module name " 39 | # "({}) is not globally unique. Please rename the directory to " 40 | # "something unique and try again.".format(module_path, module_name) 41 | # ) 42 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/README.md: -------------------------------------------------------------------------------- 1 | # Install (Compatible w. AMR Parser) 2 | 3 | ``` 4 | cd transition-amr-parser 5 | 6 | conda create --name torch-1.4 python=3.6 7 | conda activate torch-1.4 8 | conda install -y pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch 9 | pip install h5py # Required for elmo embeddings. 10 | pip install -e . 11 | conda install -c dglteam "dgl-cuda10.1<0.5" 12 | ``` 13 | 14 | Changes for CPU: 15 | 16 | ``` 17 | conda install -y pytorch==1.4.0 torchvision==0.5.0 -c pytorch 18 | pip install dgl==0.4.3.post2 19 | ``` 20 | 21 | For GCN support, need to install latest torch-geometric. 22 | 23 | # Install (with newer torch for easy GCN support) 24 | 25 | ``` 26 | conda create -n ibm-amr-aligner python=3.8 -y 27 | conda activate ibm-amr-aligner 28 | 29 | # Use torch 1.8, since newer causes issue in torch-geometric. 30 | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge -y 31 | 32 | pip install h5py # Required for elmo embeddings. 33 | # (NOT TESTED) # conda install -c dglteam dgl-cuda11.1 -y # Installs DGL for TreeLSTM support. 34 | conda install pyg -c pyg -c conda-forge -y # Installs torch-geometric for GCN support. 35 | 36 | # The next step is tricky. Need to install AMR parser, but requires modifying `setup.py` 37 | 38 | # Step 1: 39 | vim setup.py # Comment out line about torch 1.4. 40 | 41 | # Step 2: 42 | pip install -e . 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/ibm_neural_aligner/__init__.py -------------------------------------------------------------------------------- /src/ibm_neural_aligner/alignment_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from vocab import PADDING_IDX 5 | 6 | 7 | class AlignmentDecoder(object): 8 | 9 | def batch_decode(self, batch_map, model_output): 10 | """ 11 | For each node, find most probable alignments. 
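Yields one dict per batch element with: 'node_alignments', a list of (node_id, [token_idx]) pairs sorted by node id; 'posterior', the per-node alignment scores over source tokens; and 'argmax', the index of the best-scoring token for each node.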
12 | """ 13 | x_t = batch_map['text_tokens'] 14 | y_a = model_output['labels'] 15 | y_a_mask = model_output['labels_mask'] 16 | y_a_node_ids = model_output['label_node_ids'] 17 | align = model_output['batch_align'] 18 | 19 | batch_size, len_t = x_t.shape 20 | len_a = y_a.shape[-1] 21 | device = x_t.device 22 | 23 | for i_b in range(batch_size): 24 | 25 | # variables 26 | 27 | indexa = torch.arange(len_a).to(device) 28 | indext = torch.arange(len_t).to(device) 29 | 30 | # select 31 | 32 | b_x_t = x_t[i_b] 33 | b_y_a_mask = y_a_mask[i_b].view(-1) 34 | b_align = align[i_b] 35 | 36 | # mask 37 | 38 | b_x_t_mask = b_x_t != PADDING_IDX 39 | b_indexa = indexa[b_y_a_mask] 40 | b_indext = indext[b_x_t_mask] 41 | 42 | n = b_y_a_mask.sum().item() 43 | nt = b_x_t_mask.sum().item() 44 | 45 | assert b_align.shape == (n, nt, 1) 46 | 47 | # decode 48 | 49 | argmax = b_align.squeeze(2).argmax(1) 50 | 51 | assert argmax.shape == (n,) 52 | 53 | # node alignments 54 | 55 | node_alignments = [] 56 | for j in range(n): 57 | node_id = y_a_node_ids[i_b, b_indexa[j]].item() 58 | idx_txt = argmax[j].item() 59 | node_alignments.append((node_id, [idx_txt])) 60 | 61 | # fix order 62 | 63 | node_id_list = [x[0] for x in node_alignments] 64 | order = np.argsort(node_id_list) 65 | 66 | node_alignments = [node_alignments[idx] for idx in order] 67 | b_align = b_align[order] 68 | argmax = argmax[order] 69 | 70 | # result 71 | 72 | info = {} 73 | info['node_alignments'] = node_alignments 74 | info['posterior'] = b_align 75 | info['argmax'] = argmax 76 | 77 | yield info 78 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/dummy_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | from transition_amr_parser.io import read_amr 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--in-amr', default=None, required=True, type=str) 8 | parser.add_argument('--out-amr', default=None, required=True, type=str) 9 | args = parser.parse_args() 10 | 11 | 12 | def dummy_align(amr): 13 | amr = copy.deepcopy(amr) 14 | alignments = {} 15 | for k in sorted(amr.nodes.keys()): 16 | alignments[k] = [0] 17 | amr.alignments = alignments 18 | return amr 19 | 20 | 21 | if __name__ == '__main__': 22 | corpus = read_amr(args.in_amr, jamr=False) 23 | with open(args.out_amr, 'w') as f: 24 | for amr in corpus: 25 | amr = dummy_align(amr) 26 | f.write(f'{amr.__str__()}\n') 27 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | try: 5 | from torch_geometric.data import Batch, Data 6 | import torch_geometric.nn as gnn 7 | except: 8 | pass 9 | 10 | from vocab_definitions import MaskInfo 11 | 12 | 13 | class GCNEncoder(nn.Module): 14 | def __init__(self, embed, size, mode='gcn', dropout_p=0, num_layers=None): 15 | super().__init__() 16 | 17 | num_layers = 2 if num_layers is None else num_layers 18 | 19 | self.enc = GCN(embed, size, mode=mode, num_layers=num_layers) 20 | 21 | self.embed = embed 22 | self.size = size 23 | self.output_size = self.enc.output_size 24 | 25 | self.dropout_p = dropout_p 26 | self.dropout = nn.Dropout(p=dropout_p) 27 | 28 | @property 29 | def device(self): 30 | return next(self.parameters()).device 31 | 32 | def forward(self, batch_map): 33 | """ 34 | Returns: 35 | 36 | - output: BxLxD 37 | - 
labels: BxL from AMR vocab. 38 | - labels_mask: True if label, else False. Useful for padding and edge labels. 39 | - label_node_ids: Roughly, torch.arange(len(nodes)). 40 | """ 41 | device = batch_map['device'] 42 | batch_size = len(batch_map['items']) 43 | 44 | data = Batch.from_data_list([x['geometric_data'].clone().to(device) for x in batch_map['items']]) 45 | 46 | node_lengths = batch_map['amr_node_mask'].sum(-1).tolist() 47 | edge_lengths = [x['geometric_data'].y.shape[0] - n for x, n in zip(batch_map['items'], node_lengths)] 48 | max_node_length = max(node_lengths) 49 | size = self.enc.output_size 50 | 51 | gcn_output = self.enc(batch_map, data) 52 | 53 | shape = (sum(node_lengths) + sum(edge_lengths), size) 54 | assert gcn_output.shape == shape, (shape, gcn_output.shape) 55 | 56 | if True: 57 | new_h = torch.zeros(batch_size, max_node_length, size, dtype=torch.float, device=device) 58 | labels = torch.zeros(batch_size, max_node_length, dtype=torch.long, device=device) 59 | labels_mask = torch.zeros(batch_size, max_node_length, dtype=torch.bool, device=device) 60 | label_node_ids = torch.full((batch_size, max_node_length), -1, dtype=torch.long, device=device) 61 | 62 | offset = 0 63 | for i_b in range(batch_size): 64 | n = node_lengths[i_b] 65 | n_e = edge_lengths[i_b] 66 | if batch_map['add_edges'] == False: 67 | assert n_e == 0 68 | 69 | # 70 | new_h[i_b, :n] = gcn_output[offset:offset + n] 71 | labels[i_b, :n] = data.y[offset:offset + n] 72 | labels_mask[i_b, :n] = True 73 | label_node_ids[i_b, :n] = torch.arange(n, dtype=torch.long, device=device) 74 | 75 | # 76 | offset += n + n_e 77 | 78 | output = new_h 79 | output = self.dropout(output) 80 | 81 | return output, labels, labels_mask, label_node_ids 82 | 83 | 84 | class GCN(torch.nn.Module): 85 | def __init__(self, embed, size, mode='gcn', num_layers=2): 86 | super().__init__() 87 | 88 | self.num_layers = num_layers 89 | self.embed = embed 90 | self.size = size 91 | self.output_size = size 92 | 93 | input_size = embed.output_size 94 | 95 | self.W_node = nn.Linear(input_size, size) 96 | 97 | if mode == 'gcn': 98 | for i in range(num_layers): 99 | setattr(self, 'conv{}'.format(i + 1), gnn.GCNConv(size, size)) 100 | elif mode == 'gcn_transformer': 101 | for i in range(num_layers): 102 | setattr(self, 'conv{}'.format(i + 1), gnn.TransformerConv(size, size)) 103 | elif mode == 'gcn_film': 104 | for i in range(num_layers): 105 | setattr(self, 'conv{}'.format(i + 1), gnn.FiLMConv(size, size)) 106 | elif mode == 'gcn_gated': 107 | self.conv1 = gnn.GatedGraphConv(size, num_layers=num_layers) 108 | self.mode = mode 109 | 110 | self.mask_vec = nn.Parameter(torch.FloatTensor(input_size).normal_()) 111 | 112 | def compute_node_features(self, node_tokens, mask=None): 113 | if mask is None: 114 | return self.W_node(self.embed(node_tokens)) 115 | else: 116 | m = torch.cat(mask, 0) 117 | e = self.embed(node_tokens) 118 | e[m == MaskInfo.masked] = self.mask_vec 119 | return self.W_node(e) 120 | 121 | def forward(self, batch_map, data): 122 | batch_size = len(batch_map['items']) 123 | 124 | data.x = self.compute_node_features(data.y, mask=batch_map['mask_for_gcn']) 125 | 126 | # Hacky way to support any graphs with no edges. 
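# If any graph in the batch has an empty edge_index, message passing is skipped for the whole batch and the linearly projected node features are returned as-is; a per-graph fallback would require splitting the batch.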
127 | if any([data[i_b].edge_index.shape[0] == 0 for i_b in range(batch_size)]): 128 | return data.x 129 | 130 | x, edge_index = data.x, data.edge_index 131 | 132 | if self.mode == 'gcn_gated': 133 | x = self.conv1(x, edge_index) 134 | 135 | else: 136 | for i in range(self.num_layers): 137 | x = getattr(self, 'conv{}'.format(i + 1))(x, edge_index) 138 | if i < self.num_layers - 1: 139 | x = torch.relu(x) 140 | 141 | return x 142 | 143 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/setup_amr2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK="AMR2.0" 4 | CACHE="cache-amr2" 5 | 6 | mkdir -p $CACHE 7 | 8 | cp ./DATA/${TASK}/aligned/cofill/train.txt ./${CACHE}/train.aligned.txt 9 | 10 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/dev.txt ./DATA/${TASK}/corpora/dev.txt.no_wiki 11 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/test.txt ./DATA/${TASK}/corpora/test.txt.no_wiki 12 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/train.txt ./DATA/${TASK}/corpora/train.txt.no_wiki 13 | 14 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/dev.txt.no_wiki --out-amr ./${CACHE}/dev.txt.no_wiki 15 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/test.txt.no_wiki --out-amr ./${CACHE}/test.txt.no_wiki 16 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/train.txt.no_wiki --out-amr ./${CACHE}/train.txt.no_wiki 17 | 18 | python src/ibm_neural_aligner/vocab.py \ 19 | --in-amrs \ 20 | ./DATA/${TASK}/aligned/cofill/dev.txt \ 21 | ./DATA/${TASK}/aligned/cofill/test.txt \ 22 | ./DATA/${TASK}/aligned/cofill/train.txt \ 23 | \ 24 | ./DATA/${TASK}/corpora/dev.txt \ 25 | ./DATA/${TASK}/corpora/test.txt \ 26 | ./DATA/${TASK}/corpora/train.txt \ 27 | \ 28 | ./DATA/${TASK}/corpora/dev.txt.no_wiki \ 29 | ./DATA/${TASK}/corpora/test.txt.no_wiki \ 30 | ./DATA/${TASK}/corpora/train.txt.no_wiki \ 31 | --out-text ./${CACHE}/vocab.text.txt \ 32 | --out-amr ./${CACHE}/vocab.amr.txt 33 | 34 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.text.txt 35 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.amr.txt 36 | 37 | cp src/ibm_neural_aligner/setup_amr2.sh $CACHE/setup_data.sh 38 | 39 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/setup_amr3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK="AMR3.0" 4 | CACHE="cache-amr3" 5 | 6 | mkdir -p $CACHE 7 | 8 | cp ./DATA/${TASK}/aligned/cofill/train.txt ./${CACHE}/train.aligned.txt 9 | 10 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/dev.txt ./DATA/${TASK}/corpora/dev.txt.no_wiki 11 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/test.txt ./DATA/${TASK}/corpora/test.txt.no_wiki 12 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/train.txt ./DATA/${TASK}/corpora/train.txt.no_wiki 13 | 14 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/dev.txt.no_wiki --out-amr ./${cache}/dev.txt.no_wiki 15 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/test.txt.no_wiki --out-amr ./${cache}/test.txt.no_wiki 16 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/train.txt.no_wiki 
--out-amr ./${cache}/train.txt.no_wiki 17 | 18 | python src/ibm_neural_aligner/vocab.py \ 19 | --in-amrs \ 20 | ./DATA/${TASK}/aligned/cofill/dev.txt \ 21 | ./DATA/${TASK}/aligned/cofill/test.txt \ 22 | ./DATA/${TASK}/aligned/cofill/train.txt \ 23 | \ 24 | ./DATA/${TASK}/corpora/dev.txt \ 25 | ./DATA/${TASK}/corpora/test.txt \ 26 | ./DATA/${TASK}/corpora/train.txt \ 27 | \ 28 | ./DATA/${TASK}/corpora/dev.txt.no_wiki \ 29 | ./DATA/${TASK}/corpora/test.txt.no_wiki \ 30 | ./DATA/${TASK}/corpora/train.txt.no_wiki \ 31 | --out-text ./${CACHE}/vocab.text.txt \ 32 | --out-amr ./${CACHE}/vocab.amr.txt 33 | 34 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.text.txt 35 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.amr.txt 36 | 37 | cp src/ibm_neural_aligner/setup_amr3.sh $CACHE/setup_data.sh 38 | 39 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/view_sweep.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import os 5 | 6 | 7 | errors = collections.Counter() 8 | 9 | class DidNotEval(ValueError): 10 | pass 11 | 12 | class DidNotTrain(ValueError): 13 | pass 14 | 15 | class NoModel(ValueError): 16 | pass 17 | 18 | 19 | def main(args): 20 | with open(args.file_list) as f: 21 | file_list = f.read().strip().split('\n') 22 | print('file_list', len(file_list)) 23 | 24 | def readfile(path): 25 | eval_path, train_path = path.split() 26 | 27 | path = eval_path 28 | slurm_out = os.path.join(path, 'slurm.out') 29 | eval_json = os.path.join(path, 'train.aligned.txt.eval.json') 30 | flags_json = os.path.join(train_path, 'flags.json') 31 | 32 | if not os.path.exists(slurm_out): 33 | print('did not eval {} {}'.format(train_path, eval_path)) 34 | raise DidNotEval('') 35 | 36 | if not os.path.exists(flags_json): 37 | print('did not train {} {}'.format(train_path, eval_path)) 38 | raise DidNotTrain('') 39 | 40 | with open(flags_json) as f: 41 | train_flags = json.loads(f.read()) 42 | 43 | eval_flags = None 44 | try: 45 | flags_json = os.path.join(eval_path, 'flags.json') 46 | with open(flags_json) as f: 47 | eval_flags = json.loads(f.read()) 48 | except: 49 | 50 | with open(slurm_out) as f: 51 | for i, line in enumerate(f): 52 | if line.startswith('{'): 53 | if line[1] == "'": 54 | eval_flags = eval(line) 55 | else: 56 | eval_flags = json.loads(line.strip()) 57 | break 58 | 59 | train_slurm = os.path.join(train_path, 'slurm.out') 60 | 61 | if eval_flags is None: 62 | print('nothing found', slurm_out, train_slurm) 63 | raise ValueError 64 | 65 | model_path = eval_flags['load'] 66 | 67 | if not os.path.exists(model_path): 68 | print('no model {} {}'.format(train_path, eval_path)) 69 | raise NoModel 70 | 71 | if os.path.exists(slurm_out) and not os.path.exists(eval_json): 72 | print('possible error {} {} {} {}'.format(slurm_out, eval_flags['hostname'], train_slurm, train_flags['hostname'])) 73 | errors['train-{}'.format(train_flags['hostname'])] += 1 74 | errors['eval-{}'.format(eval_flags['hostname'])] += 1 75 | 76 | if not os.path.exists(eval_json): 77 | raise ValueError 78 | 79 | # read eval_json 80 | with open(eval_json) as f: 81 | o = json.loads(f.read()) 82 | o['path'] = path 83 | o['train_flags'] = train_flags 84 | o['eval_flags'] = eval_flags 85 | return o 86 | 87 | def try_map(items, func): 88 | for x in items: 89 | try: 
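# readfile raises ValueError subclasses (DidNotEval, DidNotTrain, NoModel) for incomplete runs, so those entries are skipped silently.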
90 | yield func(x) 91 | except ValueError: 92 | continue 93 | 94 | def groupby(data): 95 | groups = collections.defaultdict(list) 96 | 97 | for ex in data: 98 | groups[ex['train_flags']['log_dir']].append(ex) 99 | 100 | return groups 101 | 102 | for k, v in sorted(errors.items(), key=lambda x: x[1]): 103 | print(k, v) 104 | 105 | data = [x for x in try_map(file_list, readfile)] 106 | for ex in data: 107 | recall = ex['Corpus Recall using spans for gold']['recall'] 108 | ex['recall'] = recall 109 | 110 | groups = groupby(data) 111 | 112 | for group in sorted(groups.values(), key=lambda x: max(map(lambda x: x['recall'], x))): 113 | print(group[0]['train_flags']['log_dir']) 114 | for ex in sorted(group, key=lambda x: x['path']): 115 | print(ex['recall'], ex['path']) 116 | 117 | print('data', len(data), 'groups', len(groups)) 118 | 119 | if __name__ == '__main__': 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--file-list', default='eval_json.2021-11-05a.txt', type=str) 122 | args = parser.parse_args() 123 | 124 | print(args.__dict__) 125 | 126 | main(args) 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/install.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | 4 | # activate normal env 5 | . set_environment.sh 6 | 7 | # load normal env 8 | # install DGL, in addition to normall install in README 9 | conda install -y -c dglteam "dgl-cuda10.1<0.5" 10 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/leamr_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | 4 | from austin_amr_utils.amr_readers import AMR_Reader 5 | from transition_amr_parser.io import read_amr2 6 | 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--in-amr', default=None, required=True, type=str) 10 | parser.add_argument('--out-amr', default=None, required=True, type=str) 11 | args = parser.parse_args() 12 | 13 | corpus = AMR_Reader().load(args.in_amr) 14 | 15 | with open(args.out_amr,'w') as f: 16 | for amr in corpus: 17 | f.write(amr.amr_string().strip() + '\n\n') 18 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/lexicon.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | 5 | from amr_utils import safe_read as safe_read_ 6 | from tqdm import tqdm 7 | 8 | def safe_read(path, **kwargs): 9 | kwargs['ibm_format'], kwargs['tokenize'] = True, False 10 | return safe_read_(path, **kwargs) 11 | 12 | #path_old = './DATA/AMR2.0/aligned/cofill/dev.txt' 13 | #path_neu = './tmp_out/dev.aligned.txt' 14 | path_old = './DATA/AMR2.0/aligned/cofill/train.txt' 15 | path_neu = './tmp_out/train.aligned.txt' 16 | 17 | def build_lexicon(path): 18 | amrs = safe_read(path) 19 | lexicon = collections.defaultdict(collections.Counter) 20 | 21 | for amr in tqdm(amrs): 22 | for node_id, text_id_list in amr.alignments.items(): 23 | if len(text_id_list) > 1: 24 | continue 25 | 26 | for text_id in text_id_list: 27 | text = amr.tokens[text_id] 28 | node = amr.nodes[node_id] 29 | 30 | lexicon[node][text] += 1 31 | 32 | return lexicon 33 | 34 | lex_old = build_lexicon(path_old) 35 | lex_neu = build_lexicon(path_neu) 36 | 37 | def compare_lexicon(lex_old, lex_neu): 38 | for node, lex in 
sorted(lex_old.items(), key=lambda x: len(x[1])): 39 | 40 | row_old = [] 41 | for text in sorted(lex_old[node].keys()): 42 | row_old.append(text) 43 | 44 | row_neu = [] 45 | for text in sorted(lex_neu[node].keys()): 46 | row_neu.append(text) 47 | 48 | print(node) 49 | print('old', row_old) 50 | print('neu', row_neu) 51 | print('') 52 | 53 | def compare_lexicon_stats(lex_old, lex_neu): 54 | threshold = 10 55 | stats = collections.Counter() 56 | for node, lex in sorted(lex_old.items(), key=lambda x: len(x[1])): 57 | 58 | row_old = [] 59 | for text in sorted(lex_old[node].keys()): 60 | row_old.append(text) 61 | 62 | row_neu = [] 63 | for text in sorted(lex_neu[node].keys()): 64 | row_neu.append(text) 65 | 66 | stats['total'] += 1 67 | 68 | if len(row_old) >= len(row_neu): 69 | stats['old >= neu'] += 1 70 | else: 71 | print(f'old < neu, {len(row_old)} - {len(row_neu)} = {len(row_old) - len(row_neu)}') 72 | print(node) 73 | print('old', row_old) 74 | print('neu', row_neu) 75 | print('') 76 | 77 | if len(row_old) == len(row_neu): 78 | stats['old == neu'] += 1 79 | 80 | if len(row_old) <= threshold: 81 | stats['old <= t'] += 1 82 | 83 | if len(row_neu) <= threshold: 84 | stats['neu <= t'] += 1 85 | else: 86 | print(f'neu > t, {len(row_old)} - {len(row_neu)} = {len(row_old) - len(row_neu)}') 87 | print(node) 88 | print('old', row_old) 89 | print('neu', row_neu) 90 | print('') 91 | 92 | for k, v in stats.items(): 93 | if k == 'total': 94 | continue 95 | 96 | n = stats['total'] 97 | print(f'{k} : {v} / {n} ({v/n:.3f})') 98 | 99 | def view_lexicon(lexicon): 100 | for node, lex in sorted(lexicon.items(), key=lambda x: len(x[1])): 101 | 102 | row = [] 103 | for text in sorted(lexicon[node].keys()): 104 | row.append(text) 105 | 106 | print('{} {}'.format(node, row)) 107 | 108 | compare_lexicon(lex_old, lex_neu) 109 | compare_lexicon_stats(lex_old, lex_neu) 110 | 111 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/make_splits.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import collections 5 | import numpy as np 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--input', default=os.path.expanduser('~/data/AMR2.0/aligned/cofill/train.txt'), type=str) 9 | args = parser.parse_args() 10 | 11 | def readfile(path): 12 | data = [] 13 | b = None 14 | with open(path) as f: 15 | for line in f: 16 | if line.strip(): 17 | if b is None: 18 | b = '' 19 | b += line 20 | else: 21 | if b is not None: 22 | data.append(b) 23 | b = None 24 | if b is not None: 25 | data.append(b) 26 | return data 27 | 28 | def writefile(data, path): 29 | print('writing', path) 30 | with open(path, 'w') as f: 31 | for b in data: 32 | f.write(b) 33 | f.write('\n') 34 | 35 | 36 | # read 37 | data = readfile(args.input) 38 | print(len(data)) 39 | 40 | # shuffle 41 | np.random.seed(113) 42 | np.random.shuffle(data) 43 | 44 | # split 45 | n = 1000 46 | 47 | # train 48 | train = data[n:] 49 | 50 | # unseen dev 51 | unseen = data[:n] 52 | 53 | # seen dev 54 | seen = train[:n] 55 | 56 | # write 57 | writefile(train, args.input + '.train-v1') 58 | writefile(unseen, args.input + '.dev-unseen-v1') 59 | writefile(seen, args.input + '.dev-seen-v1') 60 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/metric_utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 
import numpy as np 4 | 5 | from ibm_neural_aligner.amr_utils import convert_amr_to_tree, compute_pairwise_distance, get_node_ids 6 | 7 | 8 | def fertility_proxy(amr, ignore_nodes=('country', '-', 'and', 'person', 'name')): 9 | """ Measures the average number of aligned words per sentence. 10 | 11 | Lower indicates higher fertility. 12 | """ 13 | alignments = amr.alignments.copy() 14 | 15 | for k in list(alignments.keys()): 16 | if ignore_nodes is not None and amr.nodes[k] in ignore_nodes: 17 | del alignments[k] 18 | 19 | return len(set([v[0] for k, v in alignments.items()])) 20 | 21 | 22 | def distortion_proxy(amr, pairwise_dist=None): 23 | """ Measures the difference between implied and actual distance. 24 | 25 | Lower indicates lower distortion. 26 | """ 27 | if len(amr.nodes) == 1 or len(amr.alignments) == 1: 28 | return 0, [] 29 | 30 | if pairwise_dist is None: 31 | tree = convert_amr_to_tree(amr) 32 | pairwise_dist = compute_pairwise_distance(tree) 33 | node_ids = get_node_ids(amr) 34 | 35 | c = collections.defaultdict(list) 36 | 37 | for i in range(len(node_ids)): 38 | for j in range(len(node_ids)): 39 | if i <= j: 40 | continue 41 | node1, node2 = node_ids[i], node_ids[j] 42 | if node1 not in amr.alignments or node2 not in amr.alignments: 43 | continue 44 | pos1, pos2 = amr.alignments[node1][0], amr.alignments[node2][0] 45 | c['i'].append(i) 46 | c['j'].append(j) 47 | c['pos1'].append(pos1) 48 | c['pos2'].append(pos2) 49 | 50 | actual_distance = np.abs(np.array(c['pos1']) - np.array(c['pos2'])) 51 | implied_distance = pairwise_dist[c['i'], c['j']].numpy() 52 | proxy = np.power(np.clip(actual_distance - implied_distance, 0, np.inf), 2) 53 | 54 | return proxy.mean(), proxy 55 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/pretrained_embeddings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import json 4 | import os 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from tqdm import tqdm 10 | 11 | from ibm_neural_aligner.vocab_definitions import BOS_TOK, EOS_TOK, special_tokens 12 | from ibm_neural_aligner.standalone_elmo import batch_to_ids, ElmoCharacterEncoder, remove_sentence_boundaries 13 | 14 | 15 | # files for original elmo model 16 | weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5' 17 | options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json' 18 | 19 | 20 | def maybe_download(remote_url, cache_dir): 21 | path = os.path.join(cache_dir, os.path.basename(remote_url)) 22 | if not os.path.exists(path): 23 | os.system(f'curl {remote_url} -o {path} -L') 24 | return path 25 | 26 | 27 | def hash_string_list(string_list): 28 | m = hashlib.sha256() 29 | for s in string_list: 30 | m.update(str.encode(s)) 31 | return m.hexdigest()[:8] 32 | 33 | 34 | def read_text_vocab_file(path): 35 | output = [] 36 | with open(path) as f: 37 | for token in f.read().splitlines(): 38 | output.append(token) 39 | return output 40 | 41 | 42 | def read_amr_vocab_file(path): 43 | output = [] 44 | with open(path) as f: 45 | for token in f.read().splitlines(): 46 | output.append(token) 47 | return output 48 | 49 | 50 | def get_character_embeddings_from_elmo(tokens, cache_dir, cuda=False): 51 | assert len(special_tokens) == 3 52 | assert tokens[1] == BOS_TOK and tokens[2] == 
EOS_TOK 53 | 54 | # Remove special tokens. 55 | vocab_to_cache = tokens[3:] 56 | 57 | size = 512 58 | batch_size = 1024 59 | 60 | char_embedder = ElmoCharacterEncoder( 61 | options_file=maybe_download(options_file, cache_dir=cache_dir), 62 | weight_file=maybe_download(weights_file, cache_dir=cache_dir), 63 | requires_grad=False) 64 | if cuda: 65 | char_embedder.cuda() 66 | 67 | all_vocab_to_cache = [BOS_TOK, EOS_TOK] + vocab_to_cache 68 | 69 | shape = (1 + len(all_vocab_to_cache), size) 70 | embeddings = np.zeros(shape, dtype=np.float32) 71 | 72 | for start in tqdm(range(0, len(all_vocab_to_cache), batch_size), desc='embed'): 73 | end = min(start + batch_size, len(all_vocab_to_cache)) 74 | batch = all_vocab_to_cache[start:end] 75 | batch_ids = batch_to_ids([[x] for x in batch]) 76 | if cuda: 77 | batch_ids = batch_ids.cuda() 78 | output = char_embedder(batch_ids) 79 | vec = remove_sentence_boundaries(output['token_embedding'], output['mask'])[0].squeeze(1) 80 | 81 | embeddings[1 + start:1 + end] = vec.cpu() 82 | 83 | return embeddings 84 | 85 | 86 | def read_embeddings(tokens, path=None, cache_dir=None): 87 | if path is None: 88 | token_hash = hash_string_list(tokens) 89 | if cache_dir: 90 | path = '{}/elmo.{}.npy'.format(cache_dir, token_hash) 91 | else: 92 | path = 'elmo.{}.npy'.format(token_hash) 93 | assert os.path.exists(path), path 94 | print('reading embeddings from {} for {} tokens'.format(path, len(tokens))) 95 | embeddings = np.load(path) 96 | assert embeddings.shape[0] == len(tokens) 97 | return embeddings 98 | 99 | 100 | def write_embeddings(path, embeddings): 101 | np.save(path, embeddings) 102 | 103 | with open(path + '.shape', 'w') as f: 104 | f.write(json.dumps(embeddings.shape)) 105 | 106 | 107 | def main(arg): 108 | 109 | tokens = read_text_vocab_file(args.vocab) 110 | token_hash = hash_string_list(tokens) 111 | 112 | print('found {} tokens with hash = {}'.format(len(tokens), token_hash)) 113 | path = f'{args.cache_dir}/elmo.{token_hash}.npy' 114 | 115 | if os.path.exists(path): 116 | print('embeddings found at {}, exiting'.format(path)) 117 | sys.exit() 118 | 119 | embeddings = get_character_embeddings_from_elmo(tokens, args.cache_dir, args.cuda) 120 | 121 | print(f'writing to {path}') 122 | write_embeddings(path, embeddings) 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | parser = argparse.ArgumentParser() 128 | parser.add_argument("--vocab", type=str, help="Vocab file.", 129 | required=True) 130 | parser.add_argument('--cuda', action='store_true', 131 | help='If true, then use GPU.') 132 | parser.add_argument('--cache-dir', type=str, required=True, 133 | help='Folder to save elmo weights and embeddings.') 134 | args = parser.parse_args() 135 | 136 | if not torch.cuda.is_available(): 137 | print('WARNING: CUDA not available. Falling back to CPU.') 138 | args.cuda = False 139 | 140 | main(args) 141 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/pretrained_embeddings.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | # this requires a special environment with allennlp 5 | [ -z "$1" ] && echo -e "\n$0 /path/to/embeddings/ (where vocab..txt are) \n" 6 | FOLDER=$1 7 | 8 | set -o nounset 9 | 10 | python ibm_neural_aligner/pretrained_embeddings.py --cuda --allow-cpu \ 11 | --vocab $FOLDER/vocab.text.txt \ 12 | --cache-dir $FOLDER/ 13 | python ibm_neural_aligner/pretrained_embeddings.py --cuda --allow-cpu \ 14 | --vocab $FOLDER/vocab.amr.txt \ 15 | --cache-dir $FOLDER/ 16 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/run_detailed_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import os 5 | 6 | from tqdm import tqdm 7 | 8 | import numpy as np 9 | 10 | from evaluation import EvalAlignments 11 | from formatter import amr_to_pretty_format 12 | from transition_amr_parser.io import read_amr 13 | 14 | 15 | class CorpusRecall_WithGoldSpans_WithSomeNodes(object): 16 | 17 | def __init__(self): 18 | self.state = collections.defaultdict(list) 19 | 20 | def update(self, gold, pred, pred_ref): 21 | gold_align, pred_align = gold.alignments, pred.alignments 22 | total, correct = 0, 0 23 | 24 | for node_id in gold_align.keys(): 25 | 26 | # Ignore unaligned nodes 27 | if gold_align[node_id] is None: 28 | continue 29 | 30 | # Penalty for not predicting. 31 | if node_id not in pred_align or pred_align[node_id] is None: 32 | total += 1 33 | continue 34 | 35 | # Be fair. 36 | if node_id not in pred_ref.alignments: 37 | total += 1 38 | continue 39 | 40 | total += 1 41 | 42 | g0 = gold_align[node_id][0] - 1 43 | g1 = gold_align[node_id][-1] - 1 44 | 45 | p0 = pred_align[node_id][0] - 1 46 | p1 = pred_align[node_id][-1] - 1 47 | 48 | gset = set(range(g0, g1 + 1)) 49 | pset = set(range(p0, p1 + 1)) 50 | 51 | if len(set.intersection(pset, gset)) > 0: 52 | correct += 1 53 | 54 | self.state['total'].append(total) 55 | self.state['correct'].append(correct) 56 | 57 | def finish(self): 58 | total = np.sum(self.state['total']).item() 59 | correct = np.sum(self.state['correct']).item() 60 | if total: 61 | recall = correct / total 62 | else: 63 | recall = 0 64 | result = collections.OrderedDict() 65 | result['correct'] = correct 66 | result['total'] = total 67 | result['recall'] = recall 68 | 69 | return result 70 | 71 | 72 | def main(args): 73 | gold = read_amr(args.gold) 74 | neural = read_amr(args.neural) 75 | cofill = read_amr(args.cofill) 76 | 77 | d_gold = {amr.id: amr for amr in gold} 78 | d_neural = {amr.id: amr for amr in neural} 79 | d_cofill = {amr.id: amr for amr in cofill} 80 | 81 | keys = [k for k in d_gold.keys() if k in d_neural and k in d_cofill] 82 | 83 | gold = [d_gold[k] for k in keys] 84 | neural = [d_neural[k] for k in keys] 85 | cofill = [d_cofill[k] for k in keys] 86 | 87 | def check_1(): 88 | for g, p in zip(gold, neural): 89 | for k, v in g.nodes.items(): 90 | assert v == p.nodes[k], (k, v, p.nodes[k], g.id) 91 | 92 | for g, p in zip(gold, cofill): 93 | for k, v in g.nodes.items(): 94 | assert v == p.nodes[k], (k, v, p.nodes[k], g.id) 95 | 96 | def print_result(result, header=None): 97 | def format_value(val): 98 | if isinstance(val, float): 99 | return '{:.3f}'.format(val) 100 | return val 101 | 102 | underl = '-' * len(header) 103 | output = '{}\n{}\n'.format(header, underl) 104 | for k, v in result.items(): 105 | output += '- {} = {}\n'.format(k, format_value(v)) 106 | print(output) 107 | 108 | def run_eval(): 109 | m_n = 
CorpusRecall_WithGoldSpans_WithSomeNodes() 110 | m_c = CorpusRecall_WithGoldSpans_WithSomeNodes() 111 | 112 | for i, (g, p_n, p_c) in tqdm(enumerate(zip(gold, neural, cofill)), desc='eval'): 113 | m_n.update(g, p_n, p_c) 114 | m_c.update(g, p_c, p_c) 115 | 116 | res_n = m_n.finish() 117 | res_c = m_c.finish() 118 | 119 | print_result(res_n, 'Neural') 120 | print_result(res_c, 'COFILL') 121 | 122 | 123 | check_1() 124 | run_eval() 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--gold', default=None, required=True, type=str) 130 | parser.add_argument('--neural', default=None, required=True, type=str) 131 | parser.add_argument('--cofill', default=None, required=True, type=str) 132 | args = parser.parse_args() 133 | 134 | main(args) 135 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/run_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from amr_utils import safe_read 6 | from evaluation import EvalAlignments 7 | from formatter import amr_to_pretty_format 8 | from transition_amr_parser.io import read_amr2 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--gold', default=None, required=True, type=str) 13 | parser.add_argument('--pred', default=None, required=True, type=str) 14 | parser.add_argument('--out-json', default=None, type=str) 15 | parser.add_argument('--subset', action='store_true') 16 | parser.add_argument('--increment', action='store_true') 17 | args = parser.parse_args() 18 | 19 | if args.out_json is None: 20 | args.out_json = args.pred + '.eval.json' 21 | 22 | print('start eval') 23 | 24 | eval_output = EvalAlignments().run(args.gold, args.pred, flexible=True, subset=args.subset, increment=args.increment) 25 | 26 | print(eval_output) 27 | 28 | with open(args.out_json, 'w') as f: 29 | f.write(json.dumps(eval_output)) 30 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/tokenize_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transition_amr_parser.amr import protected_tokenizer 3 | 4 | 5 | def parse_arguments(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--in-amr", type=str, help="AMR file to be tokenized", 8 | required=True) 9 | parser.add_argument("--out-amr", type=str, help="Output AMR file.", 10 | required=True) 11 | return parser.parse_args() 12 | 13 | 14 | def main(args): 15 | """ 16 | Add `# ::tok` line with newly tokenized sentence. 
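Raises an exception if the input already contains '# ::tok' lines. Each '# ::snt' line is copied through and immediately followed by a '# ::tok' line built with protected_tokenizer; all other lines are copied unchanged.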
17 | """ 18 | 19 | # read and write 20 | with open(args.in_amr) as f_in, open(args.out_amr, 'w') as f_out: 21 | for line in f_in: 22 | 23 | if line.startswith('# ::tok'): 24 | raise Exception("File already tokenized!") 25 | 26 | elif line.startswith('# ::snt'): 27 | f_out.write(line) 28 | 29 | # tokenize 30 | sentence = line.split('# ::snt')[-1].strip() 31 | tokens, _ = protected_tokenizer(sentence) 32 | tokens_str = ' '.join(tokens) 33 | f_out.write(f'# ::tok {tokens_str}\n') 34 | 35 | else: 36 | f_out.write(line) 37 | 38 | 39 | if __name__ == '__main__': 40 | main(parse_arguments()) 41 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/transformer_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Originally from: 3 | https://github.com/pytorch/examples/blob/13acec6d7c78dacd5e1fe9b0b4a325e1d39abc15/word_language_model/model.py 4 | """ 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class PositionalEncoding(nn.Module): 13 | r"""Inject some information about the relative or absolute position of the tokens 14 | in the sequence. The positional encodings have the same dimension as 15 | the embeddings, so that the two can be summed. Here, we use sine and cosine 16 | functions of different frequencies. 17 | .. math:: 18 | \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) 19 | \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) 20 | \text{where pos is the word position and i is the embed idx) 21 | Args: 22 | d_model: the embed dim (required). 23 | dropout: the dropout value (default=0.1). 24 | max_len: the max. length of the incoming sequence (default=5000). 25 | Examples: 26 | >>> pos_encoder = PositionalEncoding(d_model) 27 | """ 28 | 29 | def __init__(self, d_model, dropout=0.1, max_len=5000): 30 | super().__init__() 31 | self.dropout = nn.Dropout(p=dropout) 32 | 33 | pe = torch.zeros(max_len, d_model) 34 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 35 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 36 | pe[:, 0::2] = torch.sin(position * div_term) 37 | pe[:, 1::2] = torch.cos(position * div_term) 38 | pe = pe.unsqueeze(0).transpose(0, 1) 39 | self.register_buffer('pe', pe) 40 | 41 | def forward(self, x): 42 | r"""Inputs of forward function 43 | Args: 44 | x: the sequence fed to the positional encoder model (required). 
45 | Shape: 46 | x: [sequence length, batch size, embed dim] 47 | output: [sequence length, batch size, embed dim] 48 | Examples: 49 | >>> output = pos_encoder(x) 50 | """ 51 | 52 | x = x + self.pe[:x.size(0), :] 53 | return self.dropout(x) 54 | 55 | 56 | class TransformerModel(nn.Module): 57 | 58 | def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.5): 59 | super().__init__() 60 | try: 61 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 62 | except: 63 | raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') 64 | self.model_type = 'Transformer' 65 | self.src_mask = None 66 | self.pos_encoder = PositionalEncoding(ninp, dropout) 67 | encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) 68 | self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) 69 | self.ninp = ninp 70 | 71 | def _generate_square_subsequent_mask(self, sz): 72 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 73 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) 74 | return mask 75 | 76 | def forward(self, src, src2=None, has_mask=True): 77 | if has_mask: 78 | device = src.device 79 | if self.src_mask is None or self.src_mask.size(0) != len(src): 80 | mask = self._generate_square_subsequent_mask(len(src)).to(device) 81 | self.src_mask = mask 82 | else: 83 | self.src_mask = None 84 | 85 | if src2 is None: 86 | src2 = self.pos_encoder(src * math.sqrt(self.ninp)) 87 | output = self.transformer_encoder(src, self.src_mask) 88 | return output 89 | 90 | 91 | class BiTransformer(nn.Module): 92 | def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.5): 93 | super().__init__() 94 | 95 | self.fwd_enc = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 96 | self.bwd_enc = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 97 | 98 | def forward(self, src): 99 | assert len(src.shape) == 3 100 | 101 | src = self.fwd_enc.pos_encoder(src * math.sqrt(self.fwd_enc.ninp)) 102 | 103 | # FORWARD 104 | fwd_out = self.fwd_enc(src, src) 105 | 106 | # BACKWARD 107 | bwd_src = torch.flip(src, [1]) 108 | bwd_out = self.bwd_enc(bwd_src, bwd_src) 109 | bwd_out = torch.flip(bwd_out, [1]) 110 | 111 | output = torch.cat([fwd_out, bwd_out], -1) 112 | 113 | return output 114 | 115 | 116 | class TransformerLM(nn.Module): 117 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 118 | super().__init__() 119 | self.ninp = ninp 120 | self.decoder = nn.Linear(ninp, ntoken) 121 | self.transformer_encoder = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 122 | 123 | self.init_weights() 124 | 125 | def init_weights(self): 126 | initrange = 0.1 127 | nn.init.uniform_(self.encoder.weight, -initrange, initrange) 128 | nn.init.zeros_(self.decoder.weight) 129 | nn.init.uniform_(self.decoder.weight, -initrange, initrange) 130 | 131 | def forward(self, src, has_mask=True): 132 | src = self.encoder(src) 133 | output = self.transformer_encoder(src) 134 | output = self.decoder(output) 135 | return F.log_softmax(output, dim=-1) 136 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/view_manual_alignments.py: -------------------------------------------------------------------------------- 1 | """ 2 | not found manual_dev 3 | - overlap 150 4 | - notfound 0 5 | not found manual_test 6 | - overlap 188 7 | - notfound 12 8 | - edinburgh_1003.8 9 | - edinburgh_1003.9 10 | - edinburgh_1003.3 11 | - edinburgh_1003.4 12 | - edinburgh_1003.7 13 | - 
edinburgh_1003.10 14 | - edinburgh_1003.1 15 | - edinburgh_1003.2 16 | - edinburgh_1003.5 17 | - NATHANS_EXAMPLE 18 | - edinburgh_1003.6 19 | - AUSTINS_EXAMPLE 20 | """ 21 | 22 | 23 | import argparse 24 | import collections 25 | import json 26 | 27 | from tqdm import tqdm 28 | 29 | from transition_amr_parser.io import read_amr2 30 | 31 | 32 | MY_GLOBALS = {} 33 | MY_GLOBALS['found'] = set() 34 | 35 | 36 | def read_json(filename): 37 | with open(filename) as f: 38 | return json.loads(f.read()) 39 | 40 | 41 | def get_keys(corpus): 42 | if isinstance(corpus, (tuple, list)): 43 | d = {} 44 | deleted = set() 45 | for amr in corpus: 46 | if amr.id in deleted: 47 | continue 48 | if amr.id in d: 49 | del d[amr.id] 50 | print(f'deleted {amr.id}') 51 | continue 52 | d[amr.id] = amr 53 | return get_keys(d) 54 | return corpus.keys() 55 | 56 | 57 | def print_overlap(datasets, name_a, name_b): 58 | keys_a = set(get_keys(datasets[name_a])) 59 | keys_b = set(get_keys(datasets[name_b])) 60 | overlap = set.intersection(keys_a, keys_b) 61 | print(f'overlap\n- {name_a} = {len(keys_a)}\n- {name_b} = {len(keys_b)}\n- overlap = {len(overlap)}') 62 | 63 | # Update found. 64 | MY_GLOBALS['found'] = set.union(MY_GLOBALS['found'], overlap) 65 | 66 | 67 | def check_overlap_austin_and_manual(datasets): 68 | print_overlap(datasets, 'austin', 'manual_dev') 69 | print_overlap(datasets, 'austin', 'manual_test') 70 | 71 | 72 | def check_overlap_prince_and_manual(datasets): 73 | print_overlap(datasets, 'prince_amr', 'manual_dev') 74 | print_overlap(datasets, 'prince_amr', 'manual_test') 75 | 76 | 77 | def check_overlap_amr3_and_manual(datasets): 78 | print_overlap(datasets, 'amr3_train', 'manual_dev') 79 | print_overlap(datasets, 'amr3_train', 'manual_test') 80 | 81 | print_overlap(datasets, 'amr3_dev', 'manual_dev') 82 | print_overlap(datasets, 'amr3_dev', 'manual_test') 83 | 84 | print_overlap(datasets, 'amr3_test', 'manual_dev') 85 | print_overlap(datasets, 'amr3_test', 'manual_test') 86 | 87 | 88 | def check_notfound(datasets): 89 | 90 | for name in ['manual_dev', 'manual_test']: 91 | print(f'not found {name}') 92 | keys = set(datasets[name].keys()) 93 | overlap = set.intersection(MY_GLOBALS['found'], keys) 94 | notfound = {k for k in keys if k not in overlap} 95 | print(f'- overlap {len(overlap)}') 96 | print(f'- notfound {len(notfound)}') 97 | for k in notfound: 98 | print(f'- {k}') 99 | 100 | 101 | def main(): 102 | paths = {} 103 | 104 | # This has some useful information such as node names, but it is not clear 105 | # which are manual alignments. 106 | paths['austin'] = 'ldc+little_prince.subgraph_alignments.json' 107 | 108 | # This does not have node names, but does have AMR ids for manually aligned AMR. 109 | paths['manual_dev'] = "leamr/data-release/alignments/leamr_dev.subgraph_alignments.gold.json" 110 | paths['manual_test'] = "leamr/data-release/alignments/leamr_test.subgraph_alignments.gold.json" 111 | 112 | # Path to little prince data. 113 | paths['prince_amr'] = 'amr-bank-struct-v1.6.dummy_align.txt' 114 | 115 | # Path to amr3 data. 
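# The *.dummy_align.txt files are assumed to be produced by dummy_align.py, which attaches a placeholder alignment (token 0) to every node.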
116 | paths['amr3_train'] = 'DATA/AMR3.0/corpora/train.dummy_align.txt' 117 | paths['amr3_dev'] = 'DATA/AMR3.0/corpora/dev.dummy_align.txt' 118 | paths['amr3_test'] = 'DATA/AMR3.0/corpora/test.dummy_align.txt' 119 | 120 | for k, v in paths.items(): 121 | print(k, v) 122 | 123 | datasets = {} 124 | datasets = {k: read_amr2(v, ibm_format=True, tokenize=False) if 'amr' in k else read_json(v) for k, v in paths.items()} 125 | 126 | check_overlap_austin_and_manual(datasets) 127 | check_overlap_prince_and_manual(datasets) 128 | check_overlap_amr3_and_manual(datasets) 129 | check_notfound(datasets) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/vocab.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | from ipdb import set_trace 4 | from ibm_neural_aligner.vocab_definitions import ( 5 | PADDING_IDX, PADDING_TOK, BOS_IDX, BOS_TOK, EOS_IDX, EOS_TOK, special_tokens 6 | ) 7 | from transition_amr_parser.amr import protected_tokenizer 8 | from transition_amr_parser.io import read_amr 9 | 10 | 11 | def get_tokens(path, jamr=False, tokenize=False): 12 | local_tokens = set() 13 | local_graph_tokens = set() 14 | 15 | for amr in read_amr(path, jamr=jamr): 16 | 17 | if tokenize: 18 | assert amr.sentence 19 | tokens, _ = protected_tokenizer(amr.sentence) 20 | 21 | else: 22 | assert amr.tokens, \ 23 | f"Unless --tokenize used, {path} must contain a # ::tok field" 24 | tokens = amr.tokens 25 | 26 | # surface tokens 27 | local_tokens.update(tokens) 28 | # graph tokens 29 | for _, label, _ in amr.edges: 30 | local_graph_tokens.add(label) 31 | local_graph_tokens.update(amr.nodes.values()) 32 | 33 | return local_tokens, local_graph_tokens 34 | 35 | 36 | def main(args): 37 | 38 | summary = collections.defaultdict(list) 39 | 40 | # collect information for all AMR 41 | tokens = set() 42 | graph_tokens = set() 43 | for amr_file in args.in_amrs: 44 | 45 | print('reading {}\n'.format(amr_file)) 46 | txt_toks, amr_toks = get_tokens( 47 | amr_file, jamr=False, tokenize=args.tokenize 48 | ) 49 | tokens = set.union(tokens, txt_toks) 50 | graph_tokens = set.union(graph_tokens, amr_toks) 51 | 52 | o = {} 53 | o['txt'] = len(txt_toks) 54 | o['amr'] = len(amr_toks) 55 | o['success'] = True 56 | 57 | summary[amr_file].append(o) 58 | print(o) 59 | 60 | # graph_tokens.add('') 61 | graph_tokens.add('(') 62 | graph_tokens.add(')') 63 | for tok in special_tokens: 64 | if tok in tokens: 65 | tokens.remove(tok) 66 | if tok in graph_tokens: 67 | graph_tokens.remove(tok) 68 | 69 | # Add special symbols at the beginning 70 | # surface 71 | tokens = special_tokens + sorted(tokens) 72 | # graph 73 | # useful for linearized parse 74 | graph_tokens = special_tokens + sorted(graph_tokens) 75 | 76 | # print summary 77 | print('summary\n-------') 78 | 79 | for k, v in summary.items(): 80 | print(k) 81 | for vv in v: 82 | print(vv) 83 | print('') 84 | 85 | print('writing...') 86 | 87 | # write files 88 | print('found {} text tokens'.format(len(tokens))) 89 | with open(args.out_text, 'w') as f: 90 | for tok in tokens: 91 | f.write(tok + '\n') 92 | print('found {} amr tokens'.format(len(graph_tokens))) 93 | with open(args.out_amr, 'w') as f: 94 | for tok in graph_tokens: 95 | f.write(tok + '\n') 96 | 97 | 98 | if __name__ == '__main__': 99 | import argparse 100 | 101 | # Argument handling 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument( 
104 | "--in-amrs", help="Read AMR files to determine vocabulary.", 105 | nargs='+', required=True) 106 | parser.add_argument( 107 | "--out-text", help="Output text vocab.", 108 | required=True) 109 | parser.add_argument( 110 | "--out-amr", help="Output amr vocab.", 111 | required=True) 112 | parser.add_argument( 113 | "--tokenize", help="Use JAMR-like tokenization instad of # ::tok.", 114 | action='store_true') 115 | args = parser.parse_args() 116 | 117 | print(json.dumps(args.__dict__)) 118 | 119 | main(args) 120 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/vocab_definitions.py: -------------------------------------------------------------------------------- 1 | class MaskInfo: 2 | unchanged = 0 3 | masked = 1 4 | unchanged_and_predict = 2 5 | 6 | PADDING_IDX = 0 7 | PADDING_TOK = '' 8 | 9 | BOS_IDX = 1 10 | BOS_TOK = '' 11 | 12 | EOS_IDX = 2 13 | EOS_TOK = '' 14 | 15 | special_tokens = [PADDING_TOK, BOS_TOK, EOS_TOK] 16 | 17 | assert special_tokens.index(PADDING_TOK) == PADDING_IDX 18 | assert special_tokens.index(BOS_TOK) == BOS_IDX 19 | assert special_tokens.index(EOS_TOK) == EOS_IDX 20 | -------------------------------------------------------------------------------- /src/transition_amr_parser/__init__.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import warnings 3 | # check for installation of torch-scatter 4 | try: 5 | import torch_scatter 6 | except: 7 | warnings.warn("torch-scatter is either not installed or not properly installed; please check for the appropriate version", UserWarning) 8 | raise Exception("please review README.d instructions on installing the appropriate version of torch-scatter") 9 | # cmd = ["pip", "install", "torch-scatter", "-f", "https://data.pyg.org/whl/torch-1.13.1+cu117.html"] 10 | # print("try downloading torch-scatter") 11 | # subprocess.call(cmd) 12 | 13 | # set this to true to start the debugger on any exception 14 | DEBUG_MODE = False 15 | if DEBUG_MODE: 16 | import sys 17 | import ipdb 18 | import traceback 19 | 20 | def debughook(etype, value, tb): 21 | traceback.print_exception(etype, value, tb) 22 | print() 23 | # post-mortem debugger 24 | ipdb.pm() 25 | sys.excepthook = debughook 26 | -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/transition_amr_parser/action_pointer/__init__.py -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/amr_parser.py: -------------------------------------------------------------------------------- 1 | # Standalone AMR parser 2 | 3 | import os 4 | import json 5 | import torch 6 | from transition_amr_parser.model import AMRModel 7 | import transition_amr_parser.utils as utils 8 | from fairseq.models.roberta import RobertaModel 9 | from transition_amr_parser.roberta_utils import extract_features_aligned_to_words 10 | 11 | 12 | class AMRParser(): 13 | 14 | def __init__(self, model_path, roberta_cache_path=None, oracle_stats_path=None, config_path=None, model_use_gpu=False, roberta_use_gpu=False, verbose=False, logger=None): 15 | if not oracle_stats_path: 16 | model_folder = os.path.dirname(model_path) 17 | oracle_stats_path = os.path.join(model_folder, 
"train.rules.json") 18 | assert os.path.isfile(oracle_stats_path), \ 19 | f'Expected train.rules.json in {model_folder}' 20 | if not config_path: 21 | model_folder = os.path.dirname(model_path) 22 | config_path = os.path.join(model_folder, "config.json") 23 | assert os.path.isfile(config_path), \ 24 | f'Expected config.json in {model_folder}' 25 | self.model = self.load_model(model_path, oracle_stats_path, config_path, model_use_gpu) 26 | self.roberta = self.load_roberta(roberta_use_gpu, roberta_cache_path) 27 | self.logger = logger 28 | 29 | def load_roberta(self, roberta_use_gpu, roberta_cache_path=None): 30 | 31 | if not roberta_cache_path: 32 | # Load the Roberta Model from torch hub 33 | roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') 34 | else: 35 | roberta = RobertaModel.from_pretrained(roberta_cache_path, checkpoint_file='model.pt') 36 | roberta.eval() 37 | if roberta_use_gpu: 38 | roberta.cuda() 39 | return roberta 40 | 41 | def load_model(self, model_path, oracle_stats_path, config_path, model_use_gpu): 42 | 43 | oracle_stats = json.load(open(oracle_stats_path)) 44 | config = json.load(open(config_path)) 45 | model = AMRModel( 46 | oracle_stats=oracle_stats, 47 | embedding_dim=config["embedding_dim"], 48 | action_embedding_dim=config["action_embedding_dim"], 49 | char_embedding_dim=config["char_embedding_dim"], 50 | hidden_dim=config["hidden_dim"], 51 | char_hidden_dim=config["char_hidden_dim"], 52 | rnn_layers=config["rnn_layers"], 53 | dropout_ratio=config["dropout_ratio"], 54 | pretrained_dim=config["pretrained_dim"], 55 | use_bert=config["use_bert"], 56 | use_gpu=model_use_gpu, 57 | use_chars=config["use_chars"], 58 | use_attention=config["use_attention"], 59 | use_function_words=config["use_function_words"], 60 | use_function_words_rels=config["use_function_words_rels"], 61 | parse_unaligned=config["parse_unaligned"], 62 | weight_inputs=config["weight_inputs"], 63 | attend_inputs=config["attend_inputs"] 64 | ) 65 | 66 | model.load_state_dict(torch.load(model_path)) 67 | model.eval() 68 | return model 69 | 70 | def get_embeddings(self, tokens): 71 | features = extract_features_aligned_to_words(self.roberta, tokens=tokens, use_all_layers=True, return_all_hiddens=True) 72 | embeddings = [] 73 | for tok in features: 74 | if str(tok) not in ['', '']: 75 | embeddings.append(tok.vector) 76 | embeddings = torch.stack(embeddings).detach().cpu().numpy() 77 | return embeddings 78 | 79 | def parse_sentence(self, tokens): 80 | # The model expects token at the end of the input sentence 81 | if tokens[-1] != "": 82 | tokens.append("") 83 | sent_rep = utils.vectorize_words(self.model, tokens, training=False, gpu=self.model.use_gpu) 84 | bert_emb = self.get_embeddings(tokens) 85 | amr = self.model.parse_sentence(tokens, sent_rep, bert_emb) 86 | return amr 87 | -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/roberta_utils.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import torch 3 | from spacy.tokens import Doc 4 | import copy 5 | from fairseq.models.roberta.alignment_utils import spacy_nlp 6 | from fairseq.data.data_utils import collate_tokens 7 | 8 | 9 | def get_tokens(roberta, word): 10 | return roberta.task.source_dictionary.encode_line(roberta.bpe.encode(word), append_eos=False, add_if_not_exist=False) 11 | 12 | 13 | def get_alignments_and_tokens(roberta, words): 14 | bpe_tokens = [] 15 | alignment_position = 1 16 | 
alignments = [] 17 |     first_word_tokens = get_tokens(roberta, words[0]) 18 |     bpe_tokens.extend(first_word_tokens) 19 |     alignments.append([(alignment_position + i) for i in range(0, len(first_word_tokens))]) 20 |     alignment_position = alignment_position + len(first_word_tokens) 21 | 22 |     for word in words[1:]: 23 |         tokens = get_tokens(roberta, " " + word) 24 |         bpe_tokens.extend(tokens) 25 |         alignments.append([(alignment_position + i) for i in range(0, len(tokens))]) 26 |         alignment_position = alignment_position + len(tokens) 27 | 28 |     final_bpe_tokens = [roberta.task.source_dictionary.index('<s>')] + bpe_tokens + [roberta.task.source_dictionary.index('</s>')] 29 |     return alignments, torch.LongTensor(final_bpe_tokens) 30 | 31 | 32 | def align_features_to_words(roberta, features, alignment): 33 |     """ 34 |     Align given features to words. 35 | 36 |     Args: 37 |         roberta (RobertaHubInterface): RoBERTa instance 38 |         features (torch.Tensor): features to align of shape `(T_bpe x C)` 39 |         alignment: alignment between BPE tokens and words returned by 40 |             func:`align_bpe_to_words`. 41 |     """ 42 |     assert features.dim() == 2 43 | 44 |     bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices) 45 |     assert bpe_counts[0] == 0  # <s> shouldn't be aligned 46 |     denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))]) 47 |     weighted_features = features / denom.unsqueeze(-1) 48 |     output = [weighted_features[0]] 49 |     largest_j = -1 50 |     for bpe_indices in alignment: 51 |         output.append(weighted_features[bpe_indices].sum(dim=0)) 52 |         largest_j = max(largest_j, *bpe_indices) 53 |     for j in range(largest_j + 1, len(features)): 54 |         output.append(weighted_features[j]) 55 |     output = torch.stack(output) 56 |     return output 57 | 58 | 59 | def extract_features_aligned_to_words_batched(model, sentences: list, use_all_layers: bool = True, return_all_hiddens: bool = False) -> torch.Tensor: 60 |     nlp = spacy_nlp() 61 |     bpe_toks = [] 62 |     alignments = [] 63 |     spacy_tokens = [] 64 |     for sentence in sentences: 65 |         toks = sentence.split() 66 |         alignment, bpe_tok = get_alignments_and_tokens(model, toks) 67 |         bpe_toks.append(bpe_tok) 68 |         alignments.append(alignment) 69 |         spacy_tokens.append(toks) 70 | 71 |     bpe_toks_collated = collate_tokens(bpe_toks, pad_idx=1) 72 | 73 |     features = model.extract_features(bpe_toks_collated, return_all_hiddens=return_all_hiddens) 74 |     final_features = sum(features[1:])/(len(features)-1) 75 | 76 |     results = [] 77 |     for bpe_tok, final_feature, alignment, toks in zip(bpe_toks, final_features, alignments, spacy_tokens): 78 |         aligned_feats = align_features_to_words(model, final_feature[0:bpe_tok.shape[0]], alignment) 79 |         doc = Doc( 80 |             nlp.vocab, 81 |             words=['<s>'] + [x for x in toks] + ['</s>'], 82 |         ) 83 |         # bind the current features via a default argument to avoid late binding in the loop 84 |         doc.user_token_hooks['vector'] = lambda token, feats=aligned_feats: feats[token.i] 85 |         results.append(copy.copy(doc)) 86 | 87 |     return results 88 | 89 | 90 | def extract_features_aligned_to_words(model, tokens: list, use_all_layers: bool = True, return_all_hiddens: bool = False) -> torch.Tensor: 91 |     nlp = spacy_nlp() 92 |     alignment, bpe_tok = get_alignments_and_tokens(model, tokens) 93 |     features = model.extract_features(bpe_tok, return_all_hiddens=return_all_hiddens) 94 |     final_features = sum(features[1:])/(len(features)-1) 95 |     final_features = final_features.squeeze(0) 96 |     aligned_feats = align_features_to_words(model, final_features, alignment) 97 |     doc = Doc( 98 |         nlp.vocab, 99 |         words=['<s>'] + [x for x in tokens] + ['</s>'] 100 |     ) 101 |     doc.user_token_hooks['vector'] = lambda token: 
aligned_feats[token.i] 102 |     return doc 103 | -------------------------------------------------------------------------------- /src/transition_amr_parser/add_id_to_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def amr_add_id(file_path,file_path_id): 4 | 5 |     with open(file_path_id) as fid1: 6 |         ids_list = [] 7 |         for line in fid1.readlines(): 8 |             if '# ::id ' in line: 9 |                 ids_list.append(line) 10 | 11 | 12 |     with open(file_path) as fid2: 13 |         raw_amr = [] 14 |         ids_idx = 0 15 |         for line in fid2.readlines(): 16 |             if '::tok' in line : 17 |                 raw_amr.append(ids_list[ids_idx]) 18 |                 ids_idx+=1 19 |             raw_amr.append(line) 20 |         assert len(ids_list)==ids_idx 21 | 22 |     with open(file_path.rstrip('.txt')+'_id-added.txt','w') as fid3: 23 |         for line in raw_amr: 24 |             fid3.write(line) 25 | 26 | 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 |     parser = argparse.ArgumentParser( 32 |         description='Add # ::id fields to an AMR file, taking the ids from a second AMR file aligned to the same sentences' 33 |     ) 34 |     # Single input parameters 35 |     parser.add_argument( 36 |         "--in-aligned-amr", 37 |         help="In file containing AMR in penman format AND isi alignments ", 38 |         type=str, 39 |         default='DATA/AMR3.0/aligned/cofill_isi/train.txt' 40 |     ) 41 | 42 |     parser.add_argument( 43 |         "--amr-with-id", 44 |         help="add id to --in-aligned-amr using the file given", 45 |         type=str, 46 |         default='DATA/AMR3.0/aligned/cofill/train.txt' 47 |     ) 48 |     args = parser.parse_args() 49 | 50 |     amr_add_id(args.in_aligned_amr,args.amr_with_id) 51 | -------------------------------------------------------------------------------- /src/transition_amr_parser/add_sentence_amrs_to_file.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | 4 | 5 | 6 | def main(args): 7 | 8 |     tqdm_amrs_str = read_blocks(args.in_amr) 9 | 10 | 11 |     with open(args.out_amr, 'a') as fid: 12 |         for idx, penman_str in enumerate(tqdm_amrs_str): 13 |             fid.write(penman_str+'\n') 14 | 15 | 16 | 17 | 18 | if __name__ == '__main__': 19 |     parser = ArgumentParser() 20 |     parser.add_argument( 21 |         "--in-amr", 22 |         help="In file containing AMR in penman format", 23 |         type=str 24 |     ) 25 |     parser.add_argument( 26 |         "--out-amr", 27 |         help="path to save amr", 28 |         type=str, 29 |     ) 30 |     args = parser.parse_args() 31 |     main(args) -------------------------------------------------------------------------------- /src/transition_amr_parser/clbar.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 International Business Machines 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # This file is standalone and intended to also be used separately from the 16 | # repository, hence the attached license above. 
17 | 18 | import shutil 19 | from collections import Counter 20 | from datetime import datetime 21 | # external module 22 | import numpy as np 23 | 24 | 25 | def red_background(string): 26 |     return "\033[101m%s\033[0m" % string 27 | 28 | 29 | def yellow_font(string): 30 |     return "\033[93m%s\033[0m" % string 31 | 32 | 33 | def green_font(string): 34 |     return "\033[92m%s\033[0m" % string 35 | 36 | 37 | def print_log(module, string): 38 |     """formats printing of log to stdout""" 39 |     timestamp = str(datetime.now()).split('.')[0] 40 |     print(f'{timestamp} [{module}] {string}') 41 | 42 | 43 | def clbar( 44 |     xy=None,  # list of (x, y) tuples or Counter 45 |     x=None, 46 |     y=None, 47 |     ylim=(None, None), 48 |     ncol=None,  # Max number of columns for display (default: terminal width) 49 |     # show only top and bottom values 50 |     topx=None, 51 |     botx=None, 52 |     topy=None, 53 |     boty=None, 54 |     # normalize to sum to 1 55 |     norm=False, 56 |     xfilter=None,  # f(x) returns True to keep this example in the display 57 |     yform=None  # Function receiving a single y value, returns a string 58 | ): 59 |     """Print data structure in command line""" 60 |     # Sanity checks 61 |     if x is None and y is None: 62 |         if isinstance(xy, np.ndarray): 63 |             labels = [f'{i}' for i in range(xy.shape[0])] 64 |             xy = list(zip(labels, list(xy))) 65 |         elif isinstance(xy, Counter): 66 |             xy = [(str(x), y) for x, y in xy.items()] 67 |         else: 68 |             assert isinstance(xy, list), "Expected list of tuples" 69 |             assert isinstance(xy[0], tuple), "Expected list of tuples" 70 |     else: 71 |         assert x is not None and y is not None 72 |         assert isinstance(x, list) 73 |         assert isinstance(y, list) or isinstance(y, np.ndarray) 74 |         assert len(x) == len(list(y)) 75 |         xy = list(zip(x, y)) 76 | 77 |     # normalize 78 |     if norm: 79 |         z = sum([x[1] for x in xy]) 80 |         xy = [(k, v / z) for k, v in xy] 81 |     # show only top x 82 |     if topx is not None: 83 |         xy = sorted(xy, key=lambda x: float(x[0]))[-topx:] 84 |     if botx is not None: 85 |         xy = sorted(xy, key=lambda x: float(x[0]))[:botx] 86 |     if boty is not None: 87 |         xy = sorted(xy, key=lambda x: x[1])[:boty] 88 |     if topy is not None: 89 |         xy = sorted(xy, key=lambda x: x[1])[-topy:] 90 |     # print list of tuples 91 |     # determine variables to fit data to command line 92 |     x_data, y_data = zip(*xy) 93 |     width = max([ 94 |         len(str(x)) if x is not None else len('None') for x in x_data 95 |     ]) 96 |     number_width = max([len(f'{y}') for y in y_data]) 97 |     # max and min values 98 |     if ylim[1] is not None: 99 |         max_y_data = ylim[1] 100 |     else: 101 |         max_y_data = max(y_data) 102 |     if ylim[0] is not None: 103 |         min_y_data = ylim[0] 104 |     else: 105 |         min_y_data = min(y_data) 106 |     # determine scaling factor from screen size 107 |     data_range = max_y_data - min_y_data 108 |     if ncol is None: 109 |         ncol, _ = shutil.get_terminal_size((80, 20)) 110 |     max_size = ncol - width - number_width - 3 111 |     scale = max_size / data_range 112 | 113 |     # plot 114 |     print() 115 |     blank = ' ' 116 |     if yform: 117 |         min_y_data_str = yform(min_y_data) 118 |         print(f'{blank:<{width}}{min_y_data_str}') 119 |     else: 120 |         print(f'{blank:<{width}}{min_y_data}') 121 |     for (x, y) in xy: 122 | 123 |         # Filter example by x 124 |         if xfilter and not xfilter(x): 125 |             continue 126 | 127 |         if y > max_y_data: 128 |             # cropped bars 129 |             num_col = int((ylim[1] - min_y_data) * scale) 130 |             if num_col == 0: 131 |                 bar = '' 132 |             else: 133 |                 half_width = (num_col // 2) 134 |                 if num_col % 2: 135 |                     bar = '\u25A0' * (half_width - 1) 136 |                     bar += '//' 137 |                     bar += '\u25A0' * (half_width - 1) 138 |                 else: 
139 | bar = '\u25A0' * half_width 140 | bar += '//' 141 | bar += '\u25A0' * (half_width - 1) 142 | else: 143 | bar = '\u25A0' * int((y - min_y_data) * scale) 144 | if x is None: 145 | x = 'None' 146 | if yform: 147 | y = yform(y) 148 | print(f'{x:<{width}} {bar} {y}') 149 | else: 150 | print(f'{x:<{width}} {bar} {y}') 151 | print() 152 | -------------------------------------------------------------------------------- /src/transition_amr_parser/force_overlap_actions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | arc_regex = re.compile(r'>[RL]A\((.*),(.*)\)') 6 | 7 | 8 | def decrement_pointers_to_future(action_lists, li, ai, ignored): 9 | 10 | pos = sum([len(alist) for alist in action_lists[:li]]) + ai + 1 11 | for (i,action_list) in enumerate(action_lists): 12 | for (j, action) in enumerate(action_list): 13 | if i > li or ( i==li and j>ai): 14 | if arc_regex.match(action): 15 | (idx, lbl) = arc_regex.match(action).groups() 16 | if int(idx)+ignored >= pos: 17 | idx = str(int(idx) - 1) 18 | action_lists[i][j] = action[:3]+"("+idx+","+lbl+")" 19 | 20 | def sanity_check(actions): 21 | for action in actions: 22 | if arc_regex.match(action): 23 | (idx, lbl) = arc_regex.match(action).groups() 24 | if arc_regex.match(actions[int(idx)]) or actions[int(idx)] in ['SHIFT','ROOT','CLOSE_SENTENCE']: 25 | import ipdb; ipdb.set_trace() 26 | print("*****bad pointer to from " + action + " to " + actions[int(idx)]) 27 | 28 | 29 | def force_overlap(actions, force_actions, start_idx): 30 | 31 | actions_per_token = [] 32 | this_token_actions = [] 33 | for action in actions: 34 | this_token_actions.append(action) 35 | if action in ['SHIFT','CLOSE_SENTENCE']: 36 | actions_per_token.append(this_token_actions) 37 | this_token_actions = [] 38 | 39 | start_action_index = sum([len(acts) for acts in actions_per_token[:start_idx]]) if start_idx else 0 40 | 41 | out_actions = "" 42 | overlap_actions = [] 43 | ignored = 0 44 | for ti in range(start_idx,len(actions_per_token)): 45 | useful_actions = [] 46 | for (ai,action) in enumerate(actions_per_token[ti]): 47 | if arc_regex.match(action): 48 | (idx, lbl) = arc_regex.match(action).groups() 49 | idx = str(int(idx) - start_action_index) 50 | if int(idx) >= 0: 51 | useful_actions.append(action[:3]+"("+idx+","+lbl+")") 52 | else: 53 | decrement_pointers_to_future(actions_per_token,ti,ai,ignored) 54 | ignored += 1 55 | else: 56 | useful_actions.append(action) 57 | overlap_actions.append(useful_actions) 58 | 59 | flat_actions = [] 60 | for actions in overlap_actions: 61 | flat_actions.extend(actions) 62 | sanity_check(flat_actions) 63 | 64 | out_force_actions = overlap_actions 65 | 66 | if force_actions is not None: 67 | out_force_actions.extend(force_actions[len(overlap_actions):]) 68 | 69 | #there can be a sanity check here 70 | return out_force_actions 71 | 72 | 73 | 74 | def force_overlap_all(all_windows, all_actions, all_force_actions, in_widx): 75 | 76 | all_out_force_actions = [] 77 | 78 | fidx = 0 79 | pidx = 0 80 | for (i, _) in enumerate(all_windows): 81 | if len(all_windows[i]) > in_widx: 82 | this_window = all_windows[i][in_widx] 83 | prev_window = all_windows[i][in_widx-1] 84 | actions = all_actions[pidx] 85 | force_actions = all_force_actions[fidx] 86 | 87 | start_idx = this_window[0] - prev_window[0] 88 | 89 | out_force_actions = force_overlap(actions, force_actions, start_idx) 90 | 91 | 
all_out_force_actions.append(out_force_actions) 92 | 93 | fidx += 1 94 | 95 | if len(all_windows[i]) >= in_widx: 96 | pidx += 1 97 | 98 | return all_out_force_actions 99 | 100 | def make_forced_overlap(in_pred, in_force, in_windows, in_widx, out_force): 101 | 102 | fpactions = open(in_pred) 103 | ffactions = open(in_force) 104 | fwindows = open(in_windows) 105 | 106 | all_windows = [eval(line.strip()) for line in fwindows] 107 | all_actions = [ line.strip().split() for line in fpactions ] 108 | all_force_actions = [ eval(line.strip()) for line in ffactions ] 109 | 110 | window_of_interest = in_widx 111 | 112 | ffactions.close() 113 | ffout = open(out_force, 'w') 114 | 115 | if window_of_interest == 0: 116 | return 117 | 118 | all_out_force_actions = force_overlap_all(all_windows, all_actions, all_force_actions, in_widx) 119 | 120 | for force_actions in all_out_force_actions: 121 | ffout.write(str(force_actions) + "\n") 122 | 123 | def main(args): 124 | make_forced_overlap(args.in_pred, args.in_force, args.in_windows, args.in_widx, args.out_force) 125 | 126 | if __name__ == '__main__': 127 | parser = ArgumentParser() 128 | parser.add_argument( 129 | "--in-pred", 130 | help="input pred actions to be forced in next window", 131 | type=str 132 | ) 133 | parser.add_argument( 134 | "--in-force", 135 | help="input force actions to be updated", 136 | type=str 137 | ) 138 | parser.add_argument( 139 | "--in-windows", 140 | help="info about sliding window", 141 | type=str, 142 | ) 143 | parser.add_argument( 144 | "--out-force", 145 | help="output force actions", 146 | type=str, 147 | ) 148 | parser.add_argument( 149 | "--in-widx", 150 | help="index of the window to be updated", 151 | default=1, 152 | type=int, 153 | ) 154 | 155 | args = parser.parse_args() 156 | main(args) 157 | 158 | 159 | -------------------------------------------------------------------------------- /tests/align_mode.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | set -o nounset 5 | # This will ensure that early exit shows tests having failed 6 | function check_tests_passed { 7 |     if [ "$TESTS_PASSED" == "Y" ];then 8 |         printf "[\033[92m OK \033[0m] $0\n" 9 |     else 10 |         printf "[\033[91m FAILED \033[0m] $0\n" 11 |     fi 12 | } 13 | trap check_tests_passed EXIT 14 | TESTS_PASSED="N" 15 | 16 | # python tests/align_mode.py DATA/wiki25/aligned/cofill_isi/train.txt 17 | python tests/align_mode.py DATA/AMR2.0/aligned/cofill/train.txt 18 | 19 | # if we reach here, we are good 20 | TESTS_PASSED="Y" 21 | -------------------------------------------------------------------------------- /tests/all.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | # This will ensure that early exit shows tests having failed 3 | function check_tests_passed { 4 |     if [ "$TESTS_PASSED" == "Y" ];then 5 |         printf "[\033[92m OK \033[0m] $0\n" 6 |     else 7 |         printf "[\033[91m FAILED \033[0m] $0\n" 8 |     fi 9 | } 10 | trap check_tests_passed EXIT 11 | TESTS_PASSED="N" 12 | 13 | # all conventional tests 14 | bash tests/correctly_installed.sh 15 | # small test with 25 sentences 16 | bash tests/minimal_test.sh 17 | # standalone parser 18 | bash tests/standalone.sh 19 | # oracle for wiki25 imperfect due to alignments 20 | bash tests/oracles/amr_o10.sh DATA/wiki25/aligned/cofill_isi/train.txt 21 | # if we reach here, we are good 22 | TESTS_PASSED="Y" 23 | -------------------------------------------------------------------------------- /tests/amr_io.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | # Seems not to be reading 7 | 8 | # passes IO test 9 | python tests/amr_io.py --no-isi \ 10 |     --in-amr DATA/AMR2.0/aligned/cofill_isi/dev.txt \ 11 |     --ignore-errors 'amr2-dev' \ 12 |     # --out-amr tmp.amr 13 | 14 | # passes IO test 15 | python tests/amr_io.py --no-isi \ 16 |     --in-amr DATA/AMR2.0/aligned/cofill_isi/train.txt \ 17 |     --ignore-errors 'amr2-train' \ 18 |     # --out-amr tmp.amr 19 | 20 | # passes IO test 21 | python tests/amr_io.py --no-isi \ 22 |     --in-amr DATA/AMR3.0/aligned/cofill_isi/dev.txt \ 23 |     --ignore-errors 'amr3-dev' \ 24 |     # --out-amr tmp.amr 25 | 26 | # passes IO test 27 | python tests/amr_io.py --no-isi \ 28 |     --in-amr DATA/AMR3.0/aligned/cofill_isi/train.txt \ 29 |     --ignore-errors 'amr3-train' \ 30 |     # --out-amr tmp.amr 31 | 32 | printf "[\033[92m OK \033[0m] $0\n" 33 | -------------------------------------------------------------------------------- /tests/correctly_installed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import subprocess 3 | # from ipdb import set_trace 4 | 5 | 6 | def main(): 7 | 8 |     # Pytorch and CUDA 9 |     passed = True 10 |     print() 11 |     import torch 12 |     print(f'pytorch {torch.__version__}') 13 |     if torch.cuda.is_available(): 14 |         print(f'cuda {torch.version.cuda}') 15 |         # happens when CUDA misconfigured 16 |         assert torch.cuda.device_count(), "0 GPUs found" 17 |         try: 18 |             import apex 19 |             print("Apex installed") 20 |         except ImportError: 21 |             print("Apex not installed") 22 |         if torch.cuda.get_device_capability(0)[0] < 7: 23 |             print("GPU won't support --fp16") 24 | 25 |         # sanity check: try to use CUDA 26 |         import torch 27 |         torch.zeros((100, 100)).cuda() 28 | 29 |     else: 30 |         print("\033[93mNo CUDA available\033[0m") 31 | 32 |     try: 33 |         import smatch 34 |         print("smatch installed") 35 |     except ImportError as e: 36 
| print("\033[93msmatch not installed\033[0m") 37 | sucess = False 38 | 39 | try: 40 | import torch_scatter 41 | print("pytorch-scatter installed") 42 | except ImportError: 43 | print("\033[93mpytorch-scatter not installed\033[0m") 44 | passed = False 45 | 46 | # if torch.cuda.is_available(): 47 | # try: 48 | # import torch_scatter.scatter_cuda 49 | # print("torch_scatter.scatter_cuda works") 50 | # except ImportError: 51 | # print( 52 | # "\033[93mmaybe LD_LIBRARY_PATH unconfigured?, " 53 | # "import torch_scatter.scatter_cuda dies\033[0m" 54 | # ) 55 | # passed = False 56 | 57 | # fairseq 58 | try: 59 | import fairseq 60 | print("fairseq works") 61 | except ImportError: 62 | print("\033[93mfairseq installation failed\033[0m") 63 | passed = False 64 | 65 | # If we get here we passed 66 | if passed: 67 | print(f'[\033[92mOK\033[0m] correctly installed\n') 68 | else: 69 | print(f'[\033[91mFAILED\033[0m] some modules missing\n') 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /tests/correctly_installed.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | trap 'case $? in 7 | 139) echo -e "\033[91mCode segfaulted!\033[0m (probably .cuda())\n";; 8 | esac' EXIT 9 | 10 | python tests/correctly_installed.py 11 | -------------------------------------------------------------------------------- /tests/create_wiki25_mockup.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | set -o nounset 4 | 5 | # This simulates conventional corpora using the 25 wiki sentences 6 | # Create original data 7 | mkdir -p DATA/wiki25/corpora/ 8 | cp DATA/wiki25.jkaln DATA/wiki25/corpora/train.txt 9 | # remove JAMR meta-data, which we do not have in reality 10 | sed -i.bak '/^# ::tok.*/d' DATA/wiki25/corpora/train.txt 11 | sed -i.bak '/^# ::node.*/d' DATA/wiki25/corpora/train.txt 12 | sed -i.bak '/^# ::edge.*/d' DATA/wiki25/corpora/train.txt 13 | sed -i.bak '/^# ::root.*/d' DATA/wiki25/corpora/train.txt 14 | sed -i.bak '/^# ::alignments.*/d' DATA/wiki25/corpora/train.txt 15 | [ ! -f DATA/wiki25/corpora/dev.txt ] \ 16 | && ln -s ./train.txt DATA/wiki25/corpora/dev.txt 17 | [ ! -f DATA/wiki25/corpora/test.txt ] \ 18 | && ln -s ./train.txt DATA/wiki25/corpora/test.txt 19 | 20 | touch DATA/wiki25/corpora/.done 21 | 22 | # Simulate aligned data from wiki25 23 | mkdir -p DATA/wiki25/aligned/cofill_isi/ 24 | [ ! -f DATA/wiki25/aligned/cofill_isi/train.txt ] \ 25 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/train.txt 26 | echo "DATA/wiki25/aligned/cofill_isi/train.txt" 27 | [ ! -f DATA/wiki25/aligned/cofill_isi/dev.txt ] \ 28 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/dev.txt 29 | echo "DATA/wiki25/aligned/cofill_isi/dev.txt" 30 | [ ! 
-f DATA/wiki25/aligned/cofill_isi/test.txt ] \ 31 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/test.txt 32 | touch DATA/wiki25/aligned/cofill_isi/.done 33 | echo "DATA/wiki25/aligned/cofill_isi/test.txt" 34 | -------------------------------------------------------------------------------- /tests/download_little_prince.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | set -o nounset 4 | 5 | # [ -d DATA/LP/ ] && rm -R DATA/LP 6 | mkdir -p DATA/LP/corpora/ 7 | 8 | # Download data 9 | 10 | if [ ! -f DATA/LP/corpora/dev.txt ];then 11 | wget --no-check-certificate -O DATA/LP/corpora/dev.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-dev.txt 12 | sed '1,2d' DATA/LP/corpora/dev.txt.tmp > DATA/LP/corpora/dev.txt 13 | rm DATA/LP/corpora/dev.txt.tmp 14 | fi 15 | 16 | if [ ! -f DATA/LP/corpora/train.txt ];then 17 | wget --no-check-certificate -O DATA/LP/corpora/train.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-training.txt 18 | sed '1,2d' DATA/LP/corpora/train.txt.tmp > DATA/LP/corpora/train.txt 19 | rm DATA/LP/corpora/train.txt.tmp 20 | fi 21 | 22 | if [ ! -f DATA/LP/corpora/test.txt ];then 23 | wget --no-check-certificate -O DATA/LP/corpora/test.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-test.txt 24 | sed '1,2d' DATA/LP/corpora/test.txt.tmp > DATA/LP/corpora/test.txt 25 | rm DATA/LP/corpora/test.txt.tmp 26 | fi 27 | -------------------------------------------------------------------------------- /tests/fairseq_data_iterator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test data iterator with e.g. 3 | 4 | . set_environment.sh 5 | 6 | arguments=" 7 | DATA/amr/features/o3+Word100_RoBERTa-base/ 8 | --gen-subset train 9 | --batch-size 128 10 | " 11 | 12 | # do not use @profile 13 | #python tests/fairseq_data_iterator.py $arguments 14 | 15 | # Use @profile 16 | kernprof -l tests/fairseq_data_iterator.py $arguments 17 | python -m line_profiler fairseq_data_iterator.py.lprof 18 | """ 19 | 20 | from fairseq import tasks, utils 21 | from fairseq_ext import options 22 | from fairseq_ext.utils_import import import_user_module 23 | from fairseq.data import data_utils, FairseqDataset 24 | from tqdm import tqdm 25 | 26 | 27 | def get_batch_iterator( 28 | dataset, max_tokens=None, max_sentences=None, max_positions=None, 29 | ignore_invalid_inputs=False, required_batch_size_multiple=1, 30 | seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0, 31 | large_sent_first=False 32 | ): 33 | """ 34 | Get an iterator that yields batches of data from the given dataset. 35 | 36 | Args: 37 | dataset (~fairseq.data.FairseqDataset): dataset to batch 38 | max_tokens (int, optional): max number of tokens in each batch 39 | (default: None). 40 | max_sentences (int, optional): max number of sentences in each 41 | batch (default: None). 42 | max_positions (optional): max sentence length supported by the 43 | model (default: None). 44 | ignore_invalid_inputs (bool, optional): don't raise Exception for 45 | sentences that are too long (default: False). 46 | required_batch_size_multiple (int, optional): require batch size to 47 | be a multiple of N (default: 1). 48 | seed (int, optional): seed for random number generator for 49 | reproducibility (default: 1). 50 | num_shards (int, optional): shard the data iterator into N 51 | shards (default: 1). 52 | shard_id (int, optional): which shard of the data iterator to 53 | return (default: 0). 
54 | num_workers (int, optional): how many subprocesses to use for data 55 | loading. 0 means the data will be loaded in the main process 56 | (default: 0). 57 | epoch (int, optional): the epoch to start the iterator from 58 | (default: 0). 59 | 60 | Returns: 61 | ~fairseq.iterators.EpochBatchIterator: a batched iterator over the 62 | given dataset split 63 | """ 64 | assert isinstance(dataset, FairseqDataset) 65 | 66 | # get indices ordered by example size 67 | with data_utils.numpy_seed(seed): 68 | indices = dataset.ordered_indices() 69 | # invert order to start by bigger ones 70 | if large_sent_first: 71 | indices = indices[::-1] 72 | 73 | # filter examples that are too large 74 | if max_positions is not None: 75 | indices = data_utils.filter_by_size( 76 | indices, dataset.size, max_positions, 77 | raise_exception=(not ignore_invalid_inputs), 78 | ) 79 | 80 | # create mini-batches with given size constraints 81 | batch_sampler = data_utils.batch_by_size( 82 | indices, dataset.num_tokens, max_tokens=max_tokens, 83 | max_sentences=max_sentences, 84 | required_batch_size_multiple=required_batch_size_multiple, 85 | ) 86 | 87 | return batch_sampler 88 | 89 | 90 | def main(args): 91 | 92 | # Load dataset 93 | import_user_module(args) 94 | task = tasks.setup_task(args) 95 | task.load_dataset(args.gen_subset) 96 | dataset = task.dataset(args.gen_subset) 97 | 98 | # Get iterator over batches 99 | batch_index_iterator = get_batch_iterator( 100 | dataset=dataset, 101 | max_tokens=args.max_tokens, 102 | max_sentences=args.max_sentences, 103 | max_positions=None, 104 | ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, 105 | required_batch_size_multiple=args.required_batch_size_multiple, 106 | num_shards=args.num_shards, 107 | shard_id=args.shard_id, 108 | num_workers=args.num_workers, 109 | large_sent_first=False 110 | ) 111 | 112 | # collate batch of sentences into single tensor for all data 113 | for batch_ids in tqdm(batch_index_iterator): 114 | samples = [dataset[i] for i in batch_ids] 115 | dataset.collater(samples) 116 | 117 | 118 | def cli_main(): 119 | parser = options.get_generation_parser() 120 | args = options.parse_args_and_arch(parser) 121 | main(args) 122 | 123 | 124 | if __name__ == '__main__': 125 | cli_main() 126 | -------------------------------------------------------------------------------- /tests/fairseq_data_iterator.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | [ -z "$1" ] || [ -z "$2" ] && \ 5 | echo -e "\nbash $0 \n" && \ 6 | exit 1 7 | features_folder=$1 8 | embeddings_folder=$2 9 | set -o nounset 10 | 11 | # pyinstrument tests/fairseq_data_iterator.py \ 12 | python tests/fairseq_data_iterator.py \ 13 | $features_folder \ 14 | --emb-dir $embeddings_folder \ 15 | --user-dir fairseq_ext \ 16 | --task amr_action_pointer_bart \ 17 | --gen-subset train \ 18 | --max-tokens 3584 \ 19 | --path dummpy.pt 20 | -------------------------------------------------------------------------------- /tests/minimal_test.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 | ELMO_WEIGHTS="DATA/wiki25/aligned/ibm_neural_aligner/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 9 | 10 | if [ -f "$ELMO_WEIGHTS" ]; then 11 | echo "$ELMO_WEIGHTS exists." 
12 | 13 |         # Backup weights because expensive to download. 14 |         mv $ELMO_WEIGHTS ./tmp.elmo 15 | 16 |         # Delete previous runs if they exist. 17 |         rm -Rf DATA/wiki25/* 18 | 19 |         # Restore elmo weights. 20 |         mkdir -p DATA/wiki25/aligned/ibm_neural_aligner 21 |         mv tmp.elmo $ELMO_WEIGHTS 22 |     else 23 |         echo "$ELMO_WEIGHTS does not exist." 24 | 25 |         # Delete previous runs if they exist 26 |         rm -Rf DATA/wiki25/* 27 |     fi 28 | 29 |     # replace code above with less restrictive deletion 30 |     # rm -R -f DATA/wiki25/embeddings 31 |     # rm -R -f DATA/wiki25/features 32 |     # rm -R -f DATA/wiki25/oracles 33 |     # rm -R -f DATA/wiki25/models 34 | 35 |     # simulate completed corpora extraction and alignment 36 |     bash tests/create_wiki25_mockup.sh 37 | 38 | else 39 | 40 |     # custom config mini-test 41 |     config=$1 42 | fi 43 | set -o nounset 44 | 45 | bash run/run_experiment.sh $config 46 | 47 | # check if final result is there 48 | . $config 49 | 50 | if [ -f "${MODEL_FOLDER}seed42/beam10/valid_${DECODING_CHECKPOINT}.wiki.smatch" ];then 51 |     printf "\n[\033[92mOK\033[0m] $0\n" 52 | else 53 |     printf "\n[\033[91mFAILED\033[0m] $0\n" 54 | fi 55 | -------------------------------------------------------------------------------- /tests/minimal_test_lsf.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 |     # Standard mini-test with wiki25 6 |     config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 |     ELMO_WEIGHTS="DATA/wiki25/aligned/ibm_neural_aligner/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 9 | 10 |     if [ -f "$ELMO_WEIGHTS" ]; then 11 |         echo "$ELMO_WEIGHTS exists." 12 | 13 |         # Backup weights because expensive to download. 14 |         mv $ELMO_WEIGHTS ./tmp.elmo 15 | 16 |         # Delete previous runs if they exist. 17 |         rm -Rf DATA/wiki25/* 18 | 19 |         # Restore elmo weights. 20 |         mkdir -p DATA/wiki25/aligned/ibm_neural_aligner 21 |         mv tmp.elmo $ELMO_WEIGHTS 22 |     else 23 |         echo "$ELMO_WEIGHTS does not exist." 24 | 25 |         # Delete previous runs if they exist 26 |         rm -Rf DATA/wiki25/* 27 |     fi 28 | 29 |     # replace code above with less restrictive deletion 30 |     # rm -R -f DATA/wiki25/embeddings 31 |     # rm -R -f DATA/wiki25/features 32 |     # rm -R -f DATA/wiki25/oracles 33 |     # rm -R -f DATA/wiki25/models 34 | 35 |     # simulate completed corpora extraction and alignment 36 |     bash tests/create_wiki25_mockup.sh 37 | 38 | else 39 | 40 |     # custom config mini-test 41 |     config=$1 42 | fi 43 | set -o nounset 44 | 45 | # Run local test 46 | bash run/lsf/run_experiment.sh $config 47 | 48 | # check if final result is there 49 | . $config 50 | 51 | if [ -f "${MODEL_FOLDER}seed42/beam10/valid_${DECODING_CHECKPOINT}.wiki.smatch" ];then 52 |     printf "\n[\033[92mOK\033[0m] $0\n" 53 | else 54 |     printf "\n[\033[91mFAILED\033[0m] $0\n" 55 | fi 56 | -------------------------------------------------------------------------------- /tests/neural_aligner.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 |     # Standard mini-test with wiki25, sampling 6 |     config=configs/wiki25-structured-bart-base-neur-al.sh 7 | 8 | else 9 | 10 |     # custom config mini-test 11 |     config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # Clean-up 20 | [ -d "$ALIGNED_FOLDER" ] && rm -R "$ALIGNED_FOLDER" 21 | mkdir -p "$ALIGNED_FOLDER" 22 | 23 | # Train aligner 24 | bash run/train_aligner.sh $config 25 | 26 | # Align data. 
27 | mkdir -p $ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2 28 | python -u src/ibm_neural_aligner/main.py \ 29 |     --no-jamr \ 30 |     --cuda --allow-cpu \ 31 |     --vocab-text $ALIGN_VOCAB_TEXT \ 32 |     --vocab-amr $ALIGN_VOCAB_AMR \ 33 |     --write-single \ 34 |     --single-input ${AMR_TRAIN_FILE_WIKI}.no_wiki \ 35 |     --single-output $ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2/alignment.trn.out.pred \ 36 |     --cache-dir $ALIGNED_FOLDER \ 37 |     --verbose \ 38 |     --load $ALIGN_MODEL \ 39 |     --load-flags $ALIGN_MODEL_FLAGS \ 40 |     --batch-size 8 \ 41 |     --max-length 0 42 | 43 | # results should be written to the file checked below 44 | if [ -f "$ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2/alignment.trn.out.pred" ];then 45 |     printf "\n[\033[92mOK\033[0m] $0\n\n" 46 | else 47 |     printf "\n[\033[91mFAILED\033[0m] $0\n\n" 48 | fi 49 | -------------------------------------------------------------------------------- /tests/oracles/amr_o10.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | [ -z $1 ] && echo "$0 <gold_amr>" && exit 1 5 | gold_amr=$1 6 | set -o nounset 7 | 8 | oracle_folder=DATA/unit_test_$(basename $(dirname $gold_amr))/ 9 | mkdir -p $oracle_folder 10 | 11 | # get actions from oracle 12 | python src/transition_amr_parser/amr_machine.py \ 13 |     --in-aligned-amr $gold_amr \ 14 |     --out-machine-config $oracle_folder/machine_config.json \ 15 |     --out-actions $oracle_folder/train.actions \ 16 |     --out-tokens $oracle_folder/train.tokens \ 17 |     --use-copy 1 \ 18 |     --absolute-stack-positions \ 19 |     # --if-oracle-error stop 20 |     # --reduce-nodes all 21 | 22 | # play actions on state machine 23 | python src/transition_amr_parser/amr_machine.py \ 24 |     --in-machine-config $oracle_folder/machine_config.json \ 25 |     --in-tokens $oracle_folder/train.tokens \ 26 |     --in-actions $oracle_folder/train.actions \ 27 |     --out-amr $oracle_folder/train_oracle.amr 28 | 29 | # score 30 | echo "Computing Smatch (may take long for 1K or more sentences)" 31 | python scripts/smatch_aligner.py \ 32 |     --in-amr $oracle_folder/train_oracle.amr \ 33 |     --in-reference-amr $gold_amr \ 34 |     # --stop-if-different 35 | 36 | printf "[\033[92m OK \033[0m] $0\n" 37 | -------------------------------------------------------------------------------- /tests/oracles/amr_o10_doc.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | [ -z $1 ] && echo "$0 <gold_amr>" && exit 1 5 | gold_amr=$1 6 | set -o nounset 7 | 8 | oracle_folder=DATA/AMR3.0/oracles/o10_pinitos_doc_v0.4/ 9 | mkdir -p $oracle_folder 10 | NORM=no-merge 11 | #make_doc_amr 12 | python scripts/doc-amr/get_doc_amr_from_sen.py \ 13 |     --in-amr $gold_amr \ 14 |     --coref-fof DATA/AMR3.0/coref/train_coref.fof \ 15 |     --fof-path DATA/AMR3.0/amr_annotation_3.0/ \ 16 |     --norm $NORM \ 17 |     --out-amr $oracle_folder/train_${NORM}.docamr 18 | 19 | # get actions from oracle 20 | python src/transition_amr_parser/amr_machine.py \ 21 |     --in-aligned-amr $oracle_folder/train_${NORM}.docamr \ 22 |     --out-machine-config $oracle_folder/machine_config.json \ 23 |     --out-actions $oracle_folder/train.actions \ 24 |     --out-tokens $oracle_folder/train.tokens \ 25 |     --use-copy 1 \ 26 |     --absolute-stack-positions 27 |     # --reduce-nodes all 28 | 29 | # play actions on state machine 30 | python src/transition_amr_parser/amr_machine.py \ 31 |     --in-machine-config $oracle_folder/machine_config.json \ 32 |     --in-tokens $oracle_folder/train.tokens \ 33 |     --in-actions $oracle_folder/train.actions \ 34 |     --out-amr $oracle_folder/train_oracle_no-merge.amr 35 | 36 | sed 's@\~[0-9]\{1,\}@@g' $oracle_folder/train_oracle_no-merge.amr > $oracle_folder/train_oracle_no-merge.amr.no_isi 37 | sed 's@\~[0-9]\{1,\}@@g' $oracle_folder/train_no-merge.docamr > $oracle_folder/train_no-merge.docamr.no_isi 38 | # score 39 | echo "Computing Smatch (may take long for 1K or more sentences)" 40 | doc-smatch -r 1 --significant 4 --coref-subscore \ 41 |     -f $oracle_folder/train_${NORM}.docamr.no_isi \ 42 |     $oracle_folder/train_oracle_no-merge.amr.no_isi \ 43 | 44 | printf "[\033[92m OK \033[0m] $0\n" 45 | -------------------------------------------------------------------------------- /tests/smatch.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 7 | # :mod 277703234 in dev[97] 8 | python scripts/smatch_aligner.py \ 9 |     --in-amr DATA/AMR2.0/corpora/dev.txt \ 10 |     --in-reference-amr DATA/AMR2.0/corpora/dev.txt \ 11 |     # --stop-if-different 12 | 13 | exit 14 | 15 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 16 | # bolt12_10511_2844.2 ignores :mod "A" 17 | # bolt12_12120_6501.3 ignores b2 :mod 106 18 | # bolt12_12120_6501.4 ignores b :mod 920 19 | # bolt12_12120_6501.5 ignores b :mod 17, b :mod 14 20 | # ... (stopped counting) 21 | python scripts/smatch_aligner.py \ 22 |     --in-amr DATA/AMR2.0/corpora/train.txt \ 23 |     --in-reference-amr DATA/AMR2.0/corpora/train.txt \ 24 |     --stop-if-different 25 | 26 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 27 | # :mod 277703234 in dev[97] 28 | python scripts/smatch_aligner.py \ 29 |     --in-amr DATA/AMR2.0/corpora/dev.txt \ 30 |     --in-reference-amr DATA/AMR2.0/corpora/dev.txt \ 31 |     --stop-if-different 32 | 33 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 34 | # bolt12_10511_2844.2 ignores :mod "A" 35 | # bolt12_12120_6501.3 ignores b2 :mod 106 36 | # bolt12_12120_6501.4 ignores b :mod 920 37 | # bolt12_12120_6501.5 ignores b :mod 17, b :mod 14 38 | # ... 
(stopped counting) 39 | python scripts/smatch_aligner.py \ 40 | --in-amr DATA/AMR2.0/corpora/train.txt \ 41 | --in-reference-amr DATA/AMR2.0/corpora/train.txt \ 42 | --stop-if-different 43 | -------------------------------------------------------------------------------- /tests/standalone-doc.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/both_doc+sen_trainsliding_ws400x100.sh 7 | 8 | else 9 | 10 | # custom config mini-test 11 | config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # use first seed 20 | seed=$(echo $SEEDS | sed 's@ .*@@g') 21 | # rest, from config 22 | sset=test 23 | 24 | reference_amr=$ORACLE_FOLDER/${sset}_docAMR.docamr 25 | # wiki=$LINKER_CACHE_PATH/${sset}.wiki 26 | checkpoint=${MODEL_FOLDER}seed${seed}/$DECODING_CHECKPOINT 27 | 28 | force_actions=$ORACLE_FOLDER/${sset}.force_actions 29 | # where to put results 30 | FOLDER=${MODEL_FOLDER}seed${seed}/beam${BEAM_SIZE}/ 31 | results_prefix=$FOLDER/${sset}_$DECODING_CHECKPOINT 32 | 33 | # needs data and model 34 | [ ! -f "$checkpoint" ] \ 35 | && echo "Missing $checkpoint" \ 36 | && exit 1 37 | 38 | # prepare unit test folder 39 | # [ -d "$FOLDER" ] && rm -R $FOLDER/ 40 | mkdir -p $FOLDER 41 | 42 | [ ! -f "$reference_amr" ] \ 43 | && echo "Missing $reference_amr" \ 44 | && exit 1 45 | 46 | [ ! -f "$force_actions" ] \ 47 | && echo "Missing $force_actions" \ 48 | && force_actions="" 49 | 50 | # # extract sentences from test 51 | # grep '# ::tok ' $ALIGNED_FOLDER/${sset}.txt \ 52 | # | sed 's@# ::tok @@g' > ${results_prefix}.tokens 53 | # 54 | # # run first seed of model 55 | # echo "amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr" 56 | # amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr 57 | 58 | # extract sentences from test 59 | # grep '# ::tok ' $reference_amr \ 60 | # | sed 's@# ::tok @@g' > ${results_prefix}.sentences 61 | cp $ORACLE_FOLDER/${sset}.en ${results_prefix}.sentences 62 | sed -e 's/[[:space:]]\+/ /g' ${results_prefix}.sentences > ${results_prefix}.sentences_notab 63 | # run first seed of model 64 | cmd="amr-parse --fp16 --beam ${BEAM_SIZE} --batch-size ${BATCH_SIZE} -c $checkpoint -i ${results_prefix}.sentences_notab -o ${results_prefix}.amr --sliding --window-size 400 --window-overlap 100 --in-actions $force_actions" 65 | echo "$cmd" 66 | eval "$cmd" 67 | 68 | # GRAPH POST-PROCESSING 69 | 70 | # if [ "$LINKER_CACHE_PATH" == "" ];then 71 | 72 | # # just copy AMR to wiki AMR 73 | # cp ${results_prefix}.amr ${results_prefix}.wiki.amr 74 | 75 | # # TODO: Unelegant detection of linker method (temporary) 76 | # elif [ -f "${LINKER_CACHE_PATH}/trn.wikis" ];then 77 | 78 | # # Legacy linker 79 | # python scripts/add_wiki.py \ 80 | # ${results_prefix}.amr $wiki $LINKER_CACHE_PATH \ 81 | # > ${results_prefix}.wiki.amr 82 | 83 | # else 84 | 85 | # # BLINK cache 86 | # python scripts/retyper.py \ 87 | # --inputfile ${results_prefix}.amr \ 88 | # --outputfile ${results_prefix}.wiki.amr \ 89 | # --skipretyper \ 90 | # --wikify \ 91 | # --blinkcachepath $LINKER_CACHE_PATH \ 92 | # --blinkthreshold 0.0 93 | 94 | # fi 95 | 96 | ## Change rep of docamr to docAMR for smatch 97 | 98 | echo -e "\n Changing rep of dev/test data to docAMR " 99 | doc-amr \ 100 | --in-doc-amr-pairwise 
${results_prefix}.amr \ 101 | --rep docAMR \ 102 | --pairwise-coref-rel same-as \ 103 | --out-amr ${results_prefix}_docAMR.amr 104 | results_prefix=${results_prefix}_docAMR 105 | 106 | ##### SMATCH evaluation 107 | if [[ "$EVAL_METRIC" == "smatch" ]]; then 108 | 109 | # Smatch evaluation without wiki 110 | 111 | # until smatch is fixed, we need to remove the ISI alignment annotations 112 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.amr > ${results_prefix}.amr.no_isi 113 | 114 | echo "Computing SMATCH between ---" 115 | echo "$reference_amr" 116 | echo "${results_prefix}.amr" 117 | doc-smatch -r 1 --significant 4 \ 118 | -f $reference_amr \ 119 | ${results_prefix}.amr.no_isi \ 120 | | tee ${results_prefix}.smatch 121 | 122 | elif [[ "$EVAL_METRIC" == "wiki.smatch" ]]; then 123 | 124 | # Smatch evaluation without wiki 125 | 126 | # until smatch is fixed, we need to remove the ISI alignment annotations 127 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.wiki.amr > ${results_prefix}.wiki.amr.no_isi 128 | 129 | # compute score 130 | echo "Computing SMATCH between ---" 131 | echo "$reference_amr" 132 | echo "${results_prefix}.wiki.amr" 133 | doc-smatch -r 1 --significant 4 \ 134 | -f $reference_amr \ 135 | ${results_prefix}.wiki.amr.no_isi \ 136 | | tee ${results_prefix}.wiki.smatch 137 | 138 | fi 139 | -------------------------------------------------------------------------------- /tests/standalone.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 | else 9 | 10 | # custom config mini-test 11 | config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # use first seed 20 | seed=$(echo $SEEDS | sed 's@ .*@@g') 21 | # rest, from config 22 | sset=test 23 | 24 | reference_amr_wiki=$AMR_TEST_FILE_WIKI 25 | wiki=$LINKER_CACHE_PATH/${sset}.wiki 26 | checkpoint=${MODEL_FOLDER}seed${seed}/$DECODING_CHECKPOINT 27 | 28 | # where to put results 29 | FOLDER=${MODEL_FOLDER}seed${seed}/beam${BEAM_SIZE}/ 30 | results_prefix=$FOLDER/${sset}_$DECODING_CHECKPOINT 31 | 32 | # needs data and model 33 | [ ! -f "$checkpoint" ] \ 34 | && echo "Missing $checkpoint" \ 35 | && exit 1 36 | 37 | # prepare unit test folder 38 | [ -d "$FOLDER" ] && rm -R $FOLDER/ 39 | mkdir -p $FOLDER 40 | 41 | [ ! 
-f "$reference_amr_wiki" ] \ 42 | && echo "Missing $reference_amr_wiki" \ 43 | && exit 1 44 | 45 | # # extract sentences from test 46 | # grep '# ::tok ' $ALIGNED_FOLDER/${sset}.txt \ 47 | # | sed 's@# ::tok @@g' > ${results_prefix}.tokens 48 | # 49 | # # run first seed of model 50 | # echo "amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr" 51 | # amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr 52 | 53 | # extract sentences from test 54 | grep '# ::snt ' $reference_amr_wiki \ 55 | | sed 's@# ::snt @@g' > ${results_prefix}.sentences 56 | 57 | # run first seed of model; --fp16 58 | cmd="amr-parse --beam ${BEAM_SIZE} --batch-size ${BATCH_SIZE} --tokenize -c $checkpoint -i ${results_prefix}.sentences -o ${results_prefix}.amr --out-tokens ${results_prefix}.tokens --out-actions ${results_prefix}.actions" 59 | echo "$cmd" 60 | eval "$cmd" 61 | 62 | # GRAPH POST-PROCESSING 63 | 64 | if [ "$LINKER_CACHE_PATH" == "" ];then 65 | 66 | # just copy AMR to wiki AMR 67 | cp ${results_prefix}.amr ${results_prefix}.wiki.amr 68 | 69 | # TODO: Unelegant detection of linker method (temporary) 70 | elif [ -f "${LINKER_CACHE_PATH}/trn.wikis" ];then 71 | 72 | # Legacy linker 73 | python scripts/add_wiki.py \ 74 | ${results_prefix}.amr $wiki $LINKER_CACHE_PATH \ 75 | > ${results_prefix}.wiki.amr 76 | 77 | else 78 | 79 | # BLINK cache 80 | python scripts/retyper.py \ 81 | --inputfile ${results_prefix}.amr \ 82 | --outputfile ${results_prefix}.wiki.amr \ 83 | --skipretyper \ 84 | --wikify \ 85 | --blinkcachepath $LINKER_CACHE_PATH \ 86 | --blinkthreshold 0.0 87 | 88 | fi 89 | 90 | 91 | ##### SMATCH evaluation 92 | if [[ "$EVAL_METRIC" == "smatch" ]]; then 93 | 94 | # Smatch evaluation without wiki 95 | 96 | # until smatch is fixed, we need to remove the ISI alignment annotations 97 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.amr > ${results_prefix}.amr.no_isi 98 | 99 | echo "Computing SMATCH between ---" 100 | echo "$reference_amr_wiki" 101 | echo "${results_prefix}.amr" 102 | smatch.py -r 10 --significant 4 \ 103 | -f $reference_amr_wiki \ 104 | ${results_prefix}.amr.no_isi \ 105 | | tee ${results_prefix}.smatch 106 | 107 | elif [[ "$EVAL_METRIC" == "wiki.smatch" ]]; then 108 | 109 | # Smatch evaluation without wiki 110 | 111 | # until smatch is fixed, we need to remove the ISI alignment annotations 112 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.wiki.amr > ${results_prefix}.wiki.amr.no_isi 113 | 114 | # compute score 115 | echo "Computing SMATCH between ---" 116 | echo "$reference_amr_wiki" 117 | echo "${results_prefix}.wiki.amr" 118 | smatch.py -r 10 --significant 4 \ 119 | -f $reference_amr_wiki \ 120 | ${results_prefix}.wiki.amr.no_isi \ 121 | | tee ${results_prefix}.wiki.smatch 122 | 123 | fi 124 | -------------------------------------------------------------------------------- /tests/tokenizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from transition_amr_parser.io import protected_tokenizer 4 | from random import shuffle 5 | 6 | 7 | def read_raw_amr(amr_file): 8 | 9 | # AMR file with ::snt and ::tok fields (JAMR) 10 | tokens = [] 11 | sents = [] 12 | with open(amr_file) as fid: 13 | for line in fid: 14 | if line.strip().startswith('# ::snt'): 15 | sents.append(line.split('# ::snt')[-1].strip()) 16 | elif line.strip().startswith('# ::tok'): 17 | tokens.append(line.split('# ::tok')[-1].strip()) 18 | 
assert len(tokens) == len(sents) 19 | return sents, tokens 20 | 21 | 22 | def main(amr_file, do_break=False): 23 | 24 | # # indices to ignore 25 | # ignore_indices =[ 26 | # 384, 385, 973, 1541, 27 | # 669, # 'a 28 | # 865, # 120. 29 | # 1335, # gov't 30 | # 1411, # !!!) 31 | # 1520, # PA. 32 | # ] 33 | ignore_indices = [] 34 | 35 | # read data 36 | sents, tokens = read_raw_amr(amr_file) 37 | 38 | # random order 39 | indices = list(range(len(tokens))) 40 | shuffle(indices) 41 | 42 | # simple tokenizer 43 | count = 0 44 | for index in indices: 45 | new_tokens = ' '.join(protected_tokenizer(sents[index], simple=True)[0]) 46 | if tokens[index] == new_tokens: 47 | count += 1 48 | elif do_break and index not in ignore_indices: 49 | print(index) 50 | print(sents[index]) 51 | print(tokens[index]) 52 | print(new_tokens) 53 | import ipdb; ipdb.set_trace(context=30) 54 | protected_tokenizer(sents[index]) 55 | 56 | perc = count * 100. / len(tokens) 57 | print(f'simple match {count}/{len(tokens)} {perc:.2f} %') 58 | 59 | # JAMR like tokenizer 60 | count = 0 61 | for index in indices: 62 | new_tokens = ' '.join(protected_tokenizer(sents[index])[0]) 63 | 64 | if tokens[index] == new_tokens: 65 | count += 1 66 | elif do_break and index not in ignore_indices: 67 | print(index) 68 | print(sents[index]) 69 | print(tokens[index]) 70 | print(new_tokens) 71 | import ipdb; ipdb.set_trace(context=30) 72 | protected_tokenizer(sents[index]) 73 | 74 | perc = count * 100. / len(tokens) 75 | print(f'JAMR-like match {count}/{len(tokens)} {perc:.2f} %') 76 | 77 | 78 | if __name__ == '__main__': 79 | main(sys.argv[1], do_break=False) 80 | --------------------------------------------------------------------------------
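For reference, below is a minimal usage sketch of the tokenizer exercised by tests/tokenizer.py above. It is not part of the repository files listed here; it only assumes that transition_amr_parser is installed and that protected_tokenizer returns a tuple whose first element is the token list (as the test relies on). The example sentence is purely illustrative.

# Minimal sketch: compare simple vs. JAMR-like tokenization directly.
# Assumption: protected_tokenizer returns (tokens, positions); only the tokens are used here.
from transition_amr_parser.io import protected_tokenizer

sentence = "The boy wants to visit New York City (NYC)."

# simple rule-based tokenization
simple_tokens = protected_tokenizer(sentence, simple=True)[0]
print(' '.join(simple_tokens))

# JAMR-like tokenization, intended to reproduce the # ::tok fields of aligned AMR files
jamr_tokens = protected_tokenizer(sentence)[0]
print(' '.join(jamr_tokens))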