├── .gitignore ├── DATA └── wiki25.jkaln ├── LICENSE ├── README.md ├── configs ├── amr2.0-structured-bart-large-joint-voc-neur-al.sh ├── amr2.0-structured-bart-large-joint-voc.sh ├── amr2.0-structured-bart-large-neur-al-importance-sampling5.sh ├── amr2.0-structured-bart-large-neur-al-sampling5.sh ├── amr2.0-structured-bart-large-neur-al.sh ├── amr2.0-structured-bart-large.sh ├── amr2joint_ontowiki2_g2g-structured-bart-large.sh ├── amr3.0-structured-bart-large-doc-sliding-ws300x200.sh ├── amr3.0-structured-bart-large-doc-truncate-sliding-finetune-ws200x100.sh ├── amr3.0-structured-bart-large-doc-truncate-sliding-ws300x200.sh ├── amr3.0-structured-bart-large-doc-truncate.sh ├── amr3.0-structured-bart-large-doc_MODE-doc+sen-truncate-sliding-ws200x100.sh ├── amr3.0-structured-bart-large-doc_MODE-doc+sen-truncate.sh ├── amr3.0-structured-bart-large-joint-voc-neur-al.sh ├── amr3.0-structured-bart-large-joint-voc.sh ├── amr3.0-structured-bart-large-neur-al-sampling5.sh ├── amr3.0-structured-bart-large-neur-al.sh ├── amr3.0-structured-bart-large.sh ├── amr3joint_ontowiki2_g2g-structured-bart-large.sh ├── both_doc+sen.sh ├── both_doc+sen_packed.sh ├── both_doc+sen_trainsliding_ws400x100.sh ├── doc-finetune-from-conll-good-ws300x200-lr00005.sh ├── gold_doc+sen.sh ├── little_prince-structured-bart-base-neur-al.sh ├── wiki25-structured-bart-base-joint-voc.sh ├── wiki25-structured-bart-base-neur-al-importance-sampling.sh ├── wiki25-structured-bart-base-neur-al-mini.sh ├── wiki25-structured-bart-base-neur-al-sampling5.sh ├── wiki25-structured-bart-base-neur-al.sh └── wiki25-structured-bart-base.sh ├── docker └── Dockerfile ├── pyproject.toml ├── run ├── align.sh ├── amr_actions.sh ├── lsf │ ├── README.md │ ├── align.sh │ ├── final_test.sh │ ├── parse.sh │ ├── run_experiment.sh │ └── run_model_eval.sh ├── parse.sh ├── preprocess.sh ├── run_experiment.sh ├── run_model_eval.sh ├── run_model_eval_sliding.sh ├── status.py ├── status.sh ├── test.sh ├── train.sh └── train_aligner.sh ├── scripts ├── Blinker.py ├── README.md ├── add_wiki.py ├── amr_latex.py ├── convert_jamr_alignments_to_offsets.py ├── doc-amr │ ├── docamr_utils.py │ ├── get_doc_amr_from_sen.py │ ├── pack_amrs.py │ ├── remove_amrs.py │ └── remove_sen.py ├── export_alignment_model.sh ├── export_model.sh ├── install_satori.sh ├── jamr2isi.py ├── mbse.py ├── merge_files.py ├── parse.sh ├── play.py ├── plot_amr.py ├── plot_amr_latex.py ├── plot_results.py ├── read_propbank.py ├── remove_optimizer_state.py ├── remove_wiki.py ├── retyper.py ├── sanity_check_amr.py ├── smatch_aligner.py ├── split_amrs.py ├── tokenize_amr.py ├── triple_stats.py └── vimdiff_amr_files.py ├── service ├── amr.proto ├── amr2.proto ├── amr_client.py ├── amr_server.py └── wordvec.proto ├── setup.py ├── src ├── fairseq_ext │ ├── __init__.py │ ├── amr_reform │ │ ├── __init__.py │ │ └── o10_action_reformer_subtok.py │ ├── amr_spec │ │ ├── __init__.py │ │ ├── action_info.py │ │ ├── action_info_bartsv.py │ │ ├── action_info_binarize.py │ │ ├── action_info_binarize_bartsv.py │ │ ├── action_info_binarize_graphmp.py │ │ ├── action_info_binarize_graphmp_amr1.py │ │ ├── action_info_graphmp.py │ │ ├── action_info_graphmp_amr1.py │ │ ├── old_action_info.py │ │ └── old_action_info_binarize.py │ ├── average_checkpoints.py │ ├── binarize.py │ ├── criterions │ │ ├── __init__.py │ │ ├── label_smoothed_cross_entropy_pointer.py │ │ └── label_smoothed_cross_entropy_pointer_alignment.py │ ├── data │ │ ├── __init__.py │ │ ├── amr_action_pointer_bartsv_dataset.py │ │ ├── 
amr_action_pointer_dataset.py │ │ ├── amr_action_pointer_goldamr_dataset.py │ │ ├── amr_action_pointer_graphmp_dataset.py │ │ ├── amr_bpe.py │ │ ├── data_utils.py │ │ ├── indexed_dataset.py │ │ └── language_pair_dataset.py │ ├── extract_bart │ │ ├── __init__.py │ │ ├── binarize_encodings.py │ │ ├── composite_embeddings.py │ │ ├── mapavg_embeddings.py │ │ └── sentence_encoding.py │ ├── generate.py │ ├── generate_sliding.py │ ├── models │ │ ├── __init__.py │ │ ├── attention_masks.py │ │ ├── graph_attention_masks.py │ │ ├── graphmp_attention_masks.py │ │ ├── transformer_tgt_pointer.py │ │ ├── transformer_tgt_pointer_bart.py │ │ ├── transformer_tgt_pointer_bart_sattn.py │ │ ├── transformer_tgt_pointer_bartsv.py │ │ ├── transformer_tgt_pointer_bartsv_sattn.py │ │ ├── transformer_tgt_pointer_graph.py │ │ └── transformer_tgt_pointer_graphmp.py │ ├── modules │ │ ├── __init__.py │ │ ├── factored_embeddings.py │ │ ├── multihead_attention.py │ │ ├── multihead_attention_old.py │ │ ├── transformer_layer.py │ │ └── transformer_layer_old.py │ ├── options.py │ ├── options_train.py │ ├── preprocess.py │ ├── preprocess_bart.py │ ├── preprocess_bartsv.py │ ├── preprocess_graphmp.py │ ├── roberta │ │ ├── __init__.py │ │ ├── binarize_embeddings.py │ │ ├── pretrained_embeddings.py │ │ └── pretrained_embeddings_bert.py │ ├── sequence_generator.py │ ├── sequence_generator_bartsv.py │ ├── sequence_generator_graph.py │ ├── sequence_generator_graphmp.py │ ├── tasks │ │ ├── __init__.py │ │ ├── amr_action_pointer.py │ │ ├── amr_action_pointer_bart.py │ │ ├── amr_action_pointer_bart_dyo.py │ │ ├── amr_action_pointer_bartsv.py │ │ ├── amr_action_pointer_graphmp.py │ │ └── amr_action_pointer_graphmp_amr1.py │ ├── tests │ │ ├── test_action_info_graphmp_tofile.py │ │ ├── test_action_info_tofile.py │ │ ├── test_action_info_tolist.py │ │ ├── test_amr_action_bpe.py │ │ ├── test_amr_action_unk.py │ │ ├── test_composite_embeddings.py │ │ ├── test_composite_embeddings_mapping.py │ │ └── test_factored_embeddings.py │ ├── tokenizer.py │ ├── train.py │ ├── utils.py │ ├── utils_font.py │ └── utils_import.py ├── ibm_neural_aligner │ ├── README.md │ ├── __init__.py │ ├── align_leamr.py │ ├── align_utils.py │ ├── alignment_decoder.py │ ├── amr_utils.py │ ├── ccc.launch_many_jobs.py │ ├── ccc.summarize.py │ ├── dummy_align.py │ ├── evaluation.py │ ├── formatter.py │ ├── gcn.py │ ├── gypsum │ │ ├── setup_amr2.sh │ │ ├── setup_amr3.sh │ │ ├── sweep.0.py │ │ ├── sweep.4a.py │ │ ├── templates.py │ │ ├── view_sweep.2.py │ │ └── view_sweep.py │ ├── install.sh │ ├── leamr_align.py │ ├── lexicon.py │ ├── main.py │ ├── make_splits.py │ ├── metric_utils.py │ ├── pretrained_embeddings.py │ ├── pretrained_embeddings.sh │ ├── resolve_manual_alignments.py │ ├── run_detailed_eval.py │ ├── run_eval.py │ ├── run_model_selection.py │ ├── run_sampler.py │ ├── standalone_elmo.py │ ├── tokenize_amr.py │ ├── transformer_lm.py │ ├── tree_lstm.py │ ├── tree_rnn.py │ ├── view_manual_alignments.py │ ├── vocab.py │ └── vocab_definitions.py └── transition_amr_parser │ ├── __init__.py │ ├── action_pointer │ ├── __init__.py │ ├── amr_parser.py │ ├── o8_data_oracle.py │ ├── o8_fake_parse.py │ ├── o8_state_machine.py │ ├── o8_state_machine_amr1.py │ ├── o8_state_machine_reformer.py │ ├── o8_state_machine_reformer_amr1.py │ ├── parse.py │ └── roberta_utils.py │ ├── add_id_to_amr.py │ ├── add_sentence_amrs_to_file.py │ ├── amr.py │ ├── amr_aligner.py │ ├── amr_constituents.py │ ├── amr_latex.py │ ├── amr_machine.py │ ├── clbar.py │ ├── force_overlap_actions.py │ ├── 
gold_subgraph_align.py │ ├── io.py │ ├── make_sliding_splits.py │ ├── merge_sliding_splits.py │ ├── parse.py │ └── plots.py └── tests ├── align_mode.py ├── align_mode.sh ├── all.sh ├── amr_io.py ├── amr_io.sh ├── correctly_installed.py ├── correctly_installed.sh ├── create_wiki25_mockup.sh ├── download_little_prince.sh ├── fairseq_data_iterator.py ├── fairseq_data_iterator.sh ├── minimal_test.sh ├── minimal_test_lsf.sh ├── neural_aligner.sh ├── oracles ├── amr_o10.sh └── amr_o10_doc.sh ├── smatch.sh ├── standalone-doc.sh ├── standalone.sh └── tokenizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # These should not be commited 2 | # environment setter. May contain keys and other THIS CAN NOT BE COMMITED 3 | set_environment.sh 4 | 5 | # git 6 | *.orig 7 | .mailmap 8 | 9 | # data 10 | media/ 11 | PROGRESS* 12 | DATA* 13 | *.zip 14 | *.json 15 | *.npy 16 | # *.npy.* 17 | 18 | # run scripts logs 19 | logs* 20 | *.log 21 | 22 | # external tools 23 | amr-evaluation/ 24 | git-filter-repo/ 25 | 26 | # virtual environments 27 | .python-version 28 | venv*/ 29 | cenv*/ 30 | 31 | # debug 32 | *.lprof 33 | debug* 34 | tmp* 35 | TMP* 36 | 37 | # other 38 | __pycache__/ 39 | *.ipynb_checkpoints/ 40 | *.ipynb 41 | transition_amr_parser.egg-info/ 42 | 43 | # python package 44 | dist/ 45 | 46 | # cluster tools 47 | jbsub_logs/ 48 | .jbsub_logs/ 49 | 50 | # vim 51 | .vim/ 52 | ctags 53 | *.swp 54 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.access.redhat.com/ubi8/ubi:latest as rhel-base 2 | 3 | RUN dnf update -y \ 4 | && dnf install -y --disableplugin=subscription-manager \ 5 | python38 \ 6 | python38-setuptools \ 7 | python38-wheel \ 8 | python38-pip \ 9 | python38-devel \ 10 | make \ 11 | git \ 12 | glibc-langpack-en \ 13 | curl \ 14 | gcc \ 15 | gcc-c++ \ 16 | unzip \ 17 | && dnf autoremove -y \ 18 | && dnf clean all \ 19 | && pip3 install --upgrade --no-cache-dir pip 20 | 21 | ADD https://github.com/ibmruntimes/semeru8-binaries/releases/download/jdk8u302-b08_openj9-0.27.0/ibm-semeru-open-8-jdk-1.8.0.302.b08-1.x86_64.rpm . 22 | RUN dnf install -y ibm-semeru-open-8-jdk-1.8.0.302.b08-1.x86_64.rpm 23 | 24 | #RUN locale-gen en_US.UTF-8 && /usr/sbin/update-locale LANG=en_US.UTF-8 25 | ENV LANG en_US.UTF-8 26 | ENV LANGUAGE en_US:en 27 | ENV LC_ALL en_US.UTF-8 28 | # Model Location 29 | 30 | ENV MODEL_PATH "DATA/gofa20220412/models/exp_cofill_o10_act-states_bart.base/_act-pos_vmask1_shiftpos1_ptr-lay6-h1_cam-layall-h2-abuf_dec-sep-emb-sha0_bart-init-dec-emb__fp16-_lr0.0001-mt1024x8-wm4000-dp0.2/ep15-seed44/checkpoint_top3-avg.pt" 31 | 32 | # GRPC Port (so that it can be set during run time) 33 | ENV GRPC_PORT "50051" 34 | 35 | # Set cache paths 36 | ENV CACHE_DIR "DATA" 37 | ENV ROBERTA_CACHE_PATH ${CACHE_DIR}/bart.base 38 | ENV PYTHONPATH /amr_parser/pip_modules 39 | ## Install grpc for python3 40 | 41 | FROM rhel-base as rhel-stage1 42 | ARG ARTIFACTORY_USERNAME 43 | ARG ARTIFACTORY_API_KEY 44 | ENV ARTIFACTORY_USERNAME=$ARTIFACTORY_USERNAME 45 | ENV ARTIFACTORY_API_KEY=$ARTIFACTORY_API_KEY 46 | 47 | # ADD . 
/amr_parser/ 48 | WORKDIR /amr_parser 49 | 50 | COPY LICENSE README.md setup.py requirements.txt /amr_parser/ 51 | COPY DATA/ /amr_parser/DATA 52 | COPY preprocess/ /amr_parser/preprocess 53 | COPY scripts/ /amr_parser/scripts/ 54 | COPY tests/ /amr_parser/tests 55 | COPY configs/ /amr_parser/configs/ 56 | COPY fairseq_ext/ /amr_parser/fairseq_ext/ 57 | COPY service/ /amr_parser/service 58 | COPY transition_amr_parser/ /amr_parser/transition_amr_parser/ 59 | 60 | RUN python3 -m pip install -t ${PYTHONPATH} --upgrade pip \ 61 | && python3 -m pip install -t ${PYTHONPATH} protobuf grpcio grpcio-tools grpcio-health-checking \ 62 | && python3 -m pip install -t ${PYTHONPATH} statsd 63 | 64 | # Copy code 65 | # ADD . /amr_parser 66 | 67 | #RUN scripts/update_config.sh \ 68 | # --artifactory_username $ARTIFACTORY_USERNAME \ 69 | # --artifactory_api_key $ARTIFACTORY_API_KEY \ 70 | # --encode_username 71 | #RUN pip install -t ${PYTHONPATH} 'pyizumo[dp]'==0.1.5 watson-sire==1.0.18 requests 72 | 73 | ARG TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX 7.5+PTX" 74 | # Install the packages 75 | RUN python3 -m pip install -t ${PYTHONPATH} . 76 | #RUN pip3 install -t ${PYTHONPATH} -r requirements.txt 77 | RUN rm -rf ${PYTHONPATH}/dataclasses* 78 | RUN python3 -m pip install -t ${PYTHONPATH} torch-scatter==1.3.2 79 | 80 | # Compile the protos 81 | RUN python3 -m grpc_tools.protoc -I./service/ --python_out=./service/ --grpc_python_out=./service/ ./service/wordvec.proto 82 | RUN python3 -m grpc_tools.protoc -I./service/ --python_out=./service/ --grpc_python_out=./service/ ./service/amr2.proto 83 | # RUN chown worker:worker /amr_parser 84 | RUN rm -rf *.zip 85 | RUN chmod -R 777 /amr_parser 86 | 87 | FROM golang:1.17 AS grpcurl_build 88 | RUN go install github.com/fullstorydev/grpcurl/cmd/grpcurl@latest 89 | 90 | FROM rhel-base as amr-final 91 | COPY --from=rhel-stage1 /amr_parser/ /amr_parser/ 92 | COPY --from=grpcurl_build /go/bin/grpcurl /usr/local/bin/grpcurl 93 | # start the server 94 | ENV PYTHONPATH "/amr_parser:/amr_parser/server:/amr_parser/pip_modules" 95 | 96 | WORKDIR /amr_parser 97 | RUN ls -l /amr_parser/DATA 98 | RUN ls -l /amr_parser/DATA/bart.base 99 | #RUN mkdir -p /.cache && chmod -R 777 /.cache 100 | #RUN python3 service/amr_test.py -m ${MODEL_PATH} -c ${ROBERTA_CACHE_PATH} 101 | CMD python3 -u service/amr_server.py --in-model ${MODEL_PATH} --roberta-cache-path ${ROBERTA_CACHE_PATH} --port ${GRPC_PORT} 102 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "transition_neural_parser" 7 | version = '0.5.4' 8 | authors = [ 9 | { name="Ramon", email="ramon.astudillo@ibm.com" }, 10 | { name="Young-Suk", email="ysuklee@us.ibm.com" }, 11 | { name="Tahira", email="tnaseem@us.ibm.com" }, 12 | { name="Sadhana", email="sadhana.kumaravel1@ibm.com" }, 13 | { name="GX", email="GX.Xu@ibm.com" }, 14 | { name="Hans", email="raduf@us.ibm.com" }, 15 | { name="Salim", email="roukos@us.ibm.com" }, 16 | ] 17 | description = "The package for transition based nueral AMR parser" 18 | readme = "README.md" 19 | requires-python = ">=3.7" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: Apache Software License", 23 | "Operating System :: POSIX :: Linux", 24 | ] 25 | 26 | [tool.poetry.platforms] 27 | os = "linux, darwin" 28 | 
29 | [project.urls] 30 | homepage = "https://github.com/IBM/transition-amr-parser" 31 | tracker = "https://github.com/IBM/transition-amr-parser" -------------------------------------------------------------------------------- /run/align.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | set -o errexit 3 | . set_environment.sh 4 | HELP="$0 " 5 | [ -z $1 ] && echo "$HELP" && exit 1 6 | [ -z $2 ] && echo "$HELP" && exit 1 7 | [ -z $3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | in_amr=$2 10 | out_amr=$3 11 | set -o nounset 12 | 13 | amr-parse --in-checkpoint $checkpoint --in-amr $in_amr --out-amr $out_amr --batch-size 512 --roberta-batch-size 512 14 | -------------------------------------------------------------------------------- /run/lsf/README.md: -------------------------------------------------------------------------------- 1 | 2 | This code is intended to train models from scratch on the CCC cluster but can 3 | be repurposed for other task managers e.g. slurm. You can do a mini run to 4 | check how this all works under 5 | 6 | bash tests/minimal_test_lsf.sh 7 | 8 | First of all make sure you have installed according to README.md. Be sure to 9 | activate your environment in `set_environment.sh` since this is called by the 10 | different scripts 11 | 12 | Then ensure you have unzipped the data from its location, you will need at least 13 | 14 | 1. the corpus you want to train for e.g. AMr2.0 (optionally already aligned) 15 | 16 | 2. the entity linking cache for that corpus 17 | 18 | once you have unzipped these items we are ready to go. The code is though to be 19 | latched from a **login node** not a compute node. You will need some app to 20 | have a pervasive session on that login node (this is a good idea in general) 21 | like tmux (recommended) or screen. From one of those do e.g. 22 | 23 | bash run/lsf/run_experiment.sh configs/amr2.0-structured-bart-large-neur-al.sh 24 | 25 | this will launch all the needed jobs in a dependent fashion so that one is run 26 | after another (seeds will be ran in parallel). It will also display the status 27 | of the training. The script will hold until the first checkpoint is created to 28 | launch the evaluation jobs. This is why this command line call needs to be kept 29 | alive, after that it is no longer necessary. 30 | 31 | At any point you can do 32 | 33 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh 34 | 35 | to check the status of that experiment. Once results start appearing, you can use 36 | 37 | bash run/status.sh --configs configs/amr2.0-structured-bart-large-neur-al.sh --results 38 | 39 | to check progress. To compare models and get details of loss and Smatch, you 40 | can plot a png and bring it locally with scp with 41 | 42 | python scripts/plot_results.py --in-configs configs/amr2.0-structured-bart-large-neur-al.sh --title my-training --out-png my-training.png 43 | 44 | each step of the experiment has its own folder and it is completed it should 45 | have a `.done` file. If you delete this the stage will be redone (not the 46 | neural aligner has multiple of these files). The final model should be found under e.g. 47 | 48 | DATA/AMR2.0/models/amr2.0-structured-bart-large-neur-al/ 49 | 50 | We try to avoid running on the tests set to prevent corpus overfitting, this 51 | can be done with 52 | 53 | bash run/lsf/final_test.sh configs/amr2.0-structured-bart-large-neur-al.sh 54 | 55 | It will ask you to confirm. 
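Before launching the final test you can also double-check by hand which stages of an experiment already finished: every completed stage folder contains a `.done` marker (the neural aligner keeps several of these, as noted above). Below is a minimal sketch of such a check — it is not part of the repository (`run/status.py` is the real tool) and the `DATA/AMR2.0` path is only an example:

```
# minimal sketch, NOT run/status.py: list stage folders that contain a .done marker
import sys
from pathlib import Path

def report_done_markers(data_root="DATA"):
    root = Path(data_root)
    # ".done*" also catches the aligner's extra markers such as .done.train
    done_dirs = sorted({p.parent for p in root.rglob(".done*")})
    for folder in done_dirs:
        print(f"[done] {folder}")
    if not done_dirs:
        print(f"no completed stages found under {root}")

if __name__ == "__main__":
    # e.g. python check_done.py DATA/AMR2.0
    report_done_markers(sys.argv[1] if len(sys.argv) > 1 else "DATA")
```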
56 | 57 | Once training is done you can save space by calling 58 | 59 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --final-remove 60 | 61 | This will remove the optimizer from configs `DECODING_CHECKPOINT` and delete 62 | all other. Save copies if you want further train later. 63 | 64 | to save the minimal files needed for a model into a zip do 65 | 66 | bash scripts/export_model.sh configs/amr2.0-structured-bart-large-neur-al.sh 67 | 68 | ## Things that can go wrong 69 | 70 | Code is built to be able to resume if it stops, just do 71 | 72 | bash run/lsf/run_experiment.sh configs/amr2.0-structured-bart-large-neur-al.sh 73 | 74 | But it should not die, so if it did it is important to find the reason first 75 | before resuming. 76 | 77 | The most common problem is that you hit your space quota and code dies halfway 78 | while writing a checkpoint. You need to know how to check your quota to avoid 79 | this. Also the jobs doing evaluation also take care of removing checkpoints. If 80 | these die then your space can finish quickly. This should not happen and it is 81 | best to find the reason why this happened before relaunching evaluation. You 82 | can do this with 83 | 84 | bash run/lsf/run_model_eval.sh configs/amr2.0-structured-bart-large-neur-al.sh 85 | 86 | If you hit your quota, you need to fix that first, then you will also have to 87 | find and delete corrupted checkpoints. For this you can use 88 | 89 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --remove-corrupted-checkpoints 90 | 91 | the code automatically calls 92 | 93 | bash run/status.sh -c configs/amr2.0-structured-bart-large-neur-al.sh --link-best --remove 94 | 95 | to find the best checkpoint and remove checkpoints not in the top n-best, but 96 | it may come handy to run this yourself at some point. It is already a bad 97 | state of affairs if some checkpoint got deleted without being evaluated, but 98 | you can always ignore this by adding `--ignore-missing-checkpoints` 99 | 100 | ## Parsing Large Corpora 101 | 102 | calling this script on a login node 103 | 104 | ``` 105 | bash run/lsf/parse.sh [-s chunk size] 106 | ``` 107 | 108 | will split your data into chunks of `` and launch a paralell job for each. Results for each chunk are stored in 109 | 110 | ``` 111 | .split_ 112 | ``` 113 | 114 | once all jobs are completed to recompose, just do 115 | 116 | ``` 117 | cat .split_* > 118 | ``` 119 | -------------------------------------------------------------------------------- /run/lsf/align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | . set_environment.sh 5 | set -o nounset 6 | 7 | # Argument handling 8 | # First argument must be checkpoint 9 | HELP="\nbash $0 [-s ]\n" 10 | [ -z "$1" ] && echo -e "$HELP" && exit 1 11 | [ -z "$2" ] && echo -e "$HELP" && exit 1 12 | [ -z "$3" ] && echo -e "$HELP" && exit 1 13 | first_path=$(echo $1 | sed 's@:.*@@g') 14 | [ ! 
-f "$first_path" ] && "Missing $1" && exit 1 15 | checkpoint=$1 16 | in_amr=$2 17 | out_amr=$3 18 | # process the rest with argument parser 19 | max_split_size=2000 20 | shift 21 | shift 22 | shift 23 | while [ "$#" -gt 0 ]; do 24 | case "$1" in 25 | -s) max_split_size="$2"; shift 2;; 26 | *) echo "unrecognized argument: $1"; exit 1;; 27 | esac 28 | done 29 | 30 | # splits folder 31 | splits_folder=${out_amr}.${max_split_size}splits/ 32 | mkdir -p $splits_folder 33 | 34 | # Split files 35 | split_files=$( 36 | python scripts/split_amrs.py \ 37 | $in_amr $max_split_size ${splits_folder}/in_split 38 | ) 39 | 40 | # Launch multiple decodings jobs 41 | for split in $split_files;do 42 | 43 | echo "bash run/align.sh $checkpoint $split ${split}.amr" 44 | 45 | if [ ! -f "${split}.amr" ];then 46 | 47 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 48 | -name $(basename $split)-$$ \ 49 | -out ${splits_folder}/align-%J-$$.stdout \ 50 | -err ${splits_folder}/align-%J-$$.stderr \ 51 | /bin/bash run/align.sh $checkpoint $split ${split}.amr 52 | 53 | fi 54 | 55 | done 56 | -------------------------------------------------------------------------------- /run/lsf/final_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\ne.g. bash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | if [ -z "$2" ];then 10 | # identify experiment by the repository tag 11 | jbsub_basename="$(basename $config | sed 's@\.sh$@@')" 12 | else 13 | # identify experiment by given tag 14 | jbsub_basename=$2 15 | fi 16 | # set environment (needed for the python code below) 17 | # NOTE: Old set_environment.sh forbids launching in login node. 18 | . set_environment.sh 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # MANUAL OVERRIDE !! 27 | # BEAM_SIZE=1 28 | # DECODING_CHECKPOINT=checkpoint_wiki.smatch_best1.pt 29 | 30 | # Running test announcement 31 | printf "\n\033[93mWARNING\033[0m: Everytime you look at the test set, your corpus dies a little (by corpus overfitting)\n\n" 32 | echo -e " \nbash run/ad_test.sh ${MODEL_FOLDER}seed${SEEDS}/$DECODING_CHECKPOINT -b $BEAM_SIZE -s test\n" 33 | read -p "Do you wish to continue? 
Y/[N]" answer 34 | [ "$answer" != "Y" ] && exit 1 35 | 36 | # Exit if we launch this directly from a computing node 37 | if [[ "$HOSTNAME" =~ dccpc.* ]] || [[ "$HOSTNAME" =~ dccx[cn].* ]] || [[ "$HOSTNAME" =~ cccx[cn].* ]];then 38 | echo -e "\n$0 must be launched from a login node (submits its own jbsub calls)\n" 39 | exit 1 40 | fi 41 | 42 | for seed in $SEEDS;do 43 | 44 | # define seed and working dir 45 | checkpoints_dir="${MODEL_FOLDER}seed${seed}/" 46 | 47 | # test all available checkpoints and link the best model on dev too 48 | jbsub_tag="fdec-${jbsub_basename}-s${seed}-$$" 49 | jbsub -cores 1+1 -mem 150g -q x86_6h -require v100 \ 50 | -name "$jbsub_tag" \ 51 | -out $checkpoints_dir/${jbsub_tag}-%J.stdout \ 52 | -err $checkpoints_dir/${jbsub_tag}-%J.stderr \ 53 | /bin/bash run/test.sh ${checkpoints_dir}/$DECODING_CHECKPOINT \ 54 | -b $BEAM_SIZE \ 55 | -s test 56 | 57 | done 58 | -------------------------------------------------------------------------------- /run/lsf/parse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | # First argument must be checkpoint 7 | HELP="\nbash $0 [-s ] [--tokenize]\n" 8 | [ -z "$1" ] && echo -e "$HELP" && exit 1 9 | [ -z "$2" ] && echo -e "$HELP" && exit 1 10 | first_path=$(echo $1 | sed 's@:.*@@g') 11 | [ ! -f "$first_path" ] && "Missing $1" && exit 1 12 | checkpoint=$1 13 | tokenized_sentences=$2 14 | # process the rest with argument parser 15 | tokenize="" 16 | max_split_size=2000 17 | shift 18 | shift 19 | while [ "$#" -gt 0 ]; do 20 | case "$1" in 21 | --tokenize) tokenize="--tokenize"; shift 1;; 22 | -s) max_split_size="$2"; shift 2;; 23 | *) echo "unrecognized argument: $1"; exit 1;; 24 | esac 25 | done 26 | 27 | set -o nounset 28 | 29 | # Split files 30 | split -l $max_split_size $tokenized_sentences ${tokenized_sentences}.split_ 31 | 32 | # Launch multiple decodings jobs 33 | for split in $(ls ${tokenized_sentences}.split_*);do 34 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 35 | -name $(basename $split) \ 36 | -out $(dirname $split)%J.stdout \ 37 | -err $(dirname $split)/%J.stderr \ 38 | /bin/bash run/parse.sh $checkpoint $split ${split}.amr $tokenize 39 | done 40 | -------------------------------------------------------------------------------- /run/lsf/run_model_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\ne.g. bash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | if [ -z "$2" ];then 10 | # identify experiment by the repository tag 11 | jbsub_basename="$(basename $config | sed 's@\.sh$@@')" 12 | else 13 | # identify experiment by given tag 14 | jbsub_basename=$2 15 | fi 16 | # set environment (needed for the python code below) 17 | # NOTE: Old set_environment.sh forbids launching in login node. 18 | . set_environment.sh 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! 
-d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # wait until first checkpoint is available for any of the seeds. 39 | # Clean-up checkpoints and inform of status in the meanwhile 40 | python run/status.py -c $config \ 41 | --wait-checkpoint-ready-to-eval --clear 42 | 43 | for seed in $SEEDS;do 44 | 45 | checkpoints_dir="${MODEL_FOLDER}seed${seed}/" 46 | 47 | # test all available checkpoints and link the best model on dev too 48 | jbsub_tag="tdec-${jbsub_basename}-s${seed}-$$" 49 | jbsub -cores 1+1 -mem 50g -q x86_6h -require v100 \ 50 | -name "$jbsub_tag" \ 51 | -out $checkpoints_dir/${jbsub_tag}-%J.stdout \ 52 | -err $checkpoints_dir/${jbsub_tag}-%J.stderr \ 53 | /bin/bash run/run_model_eval.sh $config "$seed" 54 | 55 | done 56 | 57 | # wait until final models has been evaluated 58 | # NOTE checkpoints are cleaned-up by run_model_eval.sh 59 | python run/status.py -c $config --wait-finished --clear 60 | -------------------------------------------------------------------------------- /run/parse.sh: -------------------------------------------------------------------------------- 1 | set -o pipefail 2 | set -o errexit 3 | . set_environment.sh 4 | HELP="$0 [--tokenize]" 5 | [ -z $1 ] && echo "$HELP" && exit 1 6 | [ -z $2 ] && echo "$HELP" && exit 1 7 | [ -z $3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | tokenized_sentences=$2 10 | out_amr=$3 11 | 12 | tokenize="" 13 | shift 3 14 | while [ "$#" -gt 0 ]; do 15 | case "$1" in 16 | --tokenize) tokenize="--tokenize"; shift 1;; 17 | *) echo "unrecognized argument: $1"; exit 1;; 18 | esac 19 | done 20 | 21 | amr-parse \ 22 | --in-checkpoint $checkpoint \ 23 | --in-tokenized-sentences $tokenized_sentences \ 24 | --out-amr $out_amr \ 25 | $tokenize 26 | -------------------------------------------------------------------------------- /run/run_experiment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | set -o pipefail 4 | 5 | # Argument handling 6 | HELP="\nbash $0 \n" 7 | [ -z "$1" ] && echo -e "$HELP" && exit 1 8 | config=$1 9 | [ ! -f "$config" ] && "Missing $config" && exit 1 10 | 11 | # activate virtualenenv and set other variables 12 | . set_environment.sh 13 | 14 | set -o nounset 15 | 16 | # random seed 17 | seed=42 18 | # decode in paralel to training. ATTENTION: you will need to GPUS for this 19 | on_the_fly_decoding=false 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | # not using neural aligner but no alignments provided 38 | if [ "$align_tag" != "ibm_neural_aligner" ] && [ ! 
-f $ALIGNED_FOLDER/.done ];then 39 | echo -e "\nYou need to provide $align_tag alignments\n" 40 | exit 1 41 | fi 42 | 43 | # This will store the final model 44 | mkdir -p ${MODEL_FOLDER}seed${seed} 45 | # Copy the config and soft-link it with an easy to find name 46 | cp $config ${MODEL_FOLDER}seed${seed}/ 47 | rm -f ${MODEL_FOLDER}seed${seed}/config.sh 48 | ln -s $(basename $config) ${MODEL_FOLDER}seed${seed}/config.sh 49 | 50 | # Add a tag with the commit(s) used to train this model. 51 | if [ "$(git status --porcelain | grep -v '^??')" == "" ];then 52 | # no uncommited changes 53 | touch "${MODEL_FOLDER}seed${seed}/$(git log --format=format:"%h" -1)" 54 | else 55 | # uncommited changes 56 | touch "${MODEL_FOLDER}seed${seed}/$(git log --format=format:"%h" -1)+" 57 | fi 58 | 59 | echo "[Aligning AMR:]" 60 | mkdir -p $ALIGNED_FOLDER 61 | bash run/train_aligner.sh $config 62 | 63 | echo "[Building oracle actions:]" 64 | mkdir -p $ORACLE_FOLDER 65 | # TODO: replace by task agnostic oracle creation 66 | bash run/amr_actions.sh $config 67 | 68 | echo "[Preprocessing data:]" 69 | mkdir -p $DATA_FOLDER 70 | bash run/preprocess.sh $config 71 | 72 | [ "$on_the_fly_decoding" = true ] \ 73 | && echo "[Decoding and computing smatch (on the fly):]" \ 74 | && bash run/run_model_eval.sh $config $seed & 75 | 76 | echo "[Training:]" 77 | bash run/train.sh $config $seed 78 | 79 | [ "$on_the_fly_decoding" = false ] \ 80 | && echo "[Decoding and computing smatch:]" \ 81 | && bash run/run_model_eval.sh $config $seed 82 | -------------------------------------------------------------------------------- /run/run_model_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # folder of the model seed 39 | checkpoints_folder=${MODEL_FOLDER}seed${seed}/ 40 | 41 | # Evaluate all required checkpoints with EVAL_METRIC 42 | if [ ! 
-f "$checkpoints_folder/epoch_tests/.done" ];then 43 | 44 | mkdir -p "$checkpoints_folder/epoch_tests/" 45 | 46 | # Note this removes models and links best models on the fly 47 | while [ "$(python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove)" != "" ];do 48 | 49 | # get existing checkpoints 50 | ready_checkpoints=$(python run/status.py -c $config --seed $seed --list-checkpoints-ready-to-eval) 51 | 52 | # if there are no checkpoints at this moment, wait and restart loop 53 | if [ "$ready_checkpoints" == "" ];then 54 | printf "\r$$ is waiting for checkpoints of ${config}:$seed" 55 | sleep 1m 56 | continue 57 | fi 58 | echo "" 59 | 60 | # run test for these checkpoints 61 | for checkpoint in $ready_checkpoints;do 62 | results_prefix=$checkpoints_folder/epoch_tests/dec-$(basename $checkpoint .pt) 63 | bash run/test.sh $checkpoint -o $results_prefix 64 | 65 | # clean this checkpoint. This can be helpful if we started a job 66 | # with lots of pending checkpoints to evaluate 67 | python run/status.py -c $config --seed $seed --link-best --remove 68 | done 69 | done 70 | touch $checkpoints_folder/epoch_tests/.done 71 | 72 | else 73 | 74 | printf "[\033[92m done \033[0m] $checkpoints_folder/epoch_tests/.done\n" 75 | 76 | fi 77 | 78 | # This should not be needed, but its a sanity check 79 | python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove 80 | 81 | # 3 checkpoint average 82 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt ]]; then 83 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt " 84 | exit 1 85 | fi 86 | 87 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt ]]; then 88 | python src/fairseq_ext/average_checkpoints.py \ 89 | --input \ 90 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 91 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 92 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 93 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt 94 | fi 95 | 96 | 97 | # 5 checkpoint average 98 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt ]]; then 99 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt " 100 | exit 1 101 | fi 102 | 103 | 104 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt ]]; then 105 | python src/fairseq_ext/average_checkpoints.py \ 106 | --input \ 107 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 108 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 109 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 110 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best4.pt \ 111 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt \ 112 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt 113 | fi 114 | 115 | # Final run 116 | [ ! 
-f "$checkpoints_folder/$DECODING_CHECKPOINT" ] \ 117 | && echo -e "Missing $checkpoints_folder/$DECODING_CHECKPOINT" \ 118 | && exit 1 119 | mkdir -p $checkpoints_folder/beam${BEAM_SIZE}/ 120 | bash run/test.sh $checkpoints_folder/$DECODING_CHECKPOINT -b $BEAM_SIZE 121 | -------------------------------------------------------------------------------- /run/run_model_eval_sliding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | # Quick exits 27 | # Data not extracted or aligned data not provided 28 | if [ ! -f "$AMR_TRAIN_FILE_WIKI" ] && [ ! -f "$ALIGNED_FOLDER/train.txt" ];then 29 | echo -e "\nNeeds $AMR_TRAIN_FILE_WIKI or $ALIGNED_FOLDER/train.txt\n" 30 | exit 1 31 | fi 32 | # linking cache not empty but folder does not exist 33 | if [ "$LINKER_CACHE_PATH" != "" ] && [ ! -d "$LINKER_CACHE_PATH" ];then 34 | echo -e "\nNeeds linking cache $LINKER_CACHE_PATH\n" 35 | exit 1 36 | fi 37 | 38 | # folder of the model seed 39 | checkpoints_folder=${MODEL_FOLDER}seed${seed}/ 40 | 41 | # Evaluate all required checkpoints with EVAL_METRIC 42 | if [ ! -f "$checkpoints_folder/epoch_tests/.done" ];then 43 | 44 | mkdir -p "$checkpoints_folder/epoch_tests/" 45 | 46 | # Note this removes models and links best models on the fly 47 | while [ "$(python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove)" != "" ];do 48 | 49 | # get existing checkpoints 50 | ready_checkpoints=$(python run/status.py -c $config --seed $seed --list-checkpoints-ready-to-eval) 51 | 52 | # if there are no checkpoints at this moment, wait and restart loop 53 | if [ "$ready_checkpoints" == "" ];then 54 | printf "\r$$ is waiting for checkpoints of ${config}:$seed" 55 | sleep 1m 56 | continue 57 | fi 58 | echo "" 59 | 60 | # run test for these checkpoints 61 | for checkpoint in $ready_checkpoints;do 62 | results_prefix=$checkpoints_folder/epoch_tests/dec-$(basename $checkpoint .pt) 63 | bash run/test_sliding.sh $checkpoint -o $results_prefix 64 | 65 | # clean this checkpoint. This can be helpful if we started a job 66 | # with lots of pending checkpoints to evaluate 67 | python run/status.py -c $config --seed $seed --link-best --remove 68 | done 69 | done 70 | touch $checkpoints_folder/epoch_tests/.done 71 | 72 | else 73 | 74 | printf "[\033[92m done \033[0m] $checkpoints_folder/epoch_tests/.done\n" 75 | 76 | fi 77 | 78 | # This should not be needed, but its a sanity check 79 | python run/status.py -c $config --seed $seed --list-checkpoints-to-eval --link-best --remove 80 | 81 | # 3 checkpoint average 82 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt ]]; then 83 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt " 84 | exit 1 85 | fi 86 | 87 | if [[ ! 
-f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt ]]; then 88 | python src/fairseq_ext/average_checkpoints.py \ 89 | --input \ 90 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 91 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 92 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 93 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top3-avg.pt 94 | fi 95 | 96 | 97 | # 5 checkpoint average 98 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt ]]; then 99 | echo "Evaluation/Ranking failed, missing $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt " 100 | exit 1 101 | fi 102 | 103 | 104 | if [[ ! -f $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt ]]; then 105 | python src/fairseq_ext/average_checkpoints.py \ 106 | --input \ 107 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best1.pt \ 108 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best2.pt \ 109 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best3.pt \ 110 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best4.pt \ 111 | $checkpoints_folder/checkpoint_${EVAL_METRIC}_best5.pt \ 112 | --output $checkpoints_folder/checkpoint_${EVAL_METRIC}_top5-avg.pt 113 | fi 114 | 115 | # Final run 116 | [ ! -f "$checkpoints_folder/$DECODING_CHECKPOINT" ] \ 117 | && echo -e "Missing $checkpoints_folder/$DECODING_CHECKPOINT" \ 118 | && exit 1 119 | mkdir -p $checkpoints_folder/beam${BEAM_SIZE}/ 120 | bash run/test_sliding.sh $checkpoints_folder/$DECODING_CHECKPOINT -b $BEAM_SIZE 121 | -------------------------------------------------------------------------------- /run/status.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | python run/status.py $@ 7 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Plot AMRs 2 | 3 | To plot in LaTex using tikz, you can use 4 | 5 | ``` 6 | python scripts/plot_amr.py --in-amr DATA/wiki25.jkaln --out-amr tmp.tex 7 | ``` 8 | 9 | Use `--indices` to select AMRs by the order they appear in the file. See 10 | `--has-*` flags to select by graph properties 11 | 12 | To plot using matplotlib (for e.g. notebooks) you can use `AMR.plot()` in the 13 | AMR class 14 | 15 | ## JAMR to ISI notaion 16 | 17 | To convert an AMR file aligned using JAMR (+Kevin) aligner into ISI alignments format. 18 | 19 | ``` 20 | python scripts/jamr2isi.py --in-amr --out-amr 21 | ``` 22 | 23 | ## Understanding the Oracle 24 | 25 | An oracle is a module that given a sentence and its AMR annotation (aligned, 26 | right now) provides a sequence of actions, that played on a state machine 27 | produce back the AMR. 
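Before looking at the real thing, a toy sketch may help fix the contract: the oracle turns a gold graph into a flat action sequence, and replaying those actions on the state machine must rebuild the same graph. The two actions below are invented purely for illustration and are not Oracle10's actual action set:

```
# toy oracle / state-machine round trip (invented actions, NOT Oracle10)
def toy_oracle(nodes, edges):
    actions = [f"NODE {nid} {concept}" for nid, concept in nodes.items()]
    actions += [f"EDGE {src} {label} {tgt}" for src, label, tgt in edges]
    return actions

def toy_machine(actions):
    nodes, edges = {}, []
    for action in actions:
        kind, *rest = action.split()
        if kind == "NODE":
            nodes[rest[0]] = rest[1]
        else:
            edges.append((rest[0], rest[1], rest[2]))
    return nodes, edges

gold_nodes = {"w": "want-01", "b": "boy"}
gold_edges = [("w", ":ARG0", "b")]
assert toy_machine(toy_oracle(gold_nodes, gold_edges)) == (gold_nodes, gold_edges)
```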
Current AMR oracle aka Oracle10 can be explored in 28 | isolation running 29 | 30 | ``` 31 | bash tests/oracles/amr_o10.sh DATA/wiki25.jkaln 32 | ``` 33 | 34 | ## Sanity check AMR 35 | 36 | You can check any AMR against any propbank frames and their rules 37 | 38 | Extract all frames in separate `xml` format file into one single `json` format 39 | ``` 40 | python scripts/read_propbank.py /path/to/amr_2.0/data/frames/xml/ DATA/probank_amr2.0.json 41 | ``` 42 | 43 | Run sanity check, for example 44 | ``` 45 | python scripts/sanity_check_amr.py /path/to/amr2.0/train.txt DATA/probank_amr2.0.json 46 | 47 | 36522 sentences 152897 predicates 48 | 401 role not in propbank 49 | 322 predicate not in propbank 50 | 25 missing required role 51 | ``` 52 | 53 | ## Paired Boostrap Significance Test for Smatch 54 | 55 | The following script implements the paired boostrap significance test after 56 | 57 | @Book{Nor89, 58 | author = {E. W. Noreen}, 59 | title = {Computer-Intensive Methods for Testing Hypotheses}, 60 | publisher = {John Wiley Sons}, 61 | year = {1989}, 62 | } 63 | 64 | . See also [this paper](https://aclanthology.org/W05-0908). To use you can call 65 | 66 | ```bash 67 | python scripts/smatch_aligner.py \ 68 | --in-reference-amr /path/to/gold.amr \ 69 | --in-amrs \ 70 | /path/to/predicted1.amr \ 71 | /path/to/predicted2.amr \ 72 | ... 73 | /path/to/predictedN.amr \ 74 | --amr-labels \ 75 | label1 \ 76 | label2 \ 77 | ... 78 | labelN \ 79 | --bootstrap-test 80 | ``` 81 | 82 | for each pair of predicted amr files, it tests the hypothesis that the 83 | prediction with largest Smatch is significantly greater than the smaller one. 84 | Use `--bootstrap-test-restarts` to set the number of samples (default `10,000`, 85 | note this has little effect on speed). Use `--out-boostrap-png 86 | /path/to/file.png` to save the distribution of score differences for each pair. 87 | Script calls the original `smatch` python module. In order to export components 88 | it needs the main branch after version `1.0.4`. 89 | 90 | ## Maximum Bayesian Smatch Ensemble (MBSE) 91 | 92 | The following script implements MBSE-A as in [(Lee et al 2022)](https://arxiv.org/abs/2112.07790), just do 93 | 94 | ``` 95 | python scripts/mbse.py \ 96 | --in-amrs \ 97 | /path/to/predicted1.amr \ 98 | /path/to/predicted2.amr \ 99 | ... 
100 | /path/to/predictedN.amr \ 101 | --out-amr /path/to/ensemble.amr 102 | ``` 103 | -------------------------------------------------------------------------------- /scripts/add_wiki.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | famr, fwiki, wiki_folder = sys.argv[1:] 8 | 9 | famr = open(famr) 10 | fwiki = open(fwiki) 11 | ftrn = open(f"{wiki_folder}/trn.wikis") 12 | wiki_ht = {} 13 | for line in ftrn: 14 | if len(line.strip().split('\t')) != 2: 15 | continue 16 | (n,w) = line.strip().split('\t') 17 | wiki_ht[n] = w 18 | 19 | # FIXME: Temporary fix for AMR3.0 20 | amr3_file = f"{wiki_folder}/amr3trn.wikis" 21 | if os.path.isfile(amr3_file): 22 | ftrn3 = open(amr3_file) 23 | for line in ftrn3: 24 | if len(line.strip().split('\t')) != 2: 25 | continue 26 | (n,w) = line.strip().split('\t') 27 | if n[-1:]==' ': 28 | n = n[:-1] 29 | if n not in wiki_ht: 30 | wiki_ht[n] = w 31 | 32 | all_wikis = [] 33 | wikis = [] 34 | for line in fwiki: 35 | if line.strip() == "": 36 | all_wikis.append(wikis) 37 | wikis = [] 38 | else: 39 | wikis.append(line.strip().split()) 40 | 41 | lc = 0 42 | while True: 43 | line = famr.readline() 44 | if not line: 45 | break 46 | line = line.rstrip() 47 | if line.strip()=="" : 48 | lc += 1 49 | if ":name" in line: 50 | 51 | #get name 52 | namelines = [] 53 | nextline = famr.readline() 54 | namelines.append(nextline.rstrip()) 55 | tok = "" 56 | while "op" in nextline: 57 | tok += nextline[nextline.find("\"")+1:nextline.rfind("\"")] + " " 58 | if ")" in nextline: 59 | break 60 | nextline = famr.readline() 61 | namelines.append(nextline.rstrip()) 62 | tok = tok.strip() 63 | 64 | #get wiki of the name 65 | #print tok 66 | if tok in wiki_ht: 67 | wiki = wiki_ht[tok] 68 | line = line.replace(":name",":wiki " + wiki + "\t:name") 69 | else: 70 | if tok != "": 71 | for i in range(len(all_wikis[lc])): 72 | if tok.split()[0] in all_wikis[lc][i][0] :# or (all_wikis[lc][i][1] != '-' and all_wikis[lc][i][1] == tok) or tok in all_wikis[lc][i][0] : 73 | wiki = all_wikis[lc][i][1] 74 | if wiki != '-': 75 | wiki = "\""+wiki+"\"" 76 | line = line.replace(":name",":wiki " + wiki + "\t:name") 77 | break 78 | 79 | print(line) 80 | print("\n".join(namelines)) 81 | else: 82 | print(line) 83 | -------------------------------------------------------------------------------- /scripts/amr_latex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import collections 3 | import sys 4 | import re 5 | import string 6 | import os 7 | 8 | def replace_symbols(line): 9 | 10 | line = line.replace("\\","") 11 | line = line.replace("$","\$") 12 | line = line.replace("#","*") 13 | line = line.replace("&","and") 14 | line = line.replace("%","\%") 15 | line = line.replace("_","-") 16 | line = line.replace("^","hat") 17 | line = line.replace("(","LBR") 18 | line = line.replace(")","RBR") 19 | line = line.replace("{","LBR") 20 | line = line.replace("}","RBR") 21 | 22 | return line 23 | 24 | def get_node_depth(amr): 25 | 26 | node_TO_edges = collections.defaultdict(list) 27 | for e in amr.edges: 28 | s, y, t = e 29 | node_TO_edges[s].append(e) 30 | 31 | new_edges = [] 32 | 33 | seen = set() 34 | seen.add(amr.root) 35 | 36 | node_TO_lvl = {} 37 | node_TO_lvl[amr.root] = 0 38 | 39 | def helper(root, prefix='0'): 40 | if root not in node_TO_edges: 41 | return 42 | 43 | for i, e in enumerate(node_TO_edges[root]): 44 | s, y, t = e 45 | assert s == 
root 46 | if t in seen: 47 | continue 48 | seen.add(t) 49 | new_prefix = '{}.{}'.format(prefix, i) 50 | node_TO_lvl[t] = new_prefix.count('.') 51 | 52 | helper(t, prefix=new_prefix) 53 | 54 | helper(amr.root) 55 | 56 | return node_TO_lvl 57 | 58 | def get_tikz_latex(amr, tokens, nodes, edges, alignments): 59 | 60 | for i in range(len(tokens)): 61 | tokens[i] = replace_symbols(tokens[i]) 62 | for node in nodes: 63 | nodes[node] = replace_symbols(nodes[node]) 64 | 65 | latex_str = "" 66 | 67 | latex_str += "\\begin{center}\n\\begin{tikzpicture}[scale=1.5]\n" 68 | for i in range(0,len(tokens)): 69 | word = tokens[i] 70 | latex_str += "\\draw(" + str(float(i)*0.8) + ",0) node {" + word[0:10] + "};\n" 71 | 72 | children = {} 73 | for node in nodes: 74 | children[node] = [] 75 | for edge in edges: 76 | if edge[0] == node: 77 | children[node].append(edge[2]) 78 | 79 | node_keys = nodes.keys() 80 | node_TO_lvl = get_node_depth(amr) 81 | levels = {} 82 | for node in nodes: 83 | lvl = node_TO_lvl[node] 84 | if lvl not in levels: 85 | levels[lvl] = [] 86 | levels[lvl].append(node) 87 | max_lvl = max(levels.keys()) 88 | 89 | node_names = {} 90 | for lvl in levels: 91 | y = 0.5 + (max_lvl - lvl) * 1.5 92 | for node in levels[lvl]: 93 | x=-0.8 94 | if node in alignments: 95 | x = float(alignments[node])*0.8 96 | node_names[node] = node.replace(".","_").replace("#", "X") 97 | latex_str += "\\node [draw,rounded corners] (" + str(node_names[node]) + ") at (" + str(x) + "," + str(y) + ") {" + nodes[node] + "};\n" 98 | ''' 99 | plotted = [] 100 | previous_plotted = [] 101 | level = 0 102 | while len(plotted) != len(nodes): 103 | for node in nodes: 104 | if node not in plotted and (len(children[node]) == 0 or all(child in previous_plotted for child in children[node])): 105 | #plot this nodes 106 | x=-0.8 107 | if node in alignments: 108 | x = float(alignments[node])*0.8 109 | y = 0.5 + level * 1.5 110 | node_names[node] = node.replace(".","_") 111 | latex_str += "\\node [draw,rounded corners] (" + str(node_names[node]) + ") at (" + str(x) + "," + str(y) + ") {" + nodes[node] + "};\n" 112 | plotted.append(node) 113 | level += 1 114 | previous_plotted = plotted 115 | ''' 116 | 117 | for edge in edges: 118 | latex_str += "\\draw [-latex,thick] (" + node_names[edge[0]] + ") -- node {\\footnotesize " + replace_symbols(edge[1]) + "} (" + node_names[edge[2]] + ");\n" 119 | 120 | latex_str += "\\end{tikzpicture}\n\\end{center}\n" 121 | 122 | return latex_str 123 | -------------------------------------------------------------------------------- /scripts/doc-amr/pack_amrs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from amr_io import read_amr 7 | from ipdb import set_trace 8 | from docamr_utils import get_sen_ends 9 | 10 | def connect_sen_amrs(amr): 11 | 12 | if len(amr.roots) <= 1: 13 | return 14 | 15 | node_id = amr.add_node("document") 16 | amr.root = str(node_id) 17 | for (i,root) in enumerate(amr.roots): 18 | amr.edges.append((amr.root, ":snt"+str(i+1), root)) 19 | 20 | 21 | def make_packed_amrs(amrs, max_tok=400, randomize=True): 22 | packed_amrs = [] 23 | 24 | keys = [k for k in amrs.keys()] 25 | 26 | indices = np.array(range(len(amrs))) 27 | if randomize: 28 | indices = np.random.permutation(len(amrs)) 29 | 30 | amr = copy.deepcopy(amrs[keys[indices[0]]]) 31 | for idx in indices[1:]: 32 | next_amr = amrs[keys[idx]] 33 | if len(amr.tokens) + len(next_amr.tokens) <= max_tok: 34 | 
amr = amr + copy.deepcopy(next_amr) 35 | else: 36 | connect_sen_amrs(amr) 37 | get_sen_ends(amr) 38 | packed_amrs.append(amr) 39 | amr = copy.deepcopy(next_amr) 40 | 41 | connect_sen_amrs(amr) 42 | get_sen_ends(amr) 43 | packed_amrs.append(amr) 44 | 45 | return packed_amrs 46 | 47 | 48 | def main(args): 49 | 50 | assert args.out_amr 51 | assert args.in_amr 52 | 53 | amrs = read_amr(args.in_amr) 54 | 55 | with open(args.out_amr, 'w') as fid: 56 | packed = make_packed_amrs(amrs) 57 | for amr in packed: 58 | fid.write(amr.__str__()) 59 | 60 | def argument_parser(): 61 | 62 | parser = argparse.ArgumentParser(description='Read AMRs and Corefs and put them together', \ 63 | formatter_class=argparse.RawTextHelpFormatter) 64 | parser.add_argument( 65 | "--in-amr", 66 | help="path to AMR3 annoratations", 67 | type=str 68 | ) 69 | parser.add_argument( 70 | "--out-amr", 71 | help="Output file containing AMR in penman format", 72 | type=str, 73 | ) 74 | args = parser.parse_args() 75 | return args 76 | 77 | 78 | if __name__ == '__main__': 79 | main(argument_parser()) 80 | -------------------------------------------------------------------------------- /scripts/doc-amr/remove_amrs.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | regex = r"--avoid-indices ([\d\s]+)" 6 | 7 | 8 | def main(args): 9 | 10 | tqdm_amrs_str = read_blocks(args.in_amr) 11 | indices = re.findall(regex,args.arg_str) 12 | avoid_indices = indices[0].split() 13 | avoid_indices = [int(i) for i in avoid_indices] 14 | 15 | with open(args.out_amr, 'w') as fid: 16 | for idx, penman_str in enumerate(tqdm_amrs_str): 17 | if not idx in avoid_indices: 18 | fid.write(penman_str+'\n') 19 | 20 | 21 | 22 | 23 | if __name__ == '__main__': 24 | parser = ArgumentParser() 25 | parser.add_argument( 26 | "--in-amr", 27 | help="In file containing AMR in penman format", 28 | type=str 29 | ) 30 | parser.add_argument( 31 | "--arg-str", 32 | help="the arg string containing the indices needed to be removed", 33 | type=str 34 | ) 35 | 36 | parser.add_argument( 37 | "--out-amr", 38 | help="out amr after removal of avois indices", 39 | type=str, 40 | ) 41 | args = parser.parse_args() 42 | main(args) 43 | 44 | 45 | -------------------------------------------------------------------------------- /scripts/doc-amr/remove_sen.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | regex = r"--avoid-indices ([\d\s]+)" 6 | 7 | 8 | def main(args): 9 | 10 | lines = open(args.in_file).readlines() 11 | indices = re.findall(regex,args.arg_str) 12 | avoid_indices = indices[0].split() 13 | avoid_indices = [int(i) for i in avoid_indices] 14 | 15 | with open(args.out_file, 'w') as fid: 16 | for idx, line in enumerate(lines): 17 | if not idx in avoid_indices: 18 | fid.write(line) 19 | 20 | 21 | 22 | 23 | if __name__ == '__main__': 24 | parser = ArgumentParser() 25 | parser.add_argument( 26 | "--in-file", 27 | help="In file containing sen", 28 | type=str 29 | ) 30 | parser.add_argument( 31 | "--arg-str", 32 | help="the arg string containing the indices needed to be removed", 33 | type=str 34 | ) 35 | 36 | parser.add_argument( 37 | "--out-file", 38 | help="out file after removal of avoids indices", 39 | type=str, 40 | ) 41 | args = parser.parse_args() 42 | main(args) 43 | 44 | 45 | 
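A note on the two removal scripts above: `--arg-str` is not a list of indices itself; both `remove_amrs.py` and `remove_sen.py` regex-extract an embedded `--avoid-indices` list out of that single string argument. A minimal sketch of that parsing step, using the same pattern as the scripts (the index values are made-up examples):

```
import re

# same pattern as scripts/doc-amr/remove_amrs.py and remove_sen.py
regex = r"--avoid-indices ([\d\s]+)"

arg_str = "--avoid-indices 3 7 12"   # made-up example value for --arg-str
avoid_indices = [int(i) for i in re.findall(regex, arg_str)[0].split()]
print(avoid_indices)  # [3, 7, 12]
```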
-------------------------------------------------------------------------------- /scripts/export_alignment_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | 13 | # activate virtualenenv and set other variables 14 | . set_environment.sh 15 | 16 | set -o nounset 17 | 18 | # Load config 19 | echo "[Configuration file:]" 20 | echo $config 21 | . $config 22 | 23 | [ ! -f DATA/$TASK_TAG/aligned/ibm_neural_aligner/.done ] \ 24 | && printf "\nIs aligner training complete?\n" \ 25 | && exit 1 26 | 27 | zip -r ${TASK_TAG}_ibm_neural_aligner.zip \ 28 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/log/model.latest.pt \ 29 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/log/flags.json \ 30 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/vocab.text.txt \ 31 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/vocab.amr.txt \ 32 | DATA/$TASK_TAG/aligned/ibm_neural_aligner/.done.train 33 | 34 | echo "Created ${TASK_TAG}_ibm_neural_aligner.zip" 35 | -------------------------------------------------------------------------------- /scripts/export_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | # Argument handling 7 | HELP="\nbash $0 \n" 8 | # config file 9 | [ -z "$1" ] && echo -e "$HELP" && exit 1 10 | [ ! -f "$1" ] && "Missing $1" && exit 1 11 | config=$1 12 | # random seed 13 | [ -z "$2" ] && echo -e "$HELP" && exit 1 14 | seed=$2 15 | 16 | # activate virtualenenv and set other variables 17 | . set_environment.sh 18 | 19 | set -o nounset 20 | 21 | # Load config 22 | echo "[Configuration file:]" 23 | echo $config 24 | . $config 25 | 26 | model_folder=${MODEL_FOLDER}seed${seed}/ 27 | model_name=$config_name 28 | 29 | # needed files 30 | checkpoint=$model_folder/$DECODING_CHECKPOINT 31 | 32 | echo "$checkpoint" 33 | 34 | [ ! -f "$checkpoint" ] && echo "Is $config training complete?" && exit 1 35 | 36 | echo "This will remove optimizer from ${checkpoint}." 37 | read -p "Do you wish to continue? Y/[N]" answer 38 | [ "$answer" != "Y" ] && exit 1 39 | 40 | # remove optimizer from checkpoint 41 | python scripts/remove_optimizer_state.py $checkpoint $checkpoint 42 | # zip all 43 | if [ -f "$model_folder/actions.vocab.nodes" ];then 44 | zip -r ${model_name}-seed${seed}.zip \ 45 | $checkpoint \ 46 | $model_folder/config.sh \ 47 | $model_folder/dict.actions_nopos.txt \ 48 | $model_folder/actions.vocab.nodes \ 49 | $model_folder/actions.vocab.others \ 50 | $model_folder/dict.en.txt \ 51 | $model_folder/machine_config.json 52 | else 53 | zip -r ${model_name}-seed${seed}.zip \ 54 | $checkpoint \ 55 | $model_folder/config.sh \ 56 | $model_folder/dict.actions_nopos.txt \ 57 | $model_folder/dict.en.txt \ 58 | $model_folder/machine_config.json 59 | fi 60 | -------------------------------------------------------------------------------- /scripts/install_satori.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | 4 | # activate conda 5 | # FIXME: Replace this with your conda 6 | eval "$(/nobackup/users/ramast/miniconda3/bin/conda shell.bash hook)" 7 | # Create local env if missing 8 | [ ! 
-d cenv_ppc ] && conda create -y -p ./cenv_ppc 9 | echo "conda activate ./cenv_ppc" 10 | conda activate ./cenv_ppc 11 | 12 | # accept POWER AI license 13 | export IBM_POWERAI_LICENSE_ACCEPT=yes 14 | 15 | # this may not be needed 16 | export PYTHONPATH=. 17 | 18 | set -o nounset 19 | 20 | # install python version to be used 21 | conda install -y pytorch==1.4.0 -c pytorch -c powerai 22 | 23 | # fairseq 24 | [ ! -d fairseq ] && git clone https://github.com/pytorch/fairseq.git 25 | cd fairseq 26 | git checkout v0.10.2 27 | pip install --editable . 28 | cd .. 29 | 30 | # smatch 31 | [ ! -d smatch ] && git clone https://github.com/snowblink14/smatch.git 32 | cd smatch 33 | git checkout v1.0.4 34 | pip install . 35 | cd .. 36 | 37 | # repo instal proper 38 | pip install --editable . 39 | 40 | # TODO: Install pytorch scatter 41 | 42 | # Tried to use this, but gcc is not available to load 43 | 44 | # module load gcc 45 | # pip install torch-scatter --no-cache-dir 46 | 47 | # This is what I did IBM's CCC PPC machines. Bottom line we need a GCC higher 48 | # than the one available by default 49 | 50 | # # install pytorch scatter 51 | # rm -Rf pytorch_scatter.ppc 52 | # git clone https://github.com/rusty1s/pytorch_scatter.git pytorch_scatter.ppc 53 | # cd pytorch_scatter.ppc 54 | # git checkout 1.3.2 55 | # Ensure modern GCC 56 | #export GCC_DIR=/opt/share/gcc-5.4.0/ppc64le/ 57 | #export PATH=/opt/share/cuda-9.0/ppc64le/bin:$GCC_DIR/bin:$PATH 58 | #export LD_LIBRARY_PATH=$GCC_DIR/lib:$LD_LIBRARY_PATH 59 | #export LD_LIBRARY_PATH=$GCC_DIR/lib64:$LD_LIBRARY_PATH 60 | #python setup.py develop 61 | #cd .. 62 | 63 | # check all ok 64 | python tests/correctly_installed.py 65 | -------------------------------------------------------------------------------- /scripts/merge_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def merge_dir(dir, outfile): 5 | 6 | # collect amrs 7 | amrs = [] 8 | for filename in sorted(os.listdir(dir)): 9 | if not filename.startswith("amr"): 10 | continue 11 | with open(os.path.join(dir, filename), encoding='utf-8') as f: 12 | print(filename) 13 | for i,line in enumerate(f): 14 | if i in [0, 1]: 15 | continue 16 | if line.startswith('# ::align'): 17 | continue 18 | amrs.append(line) 19 | amrs.append('\n') 20 | 21 | # normalization 22 | amrs = ''.join(amrs) 23 | amrs = amrs.replace('\r','') 24 | amrs = amrs.replace('\n\n\n','\n\n') 25 | amrs = amrs.replace('\u0092',"'") 26 | amrs = amrs.replace('\u0085'," ") 27 | 28 | # write data 29 | with open(outfile,'w+', encoding='utf-8') as f: 30 | f.write(amrs) 31 | print(amrs.count('# ::snt')) 32 | 33 | if __name__ == '__main__': 34 | input_dir, output_dir = sys.argv[1:] 35 | os.makedirs(output_dir, exist_ok=True) 36 | merge_dir(f'{input_dir}/training/', f'{output_dir}/train.txt') 37 | merge_dir(f'{input_dir}/dev/', f'{output_dir}/dev.txt') 38 | merge_dir(f'{input_dir}/test/', f'{output_dir}/test.txt') 39 | -------------------------------------------------------------------------------- /scripts/parse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o pipefail 3 | set -o errexit 4 | # load local variables used below 5 | . 
set_environment.sh 6 | HELP="$0 " 7 | [ "$#" -lt 3 ] && echo "$HELP" && exit 1 8 | checkpoint=$1 9 | input_file=$2 10 | output_amr=$3 11 | set -o nounset 12 | 13 | amr-parse \ 14 | --in-checkpoint $checkpoint \ 15 | --in-tokenized-sentences $input_file \ 16 | --out-amr $output_amr \ 17 | --roberta-cache-path DATA/bart.large \ 18 | --batch-size 128 \ 19 | --roberta-batch-size 1 20 | -------------------------------------------------------------------------------- /scripts/plot_amr.py: -------------------------------------------------------------------------------- 1 | # https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.patches.FancyBboxPatch.html#matplotlib.patches.FancyBboxPatch 2 | # https://matplotlib.org/3.1.1/tutorials/text/annotations.html#placing-artist-at-the-anchored-location-of-the-axes 3 | # FIXME: Separate rendering and node position calculation 4 | # FIXME: Variable names messy 5 | from random import shuffle 6 | import argparse 7 | from transition_amr_parser.io import read_amr 8 | from transition_amr_parser.amr_latex import ( 9 | get_tikz_latex, 10 | save_graphs_to_tex, 11 | ) 12 | 13 | 14 | def argument_parser(): 15 | 16 | parser = argparse.ArgumentParser(description='AMR alignment plotter') 17 | # Single input parameters 18 | parser.add_argument( 19 | "--in-amr", 20 | help="AMR 2.0+ annotation file to be splitted", 21 | type=str, 22 | required=True 23 | ) 24 | parser.add_argument( 25 | "--shuffle", 26 | help="randomize input AMRs", 27 | action='store_true' 28 | ) 29 | parser.add_argument( 30 | "--jamr", 31 | help="Read from JAMR annotations", 32 | action='store_true' 33 | ) 34 | parser.add_argument( 35 | "--out-tex", 36 | help="output", 37 | type=str, 38 | required=True 39 | ) 40 | # latex / tikz variables 41 | parser.add_argument("--scale", type=float, default=1.0) 42 | parser.add_argument("--x-warp", type=float, default=1.0) 43 | parser.add_argument("--y-warp", type=float, default=1.0) 44 | # 45 | parser.add_argument( 46 | "--max-graphs", 47 | help="Will stop after plotting this amount", 48 | default=100, 49 | type=int, 50 | ) 51 | parser.add_argument( 52 | "--indices", nargs='+', 53 | help="Position on the AMR file of sentences to plot" 54 | ) 55 | parser.add_argument( 56 | "--has-nodes", nargs='+', 57 | help="filter for AMRs that have those nodes" 58 | ) 59 | parser.add_argument( 60 | "--has-repeated-nodes", 61 | help="filter for AMRs that have more than one node of same name", 62 | action='store_true' 63 | ) 64 | parser.add_argument( 65 | "--has-repeated-tokens", 66 | help="filter for AMRs that have more than one node of same name", 67 | action='store_true' 68 | ) 69 | parser.add_argument( 70 | "--has-edges", nargs='+', 71 | help="filter for AMRs that have those nodes" 72 | ) 73 | args = parser.parse_args() 74 | return args 75 | 76 | 77 | def fix_ner_alignments(amr): 78 | 79 | # fix alignments 80 | for (src, edge, trg) in amr.edges: 81 | if edge == ':name' and amr.nodes[trg] == 'name': 82 | ops = sorted(amr.children(trg), key=lambda x: [1])[::-1] 83 | if ( 84 | len(amr.alignments[trg]) > 1 85 | and len(amr.alignments[trg]) == len(ops) 86 | ): 87 | for idx, (nid, _) in enumerate(ops): 88 | amr.alignments[nid] = [amr.alignments[trg][idx]] 89 | 90 | return amr 91 | 92 | 93 | def skip_amr(amr, args): 94 | return ( 95 | args.has_nodes 96 | and not set(args.has_nodes) <= set(amr.nodes.values()) 97 | ) or ( 98 | args.has_edges 99 | and not set(args.has_edges) <= set([x[1][1:] for x in amr.edges]) 100 | ) or ( 101 | args.has_repeated_nodes 102 | and len(set(amr.nodes.values())) 
== len(amr.nodes.values()) 103 | ) or ( 104 | args.has_repeated_tokens 105 | and len(set(amr.tokens)) == len(amr.tokens) 106 | ) 107 | 108 | 109 | def main(args): 110 | 111 | # argument handling 112 | amrs = read_amr(args.in_amr, jamr=args.jamr) 113 | 114 | print(f'Read {args.in_amr}') 115 | num_amrs = len(amrs) 116 | if args.indices: 117 | indices = list(map(int, args.indices)) 118 | else: 119 | indices = list(range(num_amrs)) 120 | # write into file 121 | tex_file = args.out_tex 122 | if args.shuffle: 123 | shuffle(indices) 124 | 125 | # get one sample 126 | amr_strs = [] 127 | for index in indices: 128 | 129 | amr = amrs[index] 130 | 131 | # Fix NER 132 | amr = fix_ner_alignments(amr) 133 | 134 | # Remove ROOT 135 | if amr.tokens[-1] == '': 136 | amr.tokens = amr.tokens[:-1] 137 | 138 | if len(amr_strs) >= args.max_graphs: 139 | # too many graphs 140 | break 141 | 142 | # skip amr not meeting criteria 143 | if skip_amr(amr, args) or amr.edges == []: 144 | continue 145 | 146 | src, _, trg = amr.edges[0] 147 | 148 | # get latex string 149 | amr_str = get_tikz_latex( 150 | amr, 151 | # color_by_id={'a': 'red'}, 152 | # color_by_id_pair={(src, trg): 'red'}, 153 | scale=args.scale, 154 | x_warp=args.x_warp, 155 | y_warp=args.y_warp 156 | ) 157 | 158 | # plot 159 | amr_strs.append(amr_str) 160 | 161 | # open on the fly 162 | save_graphs_to_tex(tex_file, amr_str, plot_cmd='open') 163 | 164 | response = input('Quit [N/y]?') 165 | if response == 'y': 166 | break 167 | 168 | # write all graphs to a single tex 169 | print(f'Wrote {len(amr_strs)} amrs into {tex_file}') 170 | save_graphs_to_tex(tex_file, '\n'.join(amr_strs)) 171 | 172 | 173 | if __name__ == '__main__': 174 | # argument handling 175 | main(argument_parser()) 176 | -------------------------------------------------------------------------------- /scripts/plot_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | import re 4 | # pip install python-dateutil 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from transition_amr_parser.io import read_experiment 8 | # from ipdb import set_trace 9 | 10 | 11 | def get_vectors(items, label, admit_none=False): 12 | 13 | def x_key(item): 14 | return int(item['epoch']) 15 | 16 | def y_reduce(items): 17 | if admit_none: 18 | vy = [float(x[label]) for x in items if x[label] is not None] 19 | else: 20 | vy = [float(x[label]) for x in items] 21 | return np.mean(vy) 22 | 23 | def get_y_std(items): 24 | if admit_none: 25 | vy = [float(x[label]) for x in items if x[label] is not None] 26 | else: 27 | vy = [float(x[label]) for x in items] 28 | return np.std(vy) 29 | 30 | # Cluster x-axis 31 | x_clusters = defaultdict(list) 32 | for item in items: 33 | x_clusters[x_key(item)].append(item) 34 | # get xy vectors 35 | x = np.array(sorted(x_clusters.keys())) 36 | y = np.array([y_reduce(x_clusters[x_i]) for x_i in x]) 37 | y_std = np.array([get_y_std(x_clusters[x_i]) for x_i in x]) 38 | 39 | return x, y, y_std 40 | 41 | 42 | def get_score_from_log(file_path, score_name): 43 | 44 | smatch_results_re = re.compile(r'^F-score: ([0-9\.]+)') 45 | 46 | results = [None] 47 | 48 | if 'smatch' in score_name: 49 | regex = smatch_results_re 50 | else: 51 | raise Exception(f'Unknown score type {score_name}') 52 | 53 | with open(file_path) as fid: 54 | for line in fid: 55 | if regex.match(line): 56 | results = regex.match(line).groups() 57 | results = [100*float(x) for x in results] 58 | break 59 | 60 | return 
results 61 | 62 | 63 | def matplotlib_render(plotting_data, out_png, title): 64 | 65 | # plot in matplotlib 66 | plt.figure(figsize=(10, 10)) 67 | # axis with extra space for legend 68 | ax = plt.axes([0.1, 0.1, 0.8, 0.7]) 69 | # second axis for Smatch 70 | ax_smatch = ax.twinx() 71 | colors = ['b', 'r', 'g', 'm', 'y'] 72 | tags = sorted(plotting_data.keys()) 73 | handles = [] 74 | for i in range(len(tags)): 75 | 76 | color = colors[i % len(colors)] 77 | 78 | # train loss 79 | x, y, y_std = plotting_data[tags[i]]['train'] 80 | h = ax.plot(x, y, color)[0] 81 | # ax.fill_between(x, y - y_std, y + y_std, alpha=0.3) 82 | # h = ax.fill_between(x, y - y_std, y + y_std, color=color2, alpha=0.3) 83 | handles.append(h) 84 | 85 | # valid loss 86 | # x, y, _ = plotting_data[tags[i]]['valid'] 87 | # ax.plot(x, y, '--' + color) 88 | 89 | # dev decoding score 90 | x, y, y_std = plotting_data[tags[i]]['valid-dec'] 91 | ax_smatch.plot(x, y, color) 92 | ax_smatch.fill_between(x, y - y_std, y + y_std, alpha=0.3) 93 | ax_smatch.set(ylim=(80, 85)) 94 | 95 | ax.set_xlabel('epoch') 96 | ax.set_ylabel('loss') 97 | ax_smatch.set_ylabel('Smatch') 98 | 99 | plt.legend(handles, tags, bbox_to_anchor=(0, 1, 1, 0)) 100 | if title: 101 | plt.title(title) 102 | if out_png: 103 | print(f'wrote {out_png}') 104 | plt.savefig(out_png) 105 | else: 106 | plt.show() 107 | 108 | 109 | def main(args): 110 | 111 | data = [] 112 | for config in args.in_configs: 113 | data.extend(read_experiment(config)) 114 | 115 | # Cluster by experiment 116 | experiments = defaultdict(list) 117 | for item in data: 118 | experiments[item['experiment_key']].append(item) 119 | 120 | # For each experiment collect separate data for train, valid and score 121 | # aggregate stats for multiple seeds and produce vectors for later 122 | # plotting 123 | plotting_data = defaultdict(dict) 124 | for exp_tag, exp_data in experiments.items(): 125 | etime = np.median([ 126 | x['epoch_time'] for x in exp_data if x['epoch_time']]) / (60**2) 127 | print(f'Collecting data for {exp_tag} ({etime:.2f} h/epoch)') 128 | for sset in ['train', 'valid']: 129 | valid_data = [x for x in exp_data if x['set'] == sset] 130 | plotting_data[exp_tag][sset] = \ 131 | get_vectors(valid_data, f'{sset}_loss') 132 | sset = 'valid-dec' 133 | score_data = [x for x in exp_data if x['set'] == sset] 134 | plotting_data[exp_tag][sset] = \ 135 | get_vectors(score_data, 'score', admit_none=True) 136 | 137 | # Render picture in matplotlib 138 | matplotlib_render(plotting_data, args.out_png, args.title) 139 | 140 | 141 | def argument_parser(): 142 | 143 | parser = argparse.ArgumentParser(description='AMR results plotter') 144 | # Single input parameters 145 | parser.add_argument( 146 | 'in_configs', 147 | nargs='+', 148 | help="One or more config fils", 149 | type=str, 150 | ) 151 | parser.add_argument( 152 | '--title', 153 | help="Title of plot" 154 | ) 155 | 156 | parser.add_argument( 157 | '-o', '--out-png', 158 | help="Save into a file instead of plotting" 159 | ) 160 | args = parser.parse_args() 161 | return args 162 | 163 | 164 | if __name__ == '__main__': 165 | main(argument_parser()) 166 | -------------------------------------------------------------------------------- /scripts/read_propbank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import json 4 | from tqdm import tqdm 5 | from transition_amr_parser.io import read_frame 6 | 7 | if __name__ == '__main__': 8 | 9 | # Argument handling 10 | in_propank_folder, 
out_json = sys.argv[1:] 11 | 12 | # Read propbank into dict 13 | propbank = {} 14 | num_files = 0 15 | for xml_file in tqdm(glob.glob(f'{in_propank_folder}/*.xml')): 16 | propbank.update(read_frame(xml_file)) 17 | num_files += 1 18 | if not num_files: 19 | print('No XML files found!') 20 | exit(1) 21 | 22 | num_preds = len(propbank) 23 | num_examples = sum([len(x['examples']) for x in propbank.values()]) 24 | print(f'{num_files} files {num_preds} predicates {num_examples} examples read') 25 | 26 | # Write it into json 27 | with open(out_json, 'w') as fid: 28 | fid.write(json.dumps(propbank)) 29 | -------------------------------------------------------------------------------- /scripts/remove_optimizer_state.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from fairseq_ext.utils import remove_optimizer_state 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | if len(sys.argv[1:]) == 1: 8 | checkpoint_path = sys.argv[1] 9 | out_checkpoint_path = checkpoint_path 10 | elif len(sys.argv[1:]) == 2: 11 | checkpoint_path, out_checkpoint_path = sys.argv[1:] 12 | 13 | remove_optimizer_state(checkpoint_path, out_checkpoint_path) 14 | -------------------------------------------------------------------------------- /scripts/remove_wiki.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | 7 | # argument handling 8 | amr_file, new_amr_file = sys.argv[1:] 9 | 10 | with open(amr_file, encoding='utf-8') as fid: 11 | amrs = fid.read() 12 | amrs = re.sub(':wiki ".+?"( )?','', amrs) 13 | amrs = re.sub(':wiki -( )?','', amrs) 14 | l = amrs.count('# ::snt') 15 | with open(new_amr_file, 'w+', encoding='utf-8') as f: 16 | f.write(amrs) 17 | print(new_amr_file) 18 | print(l) 19 | -------------------------------------------------------------------------------- /scripts/sanity_check_amr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import json 4 | from transition_amr_parser.io import read_amr 5 | from collections import defaultdict 6 | from tqdm import tqdm 7 | 8 | 9 | def get_propbank_name(amr_pred): 10 | items = amr_pred.split('-') 11 | prop_pred = '-'.join(items[:-1]) + '.' 
+ items[-1] 12 | if prop_pred.endswith('.91') or prop_pred in ['have-half-life.01']: 13 | pass 14 | else: 15 | prop_pred = prop_pred.replace('-', '_') 16 | return prop_pred 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | # Argument handling 22 | in_amr, in_propbank_json = sys.argv[1:] 23 | 24 | amrs = read_amr(in_amr) 25 | with open(in_propbank_json) as fid: 26 | propbank = json.loads(fid.read()) 27 | 28 | pred_regex = re.compile('.+-[0-9]+$') 29 | 30 | amr_alerts = defaultdict(list) 31 | sid = 0 32 | num_preds = 0 33 | for amr in tqdm(amrs): 34 | predicate_ids = [ 35 | k for k, v in amr.nodes.items() if pred_regex.match(v) 36 | ] 37 | num_preds += len(predicate_ids) 38 | for pred_id in predicate_ids: 39 | pred = get_propbank_name(amr.nodes[pred_id]) 40 | if pred not in propbank: 41 | amr_alerts['predicate not in propbank'].append( 42 | (sid, pred_id, pred) 43 | ) 44 | else: 45 | probank_roles = propbank[pred]['roles'] 46 | # TODO: Identify obligatory args 47 | required_roles = set() 48 | required_location = set() 49 | for k, v in probank_roles.items(): 50 | if '(must be specified)' in v['descr']: 51 | required_roles |= set([k]) 52 | elif 'must' in v['descr']: 53 | # FIXME: not used right now 54 | required_location = set([k]) 55 | 56 | # Get roles 57 | roles = [ 58 | trip[1][1:].replace('-of', '') 59 | for trip in amr.edges 60 | if trip[0] == pred_id and trip[1].startswith(':ARG') 61 | ] 62 | # Check no required missing 63 | missing_roles = required_roles - set(roles) 64 | if missing_roles: 65 | amr_alerts['missing required role'].append( 66 | (sid, pred_id, pred, " ".join(list(missing_roles))) 67 | ) 68 | # Check no forbiden used 69 | forbidden_roles = set(roles) - set(probank_roles.keys()) 70 | if forbidden_roles: 71 | amr_alerts['role not in propbank'].append( 72 | (sid, pred_id, pred, " ".join(list(forbidden_roles))) 73 | ) 74 | sid += 1 75 | 76 | print(f'{sid+1} sentences {num_preds} predicates') 77 | for name, alerts in amr_alerts.items(): 78 | if alerts: 79 | print(f'{len(alerts)} {name}') 80 | -------------------------------------------------------------------------------- /scripts/split_amrs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | import sys 4 | import os 5 | import penman 6 | from transition_amr_parser.io import read_blocks 7 | from ipdb import set_trace 8 | 9 | 10 | def main(): 11 | 12 | in_amr, max_split_size, output_basename = sys.argv[1:] 13 | dirname = os.path.dirname(output_basename) 14 | os.makedirs(dirname, exist_ok=True) 15 | 16 | amrs = read_blocks(in_amr, return_tqdm=False) 17 | max_split_size = int(max_split_size) 18 | 19 | num_amrs = len(amrs) 20 | indices = list(range(num_amrs)) 21 | chunk_indices = [ 22 | indices[i:i + max_split_size] 23 | for i in range(0, num_amrs, max_split_size) 24 | ] 25 | 26 | for chunk_n, indices in enumerate(tqdm(chunk_indices)): 27 | split_file = f'{output_basename}.{chunk_n}' 28 | with open(split_file, 'w') as fid: 29 | for i in indices: 30 | fid.write(f'{amrs[i]}\n') 31 | print(split_file) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /scripts/tokenize_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transition_amr_parser.amr import protected_tokenizer 3 | 4 | 5 | def parse_arguments(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--in-amr", type=str, help="AMR 
file to be tokenized", 8 | required=True) 9 | parser.add_argument("--simple", help="Use bare minimum tokenization", 10 | action='store_true') 11 | return parser.parse_args() 12 | 13 | 14 | def main(args): 15 | 16 | # read 17 | raw_amr = [] 18 | with open(args.in_amr) as fid: 19 | for line in fid: 20 | raw_amr.append(line.rstrip()) 21 | 22 | # append tok line, ignoring previously existing ones 23 | existing_tokenization = False 24 | out_raw_amr = [] 25 | for line in raw_amr: 26 | if line.strip().startswith('# ::snt'): 27 | out_raw_amr.append(line) 28 | # get tokens and also append 29 | sentence = line.split('# ::snt')[-1].strip() 30 | tokens, _ = protected_tokenizer(sentence, args.simple) 31 | tokens_str = ' '.join(tokens) 32 | out_raw_amr.append(f'# ::tok {tokens_str}') 33 | elif line.strip().startswith('# ::tok'): 34 | # ignore existing tokens 35 | existing_tokenization = True 36 | else: 37 | out_raw_amr.append(line) 38 | 39 | if existing_tokenization: 40 | print( 41 | f'\033[93mWARNING\033[0m:' 42 | f' Overwrote existing tokenization in {args.in_amr}' 43 | ) 44 | 45 | # write 46 | with open(args.in_amr, 'w') as fid: 47 | for line in out_raw_amr: 48 | fid.write(f'{line}\n') 49 | 50 | 51 | if __name__ == '__main__': 52 | main(parse_arguments()) 53 | -------------------------------------------------------------------------------- /scripts/vimdiff_amr_files.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import subprocess 3 | 4 | 5 | def get_one_amr(fid): 6 | amr = [] 7 | line = fid.readline() 8 | while line.strip(): 9 | amr.append(line) 10 | line = fid.readline() 11 | return amr 12 | 13 | 14 | def write(file_name, content): 15 | with open(file_name, 'w') as fid: 16 | fid.write(content) 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | amr1_file, amr2_file = sys.argv[1:] 22 | 23 | different_amrs = [] 24 | num_amrs = 0 25 | with open(amr1_file) as fid1, open(amr2_file) as fid2: 26 | while True: 27 | amr1 = get_one_amr(fid1) 28 | amr2 = get_one_amr(fid2) 29 | penman1 = ''.join([x for x in amr1 if x[0] != '#']) 30 | penman2 = ''.join([x for x in amr2 if x[0] != '#']) 31 | if penman1 != penman2: 32 | different_amrs.append((num_amrs, penman1, penman2)) 33 | num_amrs += 1 34 | print(f'\r{num_amrs}', end='') 35 | if amr1 == [] and amr2 == []: 36 | break 37 | 38 | print(f'\n{len(different_amrs)}/{num_amrs} different AMRs') 39 | 40 | for n, p1, p2 in different_amrs: 41 | input(f'\nPress any key to compare sentence {n}') 42 | write('tmp1', p1) 43 | write('tmp2', p2) 44 | subprocess.call(['vimdiff', 'tmp1', 'tmp2']) 45 | -------------------------------------------------------------------------------- /service/amr.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | import "wordvec.proto"; 3 | service AMRServer { 4 | rpc process(AMRInput) returns(AMRResponse) {} 5 | }; 6 | /** 7 | * This contains information about a sentence, used as input by the parser 8 | */ 9 | message AMRInput{ 10 | message WordInfo { 11 | string token=1; 12 | string lemma=2; 13 | } 14 | repeated WordInfo word_infos=1; 15 | WordVectors word_vectors=2; 16 | bool doc_mode=3; 17 | }; 18 | /** 19 | * The parser produces a single string with the amr parse of the sentence. 
20 | */ 21 | message AMRResponse { 22 | string amr_parse=1; 23 | } -------------------------------------------------------------------------------- /service/amr2.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | service AMRBatchServer { 3 | rpc process(AMRBatchInput) returns(AMRBatchResponse) {} 4 | }; 5 | /** 6 | * This contains information about a batch of sentences, used as input by the parser 7 | */ 8 | message AMRBatchInput{ 9 | message Sentence { 10 | repeated string tokens=1; 11 | } 12 | repeated Sentence sentences=1; 13 | bool doc_mode=2; 14 | }; 15 | /** 16 | * The parser produces a list of strings with the amr parse of the sentences. 17 | */ 18 | message AMRBatchResponse { 19 | repeated string amr_parse=1; 20 | } 21 | -------------------------------------------------------------------------------- /service/amr_client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import grpc 4 | import torch 5 | import json 6 | import amr_pb2 7 | import amr_pb2_grpc 8 | import argparse 9 | 10 | def argument_parser(): 11 | parser = argparse.ArgumentParser(description='AMR parser') 12 | parser.add_argument( 13 | "--port", 14 | help="GRPC port", 15 | type=str 16 | ) 17 | args = parser.parse_args() 18 | 19 | # Sanity checks 20 | assert args.port 21 | 22 | return args 23 | 24 | def get_input_from_sentence(sentence,mode): 25 | tokens = sentence.split() 26 | input_tokens = [] 27 | for token in tokens: 28 | input_tokens.append(amr_pb2.AMRInput.WordInfo(token=token)) 29 | 30 | if mode.lower()=='doc' or mode.lower()=='document': 31 | doc_mode = True 32 | else: 33 | doc_mode = False 34 | return amr_pb2.AMRInput(word_infos=input_tokens,doc_mode=doc_mode) 35 | 36 | def run(): 37 | # NOTE(gRPC Python Team): .close() is possible on a channel and should be 38 | # used in circumstances in which the with statement does not fit the needs 39 | # of the code. 40 | # Argument handling 41 | args = argument_parser() 42 | channel = grpc.insecure_channel('localhost:' + args.port) 43 | stub = amr_pb2_grpc.AMRServerStub(channel) 44 | sentence = input("Enter the sentence: ") 45 | mode = input("Enter the mode: ") 46 | amr_input = get_input_from_sentence(sentence,mode) 47 | response = stub.process(amr_input) 48 | print("AMR parse received: \n" + response.amr_parse) 49 | 50 | if __name__ == '__main__': 51 | logging.basicConfig() 52 | run() -------------------------------------------------------------------------------- /service/wordvec.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | service WordVectorsServer { 3 | rpc vectorize(VectorizeRequest) returns(WordVectors) {} 4 | } 5 | enum WordVectorType { 6 | BERT_LARGE_EP5=0; 7 | } 8 | /** 9 | * Tokenized sentence 10 | */ 11 | message VectorizeRequest { 12 | WordVectorType type=1; 13 | repeated string tokens=2; 14 | } 15 | /** 16 | * Concatenated vector for the entire sentence. 17 | * 'size' must match the number of tokens. 
18 | */ 19 | message WordVectors { 20 | WordVectorType type=1; 21 | int32 size=3; 22 | int32 dimension=2; 23 | // Concatenated full vectors 24 | repeated float data=4; 25 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | VERSION = '0.5.4' 5 | 6 | install_requires = [ 7 | "torch==1.13.1", 8 | 'numpy<=1.23.5', 9 | 'ipython<=8.12', # python 3.8 vs ipython 8.13 incompatibility 10 | 'tqdm>=4.55.0', 11 | 'packaging>=20.8', 12 | 'requests>=2.25.1', 13 | # for data (ELMO embeddings) 14 | 'h5py>=3.0.0', 15 | 'python-dateutil>=2.8.1', 16 | # for scoringy 17 | 'penman>=1.1.0', 18 | # needs tools to be importable > 1.0.4. As of now, no official release 19 | 'smatch', 20 | # for debugging 21 | 'ipdb', 22 | 'line_profiler>=4.0.2', 23 | 'pyinstrument>=4.4.0', 24 | # for aws download 25 | 'boto3>=1.26.1', 26 | 'progressbar', 27 | ] 28 | 29 | # platform dependent fairseq version 30 | if sys.platform == 'darwin': 31 | install_requires.append("fairseq==0.10.0") 32 | else: 33 | install_requires.append("fairseq==0.10.2") 34 | 35 | if __name__ == '__main__': 36 | setup( 37 | name='transition_amr_parser', 38 | version=VERSION, 39 | description="Trasition-based neural parser", 40 | package_dir={"": "src"}, 41 | # packages=['fairseq_ext', 'transition_amr_parser'], 42 | # packages=['neural_parser'], 43 | packages=find_packages("src", exclude=('cenv_*', 'configs', 'tests', 'DATA','dist','docker','run','scripts','service','*egg-info')), 44 | package_data={'': ['*.txt', '*.md', '*.opt', '*.cu', '*.cpp']}, 45 | entry_points={ 46 | 'console_scripts': [ 47 | 'amr-parse = transition_amr_parser.parse:main', 48 | 'amr-machine = transition_amr_parser.amr_machine:main', 49 | ] 50 | }, 51 | py_modules=['fairseq_ext', 'transition_amr_parser',"ibm_neural_aligner"], 52 | install_requires=install_requires, 53 | classifiers=[ 54 | "Programming Language :: Python :: 3.8", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 57 | "Natural Language :: English", 58 | ], 59 | ) 60 | 61 | -------------------------------------------------------------------------------- /src/fairseq_ext/__init__.py: -------------------------------------------------------------------------------- 1 | # to register all the user defined modules to fairseq 2 | import fairseq_ext.criterions # noqa 3 | import fairseq_ext.models # noqa 4 | import fairseq_ext.tasks # noqa 5 | -------------------------------------------------------------------------------- /src/fairseq_ext/amr_reform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/amr_reform/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/amr_spec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/amr_spec/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/binarize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from 
fairseq.data.indexed_dataset import __best_fitting_dtype, MMapIndexedDatasetBuilder, IndexedDatasetBuilder 4 | from fairseq.tokenizer import tokenize_line 5 | 6 | 7 | # TODO move this file into data folder 8 | def make_builder(out_file, impl, vocab_size=None, dtype=None): 9 | if impl == 'mmap': 10 | if dtype is None: 11 | dtype = __best_fitting_dtype(vocab_size) 12 | return MMapIndexedDatasetBuilder(out_file, dtype=dtype) 13 | else: 14 | return IndexedDatasetBuilder(out_file) 15 | 16 | 17 | def binarize_file(input_file, out_file_pref, impl, dtype=np.int64, tokenize=tokenize_line): 18 | out_file = out_file_pref + '.bin' 19 | index_file = out_file_pref + '.idx' 20 | ds = make_builder(out_file, impl=impl, dtype=dtype) 21 | with open(input_file, 'r') as f: 22 | for line in f: 23 | if line.strip(): 24 | line = tokenize_line(line) 25 | line = list(map(int, line)) 26 | line = torch.tensor(line) 27 | ds.add_item(line) 28 | else: 29 | raise Exception('empty line') 30 | 31 | ds.finalize(index_file) 32 | 33 | return 34 | -------------------------------------------------------------------------------- /src/fairseq_ext/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | submodule_name = os.path.split(os.path.abspath(os.path.dirname(__file__)))[1] 8 | 9 | 10 | # automatically import any Python files in the criterions/ directory 11 | # this is necessary for fairseq to register the user defined criterions 12 | for file in os.listdir(os.path.dirname(__file__)): 13 | if file.endswith('.py') and not file.startswith('_'): 14 | module = file[:file.find('.py')] 15 | importlib.import_module(user_module_name + '.' + submodule_name + '.' 
+ module) 16 | -------------------------------------------------------------------------------- /src/fairseq_ext/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/data/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/extract_bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/extract_bart/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | submodule_name = os.path.split(os.path.abspath(os.path.dirname(__file__)))[1] 8 | 9 | 10 | # automatically import any Python files in the models/ directory 11 | # this is necessary for fairseq to register the user defined models 12 | models_dir = os.path.dirname(__file__) 13 | for file in os.listdir(models_dir): 14 | path = os.path.join(models_dir, file) 15 | if (file.endswith('.py') or os.path.isdir(path)) and not file.startswith('_'): 16 | module = file[:file.find('.py')] if file.endswith('.py') else file 17 | importlib.import_module(user_module_name + '.' + submodule_name + '.' + module) 18 | -------------------------------------------------------------------------------- /src/fairseq_ext/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/modules/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/fairseq_ext/roberta/__init__.py -------------------------------------------------------------------------------- /src/fairseq_ext/roberta/binarize_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import shutil 4 | import time 5 | 6 | from ..data import indexed_dataset 7 | from ..utils import time_since 8 | 9 | 10 | def dataset_dest_prefix(args, output_prefix, lang): 11 | base = "{}/{}".format(args.embdir, output_prefix) 12 | lang_part = ( 13 | ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) if lang is not None else "" 14 | ) 15 | return "{}{}".format(base, lang_part) 16 | 17 | 18 | def dataset_dest_file(args, output_prefix, lang, extension): 19 | base = dataset_dest_prefix(args, output_prefix, lang) 20 | return "{}.{}".format(base, extension) 21 | 22 | 23 | def get_scatter_indices(word2piece, reverse=False): 24 | if reverse: 25 | indices = range(len(word2piece))[::-1] 26 | else: 27 | indices = range(len(word2piece)) 28 | # we will need as well the wordpiece to word indices 29 | wp_indices = [ 30 | [index] * (len(span) if isinstance(span, list) else 1) 
31 | for index, span in zip(indices, word2piece) 32 | ] 33 | wp_indices = [x for span in wp_indices for x in span] 34 | return torch.tensor(wp_indices) 35 | 36 | 37 | def make_binary_bert_features(args, input_prefix, output_prefix, tokenize): 38 | 39 | # Load pretrained embeddings extractor 40 | if args.pretrained_embed.startswith('roberta'): 41 | from .pretrained_embeddings import PretrainedEmbeddings 42 | 43 | pretrained_embeddings = PretrainedEmbeddings( 44 | args.pretrained_embed, 45 | args.bert_layers 46 | ) 47 | elif args.pretrained_embed.startswith('bert'): 48 | from .pretrained_embeddings_bert import PretrainedEmbeddings 49 | 50 | pretrained_embeddings = PretrainedEmbeddings( 51 | args.pretrained_embed, 52 | args.bert_layers 53 | ) 54 | else: 55 | raise ValueError('arg.pretrained_embed should be either roberta.* or bert-*') 56 | 57 | # will store pre-extracted BERT layer 58 | indexed_data = indexed_dataset.make_builder( 59 | dataset_dest_file(args, output_prefix, 'en.bert', "bin"), 60 | impl=args.dataset_impl, 61 | dtype=np.float32 62 | ) 63 | 64 | # will store wordpieces and wordpiece to word mapping 65 | indexed_wordpieces = indexed_dataset.make_builder( 66 | dataset_dest_file(args, output_prefix, 'en.wordpieces', "bin"), 67 | impl=args.dataset_impl, 68 | ) 69 | 70 | indexed_wp2w = indexed_dataset.make_builder( 71 | dataset_dest_file(args, output_prefix, 'en.wp2w', "bin"), 72 | impl=args.dataset_impl, 73 | ) 74 | 75 | num_sents = 0 76 | input_file = input_prefix + '.en' 77 | 78 | start = time.time() 79 | with open(input_file, 'r') as fid: 80 | for sentence in fid: 81 | 82 | # we only have tokenized data so we feed whitespace separated 83 | # tokens 84 | sentence = " ".join(tokenize(str(sentence).rstrip())) 85 | 86 | # extract embeddings, average them per token and return 87 | # wordpieces anyway 88 | word_features, worpieces_roberta, word2piece = \ 89 | pretrained_embeddings.extract(sentence) 90 | 91 | # note that data needs to be stored as a 1d array. 
Also check 92 | # that number nof woprds matches with embedding size 93 | assert word_features.shape[1] == len(sentence.split()) 94 | indexed_data.add_item(word_features.cpu().view(-1)) 95 | 96 | # just store the wordpiece indices, ignore BOS/EOS tokens 97 | indexed_wordpieces.add_item(worpieces_roberta) 98 | indexed_wp2w.add_item( 99 | get_scatter_indices(word2piece, reverse=True) 100 | ) 101 | 102 | # udpate number of sents 103 | num_sents += 1 104 | if not num_sents % 100: 105 | print("\r%d sentences (time: %s)" % (num_sents, time_since(start)), end='') 106 | print("") 107 | 108 | # close indexed data files 109 | indexed_data.finalize( 110 | dataset_dest_file(args, output_prefix, 'en.bert', "idx") 111 | ) 112 | 113 | indexed_wordpieces.finalize( 114 | dataset_dest_file(args, output_prefix, 'en.wordpieces', "idx") 115 | ) 116 | indexed_wp2w.finalize( 117 | dataset_dest_file(args, output_prefix, 'en.wp2w', "idx") 118 | ) 119 | 120 | # copy the source sentence file to go together with the embeddings 121 | shutil.copyfile(input_file, dataset_dest_prefix(args, output_prefix, 'en')) 122 | 123 | 124 | def make_roberta_embeddings(args, tokenize=None): 125 | ''' 126 | Makes BERT features for source words 127 | ''' 128 | 129 | assert tokenize 130 | 131 | if args.trainpref: 132 | make_binary_bert_features(args, args.trainpref, "train", tokenize) 133 | 134 | if args.validpref: 135 | for k, validpref in enumerate(args.validpref.split(",")): 136 | outprefix = "valid{}".format(k) if k > 0 else "valid" 137 | make_binary_bert_features(args, validpref, outprefix, tokenize) 138 | 139 | if args.testpref: 140 | for k, testpref in enumerate(args.testpref.split(",")): 141 | outprefix = "test{}".format(k) if k > 0 else "test" 142 | make_binary_bert_features(args, testpref, outprefix, tokenize) 143 | -------------------------------------------------------------------------------- /src/fairseq_ext/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import importlib 3 | 4 | 5 | # automatically infer the user module name (in case there is a change during the development) 6 | user_module_name = os.path.split(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))[1] 7 | 8 | 9 | # automatically import any Python files in the tasks/ directory 10 | # this is necessary for fairseq to register the user defined tasks 11 | for file in os.listdir(os.path.dirname(__file__)): 12 | if file.endswith('.py') and not file.startswith('_'): 13 | task_name = file[:file.find('.py')] 14 | importlib.import_module(user_module_name + '.tasks.' 
+ task_name) 15 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_graphmp_tofile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from fairseq.data import Dictionary 5 | 6 | from fairseq_ext.amr_spec.action_info_binarize_graphmp import (binarize_actstates_tofile, 7 | binarize_actstates_tofile_workers, 8 | load_actstates_fromfile) 9 | 10 | # import sys 11 | # import importlib 12 | # sys.path.insert(0, '..') 13 | # importlib.import_module('fairseq_ext') 14 | # sys.path.pop(0) 15 | 16 | 17 | if __name__ == '__main__': 18 | if len(sys.argv) > 1: 19 | num_workers = int(sys.argv[1]) 20 | else: 21 | num_workers = 1 22 | 23 | # split = 'dev' 24 | split = 'train' 25 | 26 | en_file = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/oracle/{split}.en' 27 | actions_file = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/oracle/{split}.actions' 28 | actions_dict = Dictionary.load( 29 | '/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/EXP/data/o5_act-states/processed/dict.actions_nopos.txt' 30 | ) 31 | out_file_pref = f'/cephfs_nese/TRANSFER/rjsingh/DDoS/DDoS/jzhou/transition-amr-parser/tmp/{split}.en-actions.actions' 32 | 33 | os.makedirs(os.path.dirname(out_file_pref), exist_ok=True) 34 | 35 | # res = binarize_actstates_tofile(en_file, actions_file, out_file_pref, actions_dict=actions_dict) 36 | res = binarize_actstates_tofile_workers(en_file, actions_file, out_file_pref, actions_dict=actions_dict, 37 | num_workers=num_workers) 38 | print( 39 | "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( 40 | 'actions', 41 | actions_file, 42 | res['nseq'], 43 | res['ntok'], 44 | 100 * res['nunk'] / res['ntok'], 45 | actions_dict.unk_word, 46 | ) 47 | ) 48 | 49 | os.system(f'ls -lh {os.path.dirname(out_file_pref)}') 50 | 51 | tgt_actstates = load_actstates_fromfile(out_file_pref, actions_dict) 52 | 53 | breakpoint() 54 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_tofile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from fairseq.data import Dictionary 5 | 6 | from fairseq_ext.amr_spec.action_info_binarize import (binarize_actstates_tofile, 7 | binarize_actstates_tofile_workers, 8 | load_actstates_fromfile) 9 | 10 | # import sys 11 | # import importlib 12 | # sys.path.insert(0, '..') 13 | # importlib.import_module('fairseq_ext') 14 | # sys.path.pop(0) 15 | 16 | 17 | if __name__ == '__main__': 18 | if len(sys.argv) > 1: 19 | num_workers = int(sys.argv[1]) 20 | else: 21 | num_workers = 1 22 | 23 | split = 'dev' 24 | split = 'train' 25 | 26 | en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.en' 27 | actions_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.actions' 28 | actions_dict = Dictionary.load( 29 | '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/databin/dict.actions_nopos.txt' 30 | ) 31 | out_file_pref = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/tmp/{split}.en-actions.actions' 32 | 33 | # en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/oracle/{split}.en' 34 | # actions_file = 
f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/oracle/{split}.actions' 35 | # actions_dict = Dictionary.load( 36 | # '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/processed/dict.actions_nopos.txt' 37 | # ) 38 | # out_file_pref = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/data/o3align_roberta-base-last_act-noeos-states-2LAroot/processed/{split}.en-actions.actions' 39 | 40 | os.makedirs(os.path.dirname(out_file_pref), exist_ok=True) 41 | 42 | # binarize_actstates_tofile(en_file, actions_file, out_file_pref, actions_dict=actions_dict) 43 | binarize_actstates_tofile_workers(en_file, actions_file, out_file_pref, actions_dict=actions_dict, 44 | num_workers=num_workers) 45 | 46 | os.system(f'ls -lh {os.path.dirname(out_file_pref)}') 47 | 48 | tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors, \ 49 | tgt_actedge_masks, tgt_actedge_cur_nodes, tgt_actedge_pre_nodes, tgt_actedge_directions = \ 50 | load_actstates_fromfile(out_file_pref, actions_dict) 51 | 52 | import pdb; pdb.set_trace() 53 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_action_info_tolist.py: -------------------------------------------------------------------------------- 1 | from fairseq.data import Dictionary 2 | 3 | import sys 4 | # import importlib 5 | # sys.path.insert(0, '..') 6 | # importlib.import_module('fairseq_ext') 7 | # sys.path.pop(0) 8 | from fairseq_ext.amr_spec.action_info_binarize import binarize_actstates_tolist, binarize_actstates_tolist_workers 9 | 10 | 11 | if __name__ == '__main__': 12 | if len(sys.argv) > 1: 13 | num_workers = int(sys.argv[1]) 14 | else: 15 | num_workers = 1 16 | 17 | split = 'dev' 18 | # split = 'train' 19 | 20 | en_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.en' 21 | actions_file = f'/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/oracle/{split}.actions' 22 | actions_dict = Dictionary.load( 23 | '/dccstor/ykt-parse/AMR/jiawei2020/transition-amr-parser/EXP/exp1/databin/dict.actions_nopos.txt' 24 | ) 25 | 26 | # tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors = binarize_actstates_tolist(en_file, actions_file, 27 | # actions_dict=actions_dict) 28 | # TODO not working for num_workers > 1 29 | tgt_vocab_masks, tgt_actnode_masks, tgt_src_cursors, \ 30 | tgt_actedge_masks, tgt_actedge_cur_nodes, tgt_actedge_pre_nodes, tgt_actedge_directions = \ 31 | binarize_actstates_tolist_workers(en_file, actions_file, actions_dict=actions_dict, num_workers=num_workers) 32 | 33 | import pdb 34 | pdb.set_trace() 35 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_amr_action_bpe.py: -------------------------------------------------------------------------------- 1 | from fairseq_ext.data.amr_bpe import AMRActionBPEEncoder, AMRActionBartDictionary 2 | 3 | 4 | if __name__ == '__main__': 5 | # file paths 6 | encoder_json_path = 'DATA/gpt2_bpe/encoder.json' 7 | vocab_bpe_path = 'DATA/gpt2_bpe/vocab.bpe' 8 | dict_txt_path = 'DATA/gpt2_bpe/dict.txt' 9 | node_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.nodes' 10 | others_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.others' 11 | 12 | # build the bpe encoder 13 | act_bpe = AMRActionBPEEncoder.build_bpe_encoder(encoder_json_path, # or None to use cached 14 | vocab_bpe_path, # or None to use cached 15 
| # add new symbols 16 | node_freq_min=5, 17 | node_file_path=node_file_path, 18 | others_file_path=others_file_path 19 | ) 20 | 21 | actions = 'SHIFT SHIFT clear-06 ROOT SHIFT SHIFT thing prepare-01 >RA(:ARG1-of) SHIFT prior-to >RA(:time) ' \ 22 | 'SHIFT SHIFT SHIFT COPY >RA(:op1) SHIFT SHIFT - SHIFT construct-01 >LA(:polarity) >LA(:ARG1) >RA(:ARG1) ' \ 23 | 'SHIFT SHIFT SHIFT base-02 >RA(:ARG1-of) SHIFT SHIFT SHIFT COPY SHIFT COPY >LA(:mod) SHIFT simulate-01 >LA(:ARG1) >RA(:ARG2) SHIFT SHIFT' 24 | bpe_token_ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_bpe.encode_actions(actions) 25 | 26 | breakpoint() 27 | 28 | # build the action dictionary 29 | act_dict = AMRActionBartDictionary(dict_txt_path, # or None to use cached 30 | node_freq_min=5, 31 | node_file_path=node_file_path, 32 | others_file_path=others_file_path) 33 | 34 | ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_dict.encode_actions(actions) 35 | 36 | breakpoint() 37 | 38 | print(act_dict.decode_actions(ids)) 39 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_amr_action_unk.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from tqdm import tqdm 4 | 5 | from fairseq_ext.data.amr_bpe import AMRActionBPEEncoder, AMRActionBartDictionary 6 | from fairseq_ext.amr_reform.o10_action_reformer_subtok import AMRActionReformerSubtok 7 | from transition_amr_parser.amr_machine import AMRStateMachine 8 | 9 | 10 | if __name__ == '__main__': 11 | # file paths 12 | encoder_json_path = 'DATA/gpt2_bpe/encoder.json' 13 | vocab_bpe_path = 'DATA/gpt2_bpe/vocab.bpe' 14 | dict_txt_path = 'DATA/gpt2_bpe/dict.txt' 15 | node_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.nodes' 16 | others_file_path = 'DATA/AMR2.0/oracles/o10/train.actions.vocab.others' 17 | 18 | split = 'train' 19 | en_file = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/{split}.tokens' 20 | actions_file = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/{split}.actions' 21 | machine_config = f'/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/DATA/AMR2.0/oracles/o10/machine_config.json' 22 | 23 | # # build the bpe encoder 24 | # act_bpe = AMRActionBPEEncoder.build_bpe_encoder(encoder_json_path, # or None to use cached 25 | # vocab_bpe_path, # or None to use cached 26 | # # add new symbols 27 | # node_freq_min=5, 28 | # node_file_path=node_file_path, 29 | # others_file_path=others_file_path 30 | # ) 31 | 32 | # actions = 'SHIFT SHIFT clear-06 ROOT SHIFT SHIFT thing prepare-01 >RA(:ARG1-of) SHIFT prior-to >RA(:time) ' \ 33 | # 'SHIFT SHIFT SHIFT COPY >RA(:op1) SHIFT SHIFT - SHIFT construct-01 >LA(:polarity) >LA(:ARG1) >RA(:ARG1) ' \ 34 | # 'SHIFT SHIFT SHIFT base-02 >RA(:ARG1-of) SHIFT SHIFT SHIFT COPY SHIFT COPY >LA(:mod) SHIFT simulate-01 >LA(:ARG1) >RA(:ARG2) SHIFT SHIFT' 35 | # bpe_token_ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_bpe.encode_actions(actions) 36 | 37 | # breakpoint() 38 | 39 | # build the action dictionary 40 | act_dict = AMRActionBartDictionary(dict_txt_path, # or None to use cached 41 | node_freq_min=5, 42 | node_file_path=node_file_path, 43 | others_file_path=others_file_path) 44 | 45 | # ids, bpe_tokens, tok_to_subtok_start, subtok_origin_index = act_dict.encode_actions(actions) 46 | 47 | # breakpoint() 48 | 49 | # print(act_dict.decode_actions(ids)) 50 | 51 | # check for unk symbol in the 
data 52 | machine = AMRStateMachine.from_config(machine_config) 53 | 54 | replaced = Counter() 55 | current_unk = [] 56 | 57 | def replaced_consumer(word, idx): 58 | if idx == act_dict.unk_index and word != act_dict.unk_word: 59 | replaced.update([word]) 60 | current_unk.append(word) 61 | 62 | with open(en_file, 'r') as f, open(actions_file, 'r') as g: 63 | for tokens, actions in tqdm(zip(f, g)): 64 | if tokens.strip(): 65 | tokens = tokens.strip().split('\t') 66 | actions = actions.strip().split('\t') 67 | 68 | if actions[-1] != 'CLOSE': 69 | actions = actions.copy() 70 | actions.append('CLOSE') 71 | 72 | actions_states = AMRActionReformerSubtok.reform_actions_and_get_states(tokens, actions, 73 | act_dict, machine) 74 | v = actions_states['actions_nopos_out'] 75 | 76 | current_unk = [] 77 | 78 | ids = act_dict.encode_line( 79 | line=[act if act != 'CLOSE' else act_dict.eos_word for act in v], 80 | line_tokenizer=lambda x: x, # already tokenized 81 | add_if_not_exist=False, 82 | consumer=replaced_consumer, 83 | append_eos=False, 84 | reverse_order=False 85 | ) 86 | 87 | if current_unk: 88 | print(replaced) 89 | print(current_unk) 90 | print(actions) 91 | breakpoint() 92 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_composite_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairseq.data import Dictionary 3 | from fairseq_ext.extract_bart.composite_embeddings import CompositeEmbeddingBART 4 | 5 | 6 | if __name__ == '__main__': 7 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart/EXP/data/graphmp-swaparc-ptrlast_o8.3_act-states/processed/dict.actions_nopos.txt' 8 | vocab = Dictionary.load(vocab_path) 9 | 10 | bart = torch.hub.load('pytorch/fairseq', 'bart.base') 11 | 12 | cemb = CompositeEmbeddingBART(bart, bart.model.decoder.embed_tokens, vocab) 13 | 14 | indices = torch.tensor([[1, 3, 8], [10, 5000, 666]]) 15 | 16 | indices = indices.cuda() 17 | cemb.to('cuda') 18 | 19 | embeddings = cemb(indices, update=True) 20 | 21 | breakpoint() 22 | 23 | # test backprop 24 | optimizer = torch.optim.SGD(cemb.parameters(), lr=1) 25 | for i in range(2): 26 | print() 27 | optimizer.zero_grad() 28 | print(cemb.base_embeddings.weight[:1].sum()) 29 | print(cemb.base_embeddings.weight[:2]) 30 | ll = cemb(torch.tensor([0, 1, 2]).cuda(), update=True).sum() * 10 31 | ll.backward() 32 | print(cemb.base_embeddings.weight.grad) 33 | optimizer.step() 34 | print(cemb.base_embeddings.weight[:1].sum()) 35 | print(cemb.base_embeddings.weight[:2]) 36 | print() 37 | 38 | breakpoint() 39 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_composite_embeddings_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | import torch 5 | from fairseq.data import Dictionary 6 | from fairseq_ext.extract_bart.composite_embeddings import CompositeEmbeddingBART, transform_action_symbol 7 | 8 | 9 | if __name__ == '__main__': 10 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-bart-o10/EXP/data/o10_act-states/processed/dict.actions_nopos.txt' 11 | vocab = Dictionary.load(vocab_path) 12 | 13 | bart = torch.hub.load('pytorch/fairseq', 'bart.base') 14 | 15 | cemb = CompositeEmbeddingBART(bart, bart.model.decoder.embed_tokens, vocab) 16 | 17 | trans_actions = [] 18 | for sym in tqdm(vocab.symbols): 19 | new_sym = 
transform_action_symbol(sym) # str 20 | splitted = cemb.sub_tokens(new_sym) # list 21 | # trans_actions.append((new_sym, splitted)) 22 | trans_actions.append(new_sym + ' --> ' + '|' + '|'.join(splitted) + '|' + '\n') 23 | 24 | tmp_dir = 'fairseq_ext/tests_data' 25 | os.makedirs(tmp_dir, exist_ok=True) 26 | with open(os.path.join(tmp_dir, 'dict.actions_nopos.bartmap.txt'), 'w') as f: 27 | f.writelines(trans_actions) 28 | 29 | breakpoint() 30 | -------------------------------------------------------------------------------- /src/fairseq_ext/tests/test_factored_embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from fairseq.data import Dictionary 3 | from fairseq_ext.modules.factored_embeddings import FactoredEmbeddings 4 | 5 | 6 | if __name__ == '__main__': 7 | vocab_path = '/n/tata_ddos_ceph/jzhou/transition-amr-parser-o8/EXP/data/graphmp-swaparc-ptrlast_o8.3_act-states/processed/dict.actions_nopos.txt' 8 | embed_dim = 256 9 | vocab = Dictionary.load(vocab_path) 10 | femb = FactoredEmbeddings(vocab, embed_dim) 11 | 12 | indices = torch.tensor([[1, 3, 8], [10, 5000, 666]]).cuda() 13 | femb.to('cuda') 14 | 15 | embeddings = femb(indices) 16 | 17 | breakpoint() 18 | -------------------------------------------------------------------------------- /src/fairseq_ext/tokenizer.py: -------------------------------------------------------------------------------- 1 | def tokenize_line_tab(line): 2 | line = line.strip() 3 | return line.split('\t') 4 | -------------------------------------------------------------------------------- /src/fairseq_ext/utils_font.py: -------------------------------------------------------------------------------- 1 | # FONT_COLORORS 2 | FONT_COLOR = { 3 | 'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 4 | 'magenta': 35, 'cyan': 36, 'light gray': 37, 'dark gray': 90, 5 | 'light red': 91, 'light green': 92, 'light yellow': 93, 6 | 'light blue': 94, 'light magenta': 95, 'light cyan': 96, 'white': 97 7 | } 8 | 9 | # BG FONT_COLORORS 10 | BACKGROUND_COLOR = { 11 | 'black': 40, 'red': 41, 'green': 42, 'yellow': 43, 'blue': 44, 12 | 'magenta': 45, 'cyan': 46, 'light gray': 47, 'dark gray': 100, 13 | 'light red': 101, 'light green': 102, 'light yellow': 103, 14 | 'light blue': 104, 'light magenta': 105, 'light cyan': 106, 15 | 'white': 107 16 | } 17 | 18 | 19 | def white_background(string): 20 | return "\033[107m%s\033[0m" % string 21 | 22 | 23 | def red_background(string): 24 | return "\033[101m%s\033[0m" % string 25 | 26 | 27 | def black_font(string): 28 | return "\033[30m%s\033[0m" % string 29 | 30 | 31 | def yellow_font(string): 32 | return "\033[93m%s\033[0m" % string 33 | 34 | 35 | def stack_style(string): 36 | return black_font(white_background(string)) 37 | 38 | 39 | def ordered_exit(signum, frame): 40 | print("\nStopped by user\n") 41 | exit(0) 42 | -------------------------------------------------------------------------------- /src/fairseq_ext/utils_import.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | 5 | 6 | # ========== adapted from 7 | # https://github.com/pytorch/fairseq/blob/83e615d66905b8ca7483122a37da1a85f13f4b8e/fairseq/utils.py#L431 8 | # to avoid error in our setup 9 | # ========== 10 | def import_user_module(args): 11 | module_path = getattr(args, "user_dir", None) 12 | if module_path is not None: 13 | module_path = os.path.abspath(args.user_dir) 14 | if not os.path.exists(module_path): 
15 | fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir) 16 | if os.path.exists(fairseq_rel_path): 17 | module_path = fairseq_rel_path 18 | else: 19 | fairseq_rel_path = os.path.join( 20 | os.path.dirname(__file__), "..", args.user_dir 21 | ) 22 | if os.path.exists(fairseq_rel_path): 23 | module_path = fairseq_rel_path 24 | else: 25 | raise FileNotFoundError(module_path) 26 | 27 | # ensure that user modules are only imported once 28 | import_user_module.memo = getattr(import_user_module, "memo", set()) 29 | if module_path not in import_user_module.memo: 30 | import_user_module.memo.add(module_path) 31 | 32 | module_parent, module_name = os.path.split(module_path) 33 | if module_name not in sys.modules: 34 | sys.path.insert(0, module_parent) 35 | importlib.import_module(module_name) 36 | # else: 37 | # raise ImportError( 38 | # "Failed to import --user-dir={} because the corresponding module name " 39 | # "({}) is not globally unique. Please rename the directory to " 40 | # "something unique and try again.".format(module_path, module_name) 41 | # ) 42 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/README.md: -------------------------------------------------------------------------------- 1 | # Install (Compatible w. AMR Parser) 2 | 3 | ``` 4 | cd transition-amr-parser 5 | 6 | conda create --name torch-1.4 python=3.6 7 | conda activate torch-1.4 8 | conda install -y pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch 9 | pip install h5py # Required for elmo embeddings. 10 | pip install -e . 11 | conda install -c dglteam "dgl-cuda10.1<0.5" 12 | ``` 13 | 14 | Changes for CPU: 15 | 16 | ``` 17 | conda install -y pytorch==1.4.0 torchvision==0.5.0 -c pytorch 18 | pip install dgl==0.4.3.post2 19 | ``` 20 | 21 | For GCN support, need to install latest torch-geometric. 22 | 23 | # Install (with newer torch for easy GCN support) 24 | 25 | ``` 26 | conda create -n ibm-amr-aligner python=3.8 -y 27 | conda activate ibm-amr-aligner 28 | 29 | # Use torch 1.8, since newer causes issue in torch-geometric. 30 | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge -y 31 | 32 | pip install h5py # Required for elmo embeddings. 33 | # (NOT TESTED) # conda install -c dglteam dgl-cuda11.1 -y # Installs DGL for TreeLSTM support. 34 | conda install pyg -c pyg -c conda-forge -y # Installs torch-geometric for GCN support. 35 | 36 | # The next step is tricky. Need to install AMR parser, but requires modifying `setup.py` 37 | 38 | # Step 1: 39 | vim setup.py # Comment out line about torch 1.4. 40 | 41 | # Step 2: 42 | pip install -e . 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/ibm_neural_aligner/__init__.py -------------------------------------------------------------------------------- /src/ibm_neural_aligner/alignment_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from vocab import PADDING_IDX 5 | 6 | 7 | class AlignmentDecoder(object): 8 | 9 | def batch_decode(self, batch_map, model_output): 10 | """ 11 | For each node, find most probable alignments. 
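Yields one dict per batch element with: 'node_alignments', a list of (node_id, [token_idx]) pairs sorted by node id; 'posterior', the per-node alignment scores over source tokens; and 'argmax', the index of the best-scoring token for each node.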
12 | """ 13 | x_t = batch_map['text_tokens'] 14 | y_a = model_output['labels'] 15 | y_a_mask = model_output['labels_mask'] 16 | y_a_node_ids = model_output['label_node_ids'] 17 | align = model_output['batch_align'] 18 | 19 | batch_size, len_t = x_t.shape 20 | len_a = y_a.shape[-1] 21 | device = x_t.device 22 | 23 | for i_b in range(batch_size): 24 | 25 | # variables 26 | 27 | indexa = torch.arange(len_a).to(device) 28 | indext = torch.arange(len_t).to(device) 29 | 30 | # select 31 | 32 | b_x_t = x_t[i_b] 33 | b_y_a_mask = y_a_mask[i_b].view(-1) 34 | b_align = align[i_b] 35 | 36 | # mask 37 | 38 | b_x_t_mask = b_x_t != PADDING_IDX 39 | b_indexa = indexa[b_y_a_mask] 40 | b_indext = indext[b_x_t_mask] 41 | 42 | n = b_y_a_mask.sum().item() 43 | nt = b_x_t_mask.sum().item() 44 | 45 | assert b_align.shape == (n, nt, 1) 46 | 47 | # decode 48 | 49 | argmax = b_align.squeeze(2).argmax(1) 50 | 51 | assert argmax.shape == (n,) 52 | 53 | # node alignments 54 | 55 | node_alignments = [] 56 | for j in range(n): 57 | node_id = y_a_node_ids[i_b, b_indexa[j]].item() 58 | idx_txt = argmax[j].item() 59 | node_alignments.append((node_id, [idx_txt])) 60 | 61 | # fix order 62 | 63 | node_id_list = [x[0] for x in node_alignments] 64 | order = np.argsort(node_id_list) 65 | 66 | node_alignments = [node_alignments[idx] for idx in order] 67 | b_align = b_align[order] 68 | argmax = argmax[order] 69 | 70 | # result 71 | 72 | info = {} 73 | info['node_alignments'] = node_alignments 74 | info['posterior'] = b_align 75 | info['argmax'] = argmax 76 | 77 | yield info 78 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/dummy_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | from transition_amr_parser.io import read_amr 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--in-amr', default=None, required=True, type=str) 8 | parser.add_argument('--out-amr', default=None, required=True, type=str) 9 | args = parser.parse_args() 10 | 11 | 12 | def dummy_align(amr): 13 | amr = copy.deepcopy(amr) 14 | alignments = {} 15 | for k in sorted(amr.nodes.keys()): 16 | alignments[k] = [0] 17 | amr.alignments = alignments 18 | return amr 19 | 20 | 21 | if __name__ == '__main__': 22 | corpus = read_amr(args.in_amr, jamr=False) 23 | with open(args.out_amr, 'w') as f: 24 | for amr in corpus: 25 | amr = dummy_align(amr) 26 | f.write(f'{amr.__str__()}\n') 27 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | try: 5 | from torch_geometric.data import Batch, Data 6 | import torch_geometric.nn as gnn 7 | except: 8 | pass 9 | 10 | from vocab_definitions import MaskInfo 11 | 12 | 13 | class GCNEncoder(nn.Module): 14 | def __init__(self, embed, size, mode='gcn', dropout_p=0, num_layers=None): 15 | super().__init__() 16 | 17 | num_layers = 2 if num_layers is None else num_layers 18 | 19 | self.enc = GCN(embed, size, mode=mode, num_layers=num_layers) 20 | 21 | self.embed = embed 22 | self.size = size 23 | self.output_size = self.enc.output_size 24 | 25 | self.dropout_p = dropout_p 26 | self.dropout = nn.Dropout(p=dropout_p) 27 | 28 | @property 29 | def device(self): 30 | return next(self.parameters()).device 31 | 32 | def forward(self, batch_map): 33 | """ 34 | Returns: 35 | 36 | - output: BxLxD 37 | - 
labels: BxL from AMR vocab. 38 | - labels_mask: True if label, else False. Useful for padding and edge labels. 39 | - label_node_ids: Roughly, torch.arange(len(nodes)). 40 | """ 41 | device = batch_map['device'] 42 | batch_size = len(batch_map['items']) 43 | 44 | data = Batch.from_data_list([x['geometric_data'].clone().to(device) for x in batch_map['items']]) 45 | 46 | node_lengths = batch_map['amr_node_mask'].sum(-1).tolist() 47 | edge_lengths = [x['geometric_data'].y.shape[0] - n for x, n in zip(batch_map['items'], node_lengths)] 48 | max_node_length = max(node_lengths) 49 | size = self.enc.output_size 50 | 51 | gcn_output = self.enc(batch_map, data) 52 | 53 | shape = (sum(node_lengths) + sum(edge_lengths), size) 54 | assert gcn_output.shape == shape, (shape, gcn_output.shape) 55 | 56 | if True: 57 | new_h = torch.zeros(batch_size, max_node_length, size, dtype=torch.float, device=device) 58 | labels = torch.zeros(batch_size, max_node_length, dtype=torch.long, device=device) 59 | labels_mask = torch.zeros(batch_size, max_node_length, dtype=torch.bool, device=device) 60 | label_node_ids = torch.full((batch_size, max_node_length), -1, dtype=torch.long, device=device) 61 | 62 | offset = 0 63 | for i_b in range(batch_size): 64 | n = node_lengths[i_b] 65 | n_e = edge_lengths[i_b] 66 | if batch_map['add_edges'] == False: 67 | assert n_e == 0 68 | 69 | # 70 | new_h[i_b, :n] = gcn_output[offset:offset + n] 71 | labels[i_b, :n] = data.y[offset:offset + n] 72 | labels_mask[i_b, :n] = True 73 | label_node_ids[i_b, :n] = torch.arange(n, dtype=torch.long, device=device) 74 | 75 | # 76 | offset += n + n_e 77 | 78 | output = new_h 79 | output = self.dropout(output) 80 | 81 | return output, labels, labels_mask, label_node_ids 82 | 83 | 84 | class GCN(torch.nn.Module): 85 | def __init__(self, embed, size, mode='gcn', num_layers=2): 86 | super().__init__() 87 | 88 | self.num_layers = num_layers 89 | self.embed = embed 90 | self.size = size 91 | self.output_size = size 92 | 93 | input_size = embed.output_size 94 | 95 | self.W_node = nn.Linear(input_size, size) 96 | 97 | if mode == 'gcn': 98 | for i in range(num_layers): 99 | setattr(self, 'conv{}'.format(i + 1), gnn.GCNConv(size, size)) 100 | elif mode == 'gcn_transformer': 101 | for i in range(num_layers): 102 | setattr(self, 'conv{}'.format(i + 1), gnn.TransformerConv(size, size)) 103 | elif mode == 'gcn_film': 104 | for i in range(num_layers): 105 | setattr(self, 'conv{}'.format(i + 1), gnn.FiLMConv(size, size)) 106 | elif mode == 'gcn_gated': 107 | self.conv1 = gnn.GatedGraphConv(size, num_layers=num_layers) 108 | self.mode = mode 109 | 110 | self.mask_vec = nn.Parameter(torch.FloatTensor(input_size).normal_()) 111 | 112 | def compute_node_features(self, node_tokens, mask=None): 113 | if mask is None: 114 | return self.W_node(self.embed(node_tokens)) 115 | else: 116 | m = torch.cat(mask, 0) 117 | e = self.embed(node_tokens) 118 | e[m == MaskInfo.masked] = self.mask_vec 119 | return self.W_node(e) 120 | 121 | def forward(self, batch_map, data): 122 | batch_size = len(batch_map['items']) 123 | 124 | data.x = self.compute_node_features(data.y, mask=batch_map['mask_for_gcn']) 125 | 126 | # Hacky way to support any graphs with no edges. 
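# If any graph in the batch has an empty edge_index, message passing is skipped for the whole batch and the linearly projected node features are returned as-is; a per-graph fallback would require splitting the batch.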
127 | if any([data[i_b].edge_index.shape[0] == 0 for i_b in range(batch_size)]): 128 | return data.x 129 | 130 | x, edge_index = data.x, data.edge_index 131 | 132 | if self.mode == 'gcn_gated': 133 | x = self.conv1(x, edge_index) 134 | 135 | else: 136 | for i in range(self.num_layers): 137 | x = getattr(self, 'conv{}'.format(i + 1))(x, edge_index) 138 | if i < self.num_layers - 1: 139 | x = torch.relu(x) 140 | 141 | return x 142 | 143 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/setup_amr2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK="AMR2.0" 4 | CACHE="cache-amr2" 5 | 6 | mkdir -p $CACHE 7 | 8 | cp ./DATA/${TASK}/aligned/cofill/train.txt ./${CACHE}/train.aligned.txt 9 | 10 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/dev.txt ./DATA/${TASK}/corpora/dev.txt.no_wiki 11 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/test.txt ./DATA/${TASK}/corpora/test.txt.no_wiki 12 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/train.txt ./DATA/${TASK}/corpora/train.txt.no_wiki 13 | 14 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/dev.txt.no_wiki --out-amr ./${CACHE}/dev.txt.no_wiki 15 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/test.txt.no_wiki --out-amr ./${CACHE}/test.txt.no_wiki 16 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/train.txt.no_wiki --out-amr ./${CACHE}/train.txt.no_wiki 17 | 18 | python src/ibm_neural_aligner/vocab.py \ 19 | --in-amrs \ 20 | ./DATA/${TASK}/aligned/cofill/dev.txt \ 21 | ./DATA/${TASK}/aligned/cofill/test.txt \ 22 | ./DATA/${TASK}/aligned/cofill/train.txt \ 23 | \ 24 | ./DATA/${TASK}/corpora/dev.txt \ 25 | ./DATA/${TASK}/corpora/test.txt \ 26 | ./DATA/${TASK}/corpora/train.txt \ 27 | \ 28 | ./DATA/${TASK}/corpora/dev.txt.no_wiki \ 29 | ./DATA/${TASK}/corpora/test.txt.no_wiki \ 30 | ./DATA/${TASK}/corpora/train.txt.no_wiki \ 31 | --out-text ./${CACHE}/vocab.text.txt \ 32 | --out-amr ./${CACHE}/vocab.amr.txt 33 | 34 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.text.txt 35 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.amr.txt 36 | 37 | cp src/ibm_neural_aligner/setup_amr2.sh $CACHE/setup_data.sh 38 | 39 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/setup_amr3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TASK="AMR3.0" 4 | CACHE="cache-amr3" 5 | 6 | mkdir -p $CACHE 7 | 8 | cp ./DATA/${TASK}/aligned/cofill/train.txt ./${CACHE}/train.aligned.txt 9 | 10 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/dev.txt ./DATA/${TASK}/corpora/dev.txt.no_wiki 11 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/test.txt ./DATA/${TASK}/corpora/test.txt.no_wiki 12 | python preprocess/remove_wiki.py ./DATA/${TASK}/corpora/train.txt ./DATA/${TASK}/corpora/train.txt.no_wiki 13 | 14 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/dev.txt.no_wiki --out-amr ./${cache}/dev.txt.no_wiki 15 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/test.txt.no_wiki --out-amr ./${cache}/test.txt.no_wiki 16 | python src/ibm_neural_aligner/tokenize_amr.py --in-amr ./DATA/${TASK}/corpora/train.txt.no_wiki 
--out-amr ./${cache}/train.txt.no_wiki 17 | 18 | python src/ibm_neural_aligner/vocab.py \ 19 | --in-amrs \ 20 | ./DATA/${TASK}/aligned/cofill/dev.txt \ 21 | ./DATA/${TASK}/aligned/cofill/test.txt \ 22 | ./DATA/${TASK}/aligned/cofill/train.txt \ 23 | \ 24 | ./DATA/${TASK}/corpora/dev.txt \ 25 | ./DATA/${TASK}/corpora/test.txt \ 26 | ./DATA/${TASK}/corpora/train.txt \ 27 | \ 28 | ./DATA/${TASK}/corpora/dev.txt.no_wiki \ 29 | ./DATA/${TASK}/corpora/test.txt.no_wiki \ 30 | ./DATA/${TASK}/corpora/train.txt.no_wiki \ 31 | --out-text ./${CACHE}/vocab.text.txt \ 32 | --out-amr ./${CACHE}/vocab.amr.txt 33 | 34 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.text.txt 35 | python src/ibm_neural_aligner/pretrained_embeddings.py --cuda --cache-dir ./${CACHE}/ --vocab ./${CACHE}/vocab.amr.txt 36 | 37 | cp src/ibm_neural_aligner/setup_amr3.sh $CACHE/setup_data.sh 38 | 39 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/gypsum/view_sweep.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import os 5 | 6 | 7 | errors = collections.Counter() 8 | 9 | class DidNotEval(ValueError): 10 | pass 11 | 12 | class DidNotTrain(ValueError): 13 | pass 14 | 15 | class NoModel(ValueError): 16 | pass 17 | 18 | 19 | def main(args): 20 | with open(args.file_list) as f: 21 | file_list = f.read().strip().split('\n') 22 | print('file_list', len(file_list)) 23 | 24 | def readfile(path): 25 | eval_path, train_path = path.split() 26 | 27 | path = eval_path 28 | slurm_out = os.path.join(path, 'slurm.out') 29 | eval_json = os.path.join(path, 'train.aligned.txt.eval.json') 30 | flags_json = os.path.join(train_path, 'flags.json') 31 | 32 | if not os.path.exists(slurm_out): 33 | print('did not eval {} {}'.format(train_path, eval_path)) 34 | raise DidNotEval('') 35 | 36 | if not os.path.exists(flags_json): 37 | print('did not train {} {}'.format(train_path, eval_path)) 38 | raise DidNotTrain('') 39 | 40 | with open(flags_json) as f: 41 | train_flags = json.loads(f.read()) 42 | 43 | eval_flags = None 44 | try: 45 | flags_json = os.path.join(eval_path, 'flags.json') 46 | with open(flags_json) as f: 47 | eval_flags = json.loads(f.read()) 48 | except: 49 | 50 | with open(slurm_out) as f: 51 | for i, line in enumerate(f): 52 | if line.startswith('{'): 53 | if line[1] == "'": 54 | eval_flags = eval(line) 55 | else: 56 | eval_flags = json.loads(line.strip()) 57 | break 58 | 59 | train_slurm = os.path.join(train_path, 'slurm.out') 60 | 61 | if eval_flags is None: 62 | print('nothing found', slurm_out, train_slurm) 63 | raise ValueError 64 | 65 | model_path = eval_flags['load'] 66 | 67 | if not os.path.exists(model_path): 68 | print('no model {} {}'.format(train_path, eval_path)) 69 | raise NoModel 70 | 71 | if os.path.exists(slurm_out) and not os.path.exists(eval_json): 72 | print('possible error {} {} {} {}'.format(slurm_out, eval_flags['hostname'], train_slurm, train_flags['hostname'])) 73 | errors['train-{}'.format(train_flags['hostname'])] += 1 74 | errors['eval-{}'.format(eval_flags['hostname'])] += 1 75 | 76 | if not os.path.exists(eval_json): 77 | raise ValueError 78 | 79 | # read eval_json 80 | with open(eval_json) as f: 81 | o = json.loads(f.read()) 82 | o['path'] = path 83 | o['train_flags'] = train_flags 84 | o['eval_flags'] = eval_flags 85 | return o 86 | 87 | def try_map(items, func): 88 | for x in items: 89 | try: 
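# readfile raises ValueError subclasses (DidNotEval, DidNotTrain, NoModel) for incomplete runs, so those entries are skipped silently.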
90 | yield func(x) 91 | except ValueError: 92 | continue 93 | 94 | def groupby(data): 95 | groups = collections.defaultdict(list) 96 | 97 | for ex in data: 98 | groups[ex['train_flags']['log_dir']].append(ex) 99 | 100 | return groups 101 | 102 | for k, v in sorted(errors.items(), key=lambda x: x[1]): 103 | print(k, v) 104 | 105 | data = [x for x in try_map(file_list, readfile)] 106 | for ex in data: 107 | recall = ex['Corpus Recall using spans for gold']['recall'] 108 | ex['recall'] = recall 109 | 110 | groups = groupby(data) 111 | 112 | for group in sorted(groups.values(), key=lambda x: max(map(lambda x: x['recall'], x))): 113 | print(group[0]['train_flags']['log_dir']) 114 | for ex in sorted(group, key=lambda x: x['path']): 115 | print(ex['recall'], ex['path']) 116 | 117 | print('data', len(data), 'groups', len(groups)) 118 | 119 | if __name__ == '__main__': 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--file-list', default='eval_json.2021-11-05a.txt', type=str) 122 | args = parser.parse_args() 123 | 124 | print(args.__dict__) 125 | 126 | main(args) 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/install.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | 4 | # activate normal env 5 | . set_environment.sh 6 | 7 | # load normal env 8 | # install DGL, in addition to normall install in README 9 | conda install -y -c dglteam "dgl-cuda10.1<0.5" 10 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/leamr_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | 4 | from austin_amr_utils.amr_readers import AMR_Reader 5 | from transition_amr_parser.io import read_amr2 6 | 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--in-amr', default=None, required=True, type=str) 10 | parser.add_argument('--out-amr', default=None, required=True, type=str) 11 | args = parser.parse_args() 12 | 13 | corpus = AMR_Reader().load(args.in_amr) 14 | 15 | with open(args.out_amr,'w') as f: 16 | for amr in corpus: 17 | f.write(amr.amr_string().strip() + '\n\n') 18 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/lexicon.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | 5 | from amr_utils import safe_read as safe_read_ 6 | from tqdm import tqdm 7 | 8 | def safe_read(path, **kwargs): 9 | kwargs['ibm_format'], kwargs['tokenize'] = True, False 10 | return safe_read_(path, **kwargs) 11 | 12 | #path_old = './DATA/AMR2.0/aligned/cofill/dev.txt' 13 | #path_neu = './tmp_out/dev.aligned.txt' 14 | path_old = './DATA/AMR2.0/aligned/cofill/train.txt' 15 | path_neu = './tmp_out/train.aligned.txt' 16 | 17 | def build_lexicon(path): 18 | amrs = safe_read(path) 19 | lexicon = collections.defaultdict(collections.Counter) 20 | 21 | for amr in tqdm(amrs): 22 | for node_id, text_id_list in amr.alignments.items(): 23 | if len(text_id_list) > 1: 24 | continue 25 | 26 | for text_id in text_id_list: 27 | text = amr.tokens[text_id] 28 | node = amr.nodes[node_id] 29 | 30 | lexicon[node][text] += 1 31 | 32 | return lexicon 33 | 34 | lex_old = build_lexicon(path_old) 35 | lex_neu = build_lexicon(path_neu) 36 | 37 | def compare_lexicon(lex_old, lex_neu): 38 | for node, lex in 
sorted(lex_old.items(), key=lambda x: len(x[1])): 39 | 40 | row_old = [] 41 | for text in sorted(lex_old[node].keys()): 42 | row_old.append(text) 43 | 44 | row_neu = [] 45 | for text in sorted(lex_neu[node].keys()): 46 | row_neu.append(text) 47 | 48 | print(node) 49 | print('old', row_old) 50 | print('neu', row_neu) 51 | print('') 52 | 53 | def compare_lexicon_stats(lex_old, lex_neu): 54 | threshold = 10 55 | stats = collections.Counter() 56 | for node, lex in sorted(lex_old.items(), key=lambda x: len(x[1])): 57 | 58 | row_old = [] 59 | for text in sorted(lex_old[node].keys()): 60 | row_old.append(text) 61 | 62 | row_neu = [] 63 | for text in sorted(lex_neu[node].keys()): 64 | row_neu.append(text) 65 | 66 | stats['total'] += 1 67 | 68 | if len(row_old) >= len(row_neu): 69 | stats['old >= neu'] += 1 70 | else: 71 | print(f'old < neu, {len(row_old)} - {len(row_neu)} = {len(row_old) - len(row_neu)}') 72 | print(node) 73 | print('old', row_old) 74 | print('neu', row_neu) 75 | print('') 76 | 77 | if len(row_old) == len(row_neu): 78 | stats['old == neu'] += 1 79 | 80 | if len(row_old) <= threshold: 81 | stats['old <= t'] += 1 82 | 83 | if len(row_neu) <= threshold: 84 | stats['neu <= t'] += 1 85 | else: 86 | print(f'neu > t, {len(row_old)} - {len(row_neu)} = {len(row_old) - len(row_neu)}') 87 | print(node) 88 | print('old', row_old) 89 | print('neu', row_neu) 90 | print('') 91 | 92 | for k, v in stats.items(): 93 | if k == 'total': 94 | continue 95 | 96 | n = stats['total'] 97 | print(f'{k} : {v} / {n} ({v/n:.3f})') 98 | 99 | def view_lexicon(lexicon): 100 | for node, lex in sorted(lexicon.items(), key=lambda x: len(x[1])): 101 | 102 | row = [] 103 | for text in sorted(lexicon[node].keys()): 104 | row.append(text) 105 | 106 | print('{} {}'.format(node, row)) 107 | 108 | compare_lexicon(lex_old, lex_neu) 109 | compare_lexicon_stats(lex_old, lex_neu) 110 | 111 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/make_splits.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import collections 5 | import numpy as np 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--input', default=os.path.expanduser('~/data/AMR2.0/aligned/cofill/train.txt'), type=str) 9 | args = parser.parse_args() 10 | 11 | def readfile(path): 12 | data = [] 13 | b = None 14 | with open(path) as f: 15 | for line in f: 16 | if line.strip(): 17 | if b is None: 18 | b = '' 19 | b += line 20 | else: 21 | if b is not None: 22 | data.append(b) 23 | b = None 24 | if b is not None: 25 | data.append(b) 26 | return data 27 | 28 | def writefile(data, path): 29 | print('writing', path) 30 | with open(path, 'w') as f: 31 | for b in data: 32 | f.write(b) 33 | f.write('\n') 34 | 35 | 36 | # read 37 | data = readfile(args.input) 38 | print(len(data)) 39 | 40 | # shuffle 41 | np.random.seed(113) 42 | np.random.shuffle(data) 43 | 44 | # split 45 | n = 1000 46 | 47 | # train 48 | train = data[n:] 49 | 50 | # unseen dev 51 | unseen = data[:n] 52 | 53 | # seen dev 54 | seen = train[:n] 55 | 56 | # write 57 | writefile(train, args.input + '.train-v1') 58 | writefile(unseen, args.input + '.dev-unseen-v1') 59 | writefile(seen, args.input + '.dev-seen-v1') 60 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/metric_utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 
import numpy as np 4 | 5 | from ibm_neural_aligner.amr_utils import convert_amr_to_tree, compute_pairwise_distance, get_node_ids 6 | 7 | 8 | def fertility_proxy(amr, ignore_nodes=('country', '-', 'and', 'person', 'name')): 9 | """ Measures the average number of aligned words per sentence. 10 | 11 | Lower indicates higher fertility. 12 | """ 13 | alignments = amr.alignments.copy() 14 | 15 | for k in list(alignments.keys()): 16 | if ignore_nodes is not None and amr.nodes[k] in ignore_nodes: 17 | del alignments[k] 18 | 19 | return len(set([v[0] for k, v in alignments.items()])) 20 | 21 | 22 | def distortion_proxy(amr, pairwise_dist=None): 23 | """ Measures the difference between implied and actual distance. 24 | 25 | Lower indicates lower distortion. 26 | """ 27 | if len(amr.nodes) == 1 or len(amr.alignments) == 1: 28 | return 0, [] 29 | 30 | if pairwise_dist is None: 31 | tree = convert_amr_to_tree(amr) 32 | pairwise_dist = compute_pairwise_distance(tree) 33 | node_ids = get_node_ids(amr) 34 | 35 | c = collections.defaultdict(list) 36 | 37 | for i in range(len(node_ids)): 38 | for j in range(len(node_ids)): 39 | if i <= j: 40 | continue 41 | node1, node2 = node_ids[i], node_ids[j] 42 | if node1 not in amr.alignments or node2 not in amr.alignments: 43 | continue 44 | pos1, pos2 = amr.alignments[node1][0], amr.alignments[node2][0] 45 | c['i'].append(i) 46 | c['j'].append(j) 47 | c['pos1'].append(pos1) 48 | c['pos2'].append(pos2) 49 | 50 | actual_distance = np.abs(np.array(c['pos1']) - np.array(c['pos2'])) 51 | implied_distance = pairwise_dist[c['i'], c['j']].numpy() 52 | proxy = np.power(np.clip(actual_distance - implied_distance, 0, np.inf), 2) 53 | 54 | return proxy.mean(), proxy 55 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/pretrained_embeddings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import json 4 | import os 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from tqdm import tqdm 10 | 11 | from ibm_neural_aligner.vocab_definitions import BOS_TOK, EOS_TOK, special_tokens 12 | from ibm_neural_aligner.standalone_elmo import batch_to_ids, ElmoCharacterEncoder, remove_sentence_boundaries 13 | 14 | 15 | # files for original elmo model 16 | weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5' 17 | options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json' 18 | 19 | 20 | def maybe_download(remote_url, cache_dir): 21 | path = os.path.join(cache_dir, os.path.basename(remote_url)) 22 | if not os.path.exists(path): 23 | os.system(f'curl {remote_url} -o {path} -L') 24 | return path 25 | 26 | 27 | def hash_string_list(string_list): 28 | m = hashlib.sha256() 29 | for s in string_list: 30 | m.update(str.encode(s)) 31 | return m.hexdigest()[:8] 32 | 33 | 34 | def read_text_vocab_file(path): 35 | output = [] 36 | with open(path) as f: 37 | for token in f.read().splitlines(): 38 | output.append(token) 39 | return output 40 | 41 | 42 | def read_amr_vocab_file(path): 43 | output = [] 44 | with open(path) as f: 45 | for token in f.read().splitlines(): 46 | output.append(token) 47 | return output 48 | 49 | 50 | def get_character_embeddings_from_elmo(tokens, cache_dir, cuda=False): 51 | assert len(special_tokens) == 3 52 | assert tokens[1] == BOS_TOK and tokens[2] == 
EOS_TOK 53 | 54 | # Remove special tokens. 55 | vocab_to_cache = tokens[3:] 56 | 57 | size = 512 58 | batch_size = 1024 59 | 60 | char_embedder = ElmoCharacterEncoder( 61 | options_file=maybe_download(options_file, cache_dir=cache_dir), 62 | weight_file=maybe_download(weights_file, cache_dir=cache_dir), 63 | requires_grad=False) 64 | if cuda: 65 | char_embedder.cuda() 66 | 67 | all_vocab_to_cache = [BOS_TOK, EOS_TOK] + vocab_to_cache 68 | 69 | shape = (1 + len(all_vocab_to_cache), size) 70 | embeddings = np.zeros(shape, dtype=np.float32) 71 | 72 | for start in tqdm(range(0, len(all_vocab_to_cache), batch_size), desc='embed'): 73 | end = min(start + batch_size, len(all_vocab_to_cache)) 74 | batch = all_vocab_to_cache[start:end] 75 | batch_ids = batch_to_ids([[x] for x in batch]) 76 | if cuda: 77 | batch_ids = batch_ids.cuda() 78 | output = char_embedder(batch_ids) 79 | vec = remove_sentence_boundaries(output['token_embedding'], output['mask'])[0].squeeze(1) 80 | 81 | embeddings[1 + start:1 + end] = vec.cpu() 82 | 83 | return embeddings 84 | 85 | 86 | def read_embeddings(tokens, path=None, cache_dir=None): 87 | if path is None: 88 | token_hash = hash_string_list(tokens) 89 | if cache_dir: 90 | path = '{}/elmo.{}.npy'.format(cache_dir, token_hash) 91 | else: 92 | path = 'elmo.{}.npy'.format(token_hash) 93 | assert os.path.exists(path), path 94 | print('reading embeddings from {} for {} tokens'.format(path, len(tokens))) 95 | embeddings = np.load(path) 96 | assert embeddings.shape[0] == len(tokens) 97 | return embeddings 98 | 99 | 100 | def write_embeddings(path, embeddings): 101 | np.save(path, embeddings) 102 | 103 | with open(path + '.shape', 'w') as f: 104 | f.write(json.dumps(embeddings.shape)) 105 | 106 | 107 | def main(arg): 108 | 109 | tokens = read_text_vocab_file(args.vocab) 110 | token_hash = hash_string_list(tokens) 111 | 112 | print('found {} tokens with hash = {}'.format(len(tokens), token_hash)) 113 | path = f'{args.cache_dir}/elmo.{token_hash}.npy' 114 | 115 | if os.path.exists(path): 116 | print('embeddings found at {}, exiting'.format(path)) 117 | sys.exit() 118 | 119 | embeddings = get_character_embeddings_from_elmo(tokens, args.cache_dir, args.cuda) 120 | 121 | print(f'writing to {path}') 122 | write_embeddings(path, embeddings) 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | parser = argparse.ArgumentParser() 128 | parser.add_argument("--vocab", type=str, help="Vocab file.", 129 | required=True) 130 | parser.add_argument('--cuda', action='store_true', 131 | help='If true, then use GPU.') 132 | parser.add_argument('--cache-dir', type=str, required=True, 133 | help='Folder to save elmo weights and embeddings.') 134 | args = parser.parse_args() 135 | 136 | if not torch.cuda.is_available(): 137 | print('WARNING: CUDA not available. Falling back to CPU.') 138 | args.cuda = False 139 | 140 | main(args) 141 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/pretrained_embeddings.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | # this requires a special environment with allennlp 5 | [ -z "$1" ] && echo -e "\n$0 /path/to/embeddings/ (where vocab..txt are) \n" 6 | FOLDER=$1 7 | 8 | set -o nounset 9 | 10 | python ibm_neural_aligner/pretrained_embeddings.py --cuda --allow-cpu \ 11 | --vocab $FOLDER/vocab.text.txt \ 12 | --cache-dir $FOLDER/ 13 | python ibm_neural_aligner/pretrained_embeddings.py --cuda --allow-cpu \ 14 | --vocab $FOLDER/vocab.amr.txt \ 15 | --cache-dir $FOLDER/ 16 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/run_detailed_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import os 5 | 6 | from tqdm import tqdm 7 | 8 | import numpy as np 9 | 10 | from evaluation import EvalAlignments 11 | from formatter import amr_to_pretty_format 12 | from transition_amr_parser.io import read_amr 13 | 14 | 15 | class CorpusRecall_WithGoldSpans_WithSomeNodes(object): 16 | 17 | def __init__(self): 18 | self.state = collections.defaultdict(list) 19 | 20 | def update(self, gold, pred, pred_ref): 21 | gold_align, pred_align = gold.alignments, pred.alignments 22 | total, correct = 0, 0 23 | 24 | for node_id in gold_align.keys(): 25 | 26 | # Ignore unaligned nodes 27 | if gold_align[node_id] is None: 28 | continue 29 | 30 | # Penalty for not predicting. 31 | if node_id not in pred_align or pred_align[node_id] is None: 32 | total += 1 33 | continue 34 | 35 | # Be fair. 36 | if node_id not in pred_ref.alignments: 37 | total += 1 38 | continue 39 | 40 | total += 1 41 | 42 | g0 = gold_align[node_id][0] - 1 43 | g1 = gold_align[node_id][-1] - 1 44 | 45 | p0 = pred_align[node_id][0] - 1 46 | p1 = pred_align[node_id][-1] - 1 47 | 48 | gset = set(range(g0, g1 + 1)) 49 | pset = set(range(p0, p1 + 1)) 50 | 51 | if len(set.intersection(pset, gset)) > 0: 52 | correct += 1 53 | 54 | self.state['total'].append(total) 55 | self.state['correct'].append(correct) 56 | 57 | def finish(self): 58 | total = np.sum(self.state['total']).item() 59 | correct = np.sum(self.state['correct']).item() 60 | if total: 61 | recall = correct / total 62 | else: 63 | recall = 0 64 | result = collections.OrderedDict() 65 | result['correct'] = correct 66 | result['total'] = total 67 | result['recall'] = recall 68 | 69 | return result 70 | 71 | 72 | def main(args): 73 | gold = read_amr(args.gold) 74 | neural = read_amr(args.neural) 75 | cofill = read_amr(args.cofill) 76 | 77 | d_gold = {amr.id: amr for amr in gold} 78 | d_neural = {amr.id: amr for amr in neural} 79 | d_cofill = {amr.id: amr for amr in cofill} 80 | 81 | keys = [k for k in d_gold.keys() if k in d_neural and k in d_cofill] 82 | 83 | gold = [d_gold[k] for k in keys] 84 | neural = [d_neural[k] for k in keys] 85 | cofill = [d_cofill[k] for k in keys] 86 | 87 | def check_1(): 88 | for g, p in zip(gold, neural): 89 | for k, v in g.nodes.items(): 90 | assert v == p.nodes[k], (k, v, p.nodes[k], g.id) 91 | 92 | for g, p in zip(gold, cofill): 93 | for k, v in g.nodes.items(): 94 | assert v == p.nodes[k], (k, v, p.nodes[k], g.id) 95 | 96 | def print_result(result, header=None): 97 | def format_value(val): 98 | if isinstance(val, float): 99 | return '{:.3f}'.format(val) 100 | return val 101 | 102 | underl = '-' * len(header) 103 | output = '{}\n{}\n'.format(header, underl) 104 | for k, v in result.items(): 105 | output += '- {} = {}\n'.format(k, format_value(v)) 106 | print(output) 107 | 108 | def run_eval(): 109 | m_n = 
CorpusRecall_WithGoldSpans_WithSomeNodes() 110 | m_c = CorpusRecall_WithGoldSpans_WithSomeNodes() 111 | 112 | for i, (g, p_n, p_c) in tqdm(enumerate(zip(gold, neural, cofill)), desc='eval'): 113 | m_n.update(g, p_n, p_c) 114 | m_c.update(g, p_c, p_c) 115 | 116 | res_n = m_n.finish() 117 | res_c = m_c.finish() 118 | 119 | print_result(res_n, 'Neural') 120 | print_result(res_c, 'COFILL') 121 | 122 | 123 | check_1() 124 | run_eval() 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--gold', default=None, required=True, type=str) 130 | parser.add_argument('--neural', default=None, required=True, type=str) 131 | parser.add_argument('--cofill', default=None, required=True, type=str) 132 | args = parser.parse_args() 133 | 134 | main(args) 135 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/run_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | from amr_utils import safe_read 6 | from evaluation import EvalAlignments 7 | from formatter import amr_to_pretty_format 8 | from transition_amr_parser.io import read_amr2 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--gold', default=None, required=True, type=str) 13 | parser.add_argument('--pred', default=None, required=True, type=str) 14 | parser.add_argument('--out-json', default=None, type=str) 15 | parser.add_argument('--subset', action='store_true') 16 | parser.add_argument('--increment', action='store_true') 17 | args = parser.parse_args() 18 | 19 | if args.out_json is None: 20 | args.out_json = args.pred + '.eval.json' 21 | 22 | print('start eval') 23 | 24 | eval_output = EvalAlignments().run(args.gold, args.pred, flexible=True, subset=args.subset, increment=args.increment) 25 | 26 | print(eval_output) 27 | 28 | with open(args.out_json, 'w') as f: 29 | f.write(json.dumps(eval_output)) 30 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/tokenize_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transition_amr_parser.amr import protected_tokenizer 3 | 4 | 5 | def parse_arguments(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--in-amr", type=str, help="AMR file to be tokenized", 8 | required=True) 9 | parser.add_argument("--out-amr", type=str, help="Output AMR file.", 10 | required=True) 11 | return parser.parse_args() 12 | 13 | 14 | def main(args): 15 | """ 16 | Add `# ::tok` line with newly tokenized sentence. 
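Raises an exception if the input already contains '# ::tok' lines. Each '# ::snt' line is copied through and immediately followed by a '# ::tok' line built with protected_tokenizer; all other lines are copied unchanged.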
17 | """ 18 | 19 | # read and write 20 | with open(args.in_amr) as f_in, open(args.out_amr, 'w') as f_out: 21 | for line in f_in: 22 | 23 | if line.startswith('# ::tok'): 24 | raise Exception("File already tokenized!") 25 | 26 | elif line.startswith('# ::snt'): 27 | f_out.write(line) 28 | 29 | # tokenize 30 | sentence = line.split('# ::snt')[-1].strip() 31 | tokens, _ = protected_tokenizer(sentence) 32 | tokens_str = ' '.join(tokens) 33 | f_out.write(f'# ::tok {tokens_str}\n') 34 | 35 | else: 36 | f_out.write(line) 37 | 38 | 39 | if __name__ == '__main__': 40 | main(parse_arguments()) 41 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/transformer_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Originally from: 3 | https://github.com/pytorch/examples/blob/13acec6d7c78dacd5e1fe9b0b4a325e1d39abc15/word_language_model/model.py 4 | """ 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class PositionalEncoding(nn.Module): 13 | r"""Inject some information about the relative or absolute position of the tokens 14 | in the sequence. The positional encodings have the same dimension as 15 | the embeddings, so that the two can be summed. Here, we use sine and cosine 16 | functions of different frequencies. 17 | .. math:: 18 | \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) 19 | \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) 20 | \text{where pos is the word position and i is the embed idx) 21 | Args: 22 | d_model: the embed dim (required). 23 | dropout: the dropout value (default=0.1). 24 | max_len: the max. length of the incoming sequence (default=5000). 25 | Examples: 26 | >>> pos_encoder = PositionalEncoding(d_model) 27 | """ 28 | 29 | def __init__(self, d_model, dropout=0.1, max_len=5000): 30 | super().__init__() 31 | self.dropout = nn.Dropout(p=dropout) 32 | 33 | pe = torch.zeros(max_len, d_model) 34 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 35 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 36 | pe[:, 0::2] = torch.sin(position * div_term) 37 | pe[:, 1::2] = torch.cos(position * div_term) 38 | pe = pe.unsqueeze(0).transpose(0, 1) 39 | self.register_buffer('pe', pe) 40 | 41 | def forward(self, x): 42 | r"""Inputs of forward function 43 | Args: 44 | x: the sequence fed to the positional encoder model (required). 
45 | Shape: 46 | x: [sequence length, batch size, embed dim] 47 | output: [sequence length, batch size, embed dim] 48 | Examples: 49 | >>> output = pos_encoder(x) 50 | """ 51 | 52 | x = x + self.pe[:x.size(0), :] 53 | return self.dropout(x) 54 | 55 | 56 | class TransformerModel(nn.Module): 57 | 58 | def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.5): 59 | super().__init__() 60 | try: 61 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 62 | except: 63 | raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.') 64 | self.model_type = 'Transformer' 65 | self.src_mask = None 66 | self.pos_encoder = PositionalEncoding(ninp, dropout) 67 | encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) 68 | self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) 69 | self.ninp = ninp 70 | 71 | def _generate_square_subsequent_mask(self, sz): 72 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 73 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) 74 | return mask 75 | 76 | def forward(self, src, src2=None, has_mask=True): 77 | if has_mask: 78 | device = src.device 79 | if self.src_mask is None or self.src_mask.size(0) != len(src): 80 | mask = self._generate_square_subsequent_mask(len(src)).to(device) 81 | self.src_mask = mask 82 | else: 83 | self.src_mask = None 84 | 85 | if src2 is None: 86 | src2 = self.pos_encoder(src * math.sqrt(self.ninp)) 87 | output = self.transformer_encoder(src, self.src_mask) 88 | return output 89 | 90 | 91 | class BiTransformer(nn.Module): 92 | def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.5): 93 | super().__init__() 94 | 95 | self.fwd_enc = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 96 | self.bwd_enc = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 97 | 98 | def forward(self, src): 99 | assert len(src.shape) == 3 100 | 101 | src = self.fwd_enc.pos_encoder(src * math.sqrt(self.fwd_enc.ninp)) 102 | 103 | # FORWARD 104 | fwd_out = self.fwd_enc(src, src) 105 | 106 | # BACKWARD 107 | bwd_src = torch.flip(src, [1]) 108 | bwd_out = self.bwd_enc(bwd_src, bwd_src) 109 | bwd_out = torch.flip(bwd_out, [1]) 110 | 111 | output = torch.cat([fwd_out, bwd_out], -1) 112 | 113 | return output 114 | 115 | 116 | class TransformerLM(nn.Module): 117 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 118 | super().__init__() 119 | self.ninp = ninp 120 | self.decoder = nn.Linear(ninp, ntoken) 121 | self.transformer_encoder = TransformerModel(ninp, nhead, nhid, nlayers, dropout) 122 | 123 | self.init_weights() 124 | 125 | def init_weights(self): 126 | initrange = 0.1 127 | nn.init.uniform_(self.encoder.weight, -initrange, initrange) 128 | nn.init.zeros_(self.decoder.weight) 129 | nn.init.uniform_(self.decoder.weight, -initrange, initrange) 130 | 131 | def forward(self, src, has_mask=True): 132 | src = self.encoder(src) 133 | output = self.transformer_encoder(src) 134 | output = self.decoder(output) 135 | return F.log_softmax(output, dim=-1) 136 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/view_manual_alignments.py: -------------------------------------------------------------------------------- 1 | """ 2 | not found manual_dev 3 | - overlap 150 4 | - notfound 0 5 | not found manual_test 6 | - overlap 188 7 | - notfound 12 8 | - edinburgh_1003.8 9 | - edinburgh_1003.9 10 | - edinburgh_1003.3 11 | - edinburgh_1003.4 12 | - edinburgh_1003.7 13 | - 
edinburgh_1003.10 14 | - edinburgh_1003.1 15 | - edinburgh_1003.2 16 | - edinburgh_1003.5 17 | - NATHANS_EXAMPLE 18 | - edinburgh_1003.6 19 | - AUSTINS_EXAMPLE 20 | """ 21 | 22 | 23 | import argparse 24 | import collections 25 | import json 26 | 27 | from tqdm import tqdm 28 | 29 | from transition_amr_parser.io import read_amr2 30 | 31 | 32 | MY_GLOBALS = {} 33 | MY_GLOBALS['found'] = set() 34 | 35 | 36 | def read_json(filename): 37 | with open(filename) as f: 38 | return json.loads(f.read()) 39 | 40 | 41 | def get_keys(corpus): 42 | if isinstance(corpus, (tuple, list)): 43 | d = {} 44 | deleted = set() 45 | for amr in corpus: 46 | if amr.id in deleted: 47 | continue 48 | if amr.id in d: 49 | del d[amr.id] 50 | print(f'deleted {amr.id}') 51 | continue 52 | d[amr.id] = amr 53 | return get_keys(d) 54 | return corpus.keys() 55 | 56 | 57 | def print_overlap(datasets, name_a, name_b): 58 | keys_a = set(get_keys(datasets[name_a])) 59 | keys_b = set(get_keys(datasets[name_b])) 60 | overlap = set.intersection(keys_a, keys_b) 61 | print(f'overlap\n- {name_a} = {len(keys_a)}\n- {name_b} = {len(keys_b)}\n- overlap = {len(overlap)}') 62 | 63 | # Update found. 64 | MY_GLOBALS['found'] = set.union(MY_GLOBALS['found'], overlap) 65 | 66 | 67 | def check_overlap_austin_and_manual(datasets): 68 | print_overlap(datasets, 'austin', 'manual_dev') 69 | print_overlap(datasets, 'austin', 'manual_test') 70 | 71 | 72 | def check_overlap_prince_and_manual(datasets): 73 | print_overlap(datasets, 'prince_amr', 'manual_dev') 74 | print_overlap(datasets, 'prince_amr', 'manual_test') 75 | 76 | 77 | def check_overlap_amr3_and_manual(datasets): 78 | print_overlap(datasets, 'amr3_train', 'manual_dev') 79 | print_overlap(datasets, 'amr3_train', 'manual_test') 80 | 81 | print_overlap(datasets, 'amr3_dev', 'manual_dev') 82 | print_overlap(datasets, 'amr3_dev', 'manual_test') 83 | 84 | print_overlap(datasets, 'amr3_test', 'manual_dev') 85 | print_overlap(datasets, 'amr3_test', 'manual_test') 86 | 87 | 88 | def check_notfound(datasets): 89 | 90 | for name in ['manual_dev', 'manual_test']: 91 | print(f'not found {name}') 92 | keys = set(datasets[name].keys()) 93 | overlap = set.intersection(MY_GLOBALS['found'], keys) 94 | notfound = {k for k in keys if k not in overlap} 95 | print(f'- overlap {len(overlap)}') 96 | print(f'- notfound {len(notfound)}') 97 | for k in notfound: 98 | print(f'- {k}') 99 | 100 | 101 | def main(): 102 | paths = {} 103 | 104 | # This has some useful information such as node names, but it is not clear 105 | # which are manual alignments. 106 | paths['austin'] = 'ldc+little_prince.subgraph_alignments.json' 107 | 108 | # This does not have node names, but does have AMR ids for manually aligned AMR. 109 | paths['manual_dev'] = "leamr/data-release/alignments/leamr_dev.subgraph_alignments.gold.json" 110 | paths['manual_test'] = "leamr/data-release/alignments/leamr_test.subgraph_alignments.gold.json" 111 | 112 | # Path to little prince data. 113 | paths['prince_amr'] = 'amr-bank-struct-v1.6.dummy_align.txt' 114 | 115 | # Path to amr3 data. 
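# The *.dummy_align.txt files are assumed to be produced by dummy_align.py, which attaches a placeholder alignment (token 0) to every node.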
116 | paths['amr3_train'] = 'DATA/AMR3.0/corpora/train.dummy_align.txt' 117 | paths['amr3_dev'] = 'DATA/AMR3.0/corpora/dev.dummy_align.txt' 118 | paths['amr3_test'] = 'DATA/AMR3.0/corpora/test.dummy_align.txt' 119 | 120 | for k, v in paths.items(): 121 | print(k, v) 122 | 123 | datasets = {} 124 | datasets = {k: read_amr2(v, ibm_format=True, tokenize=False) if 'amr' in k else read_json(v) for k, v in paths.items()} 125 | 126 | check_overlap_austin_and_manual(datasets) 127 | check_overlap_prince_and_manual(datasets) 128 | check_overlap_amr3_and_manual(datasets) 129 | check_notfound(datasets) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/vocab.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | from ipdb import set_trace 4 | from ibm_neural_aligner.vocab_definitions import ( 5 | PADDING_IDX, PADDING_TOK, BOS_IDX, BOS_TOK, EOS_IDX, EOS_TOK, special_tokens 6 | ) 7 | from transition_amr_parser.amr import protected_tokenizer 8 | from transition_amr_parser.io import read_amr 9 | 10 | 11 | def get_tokens(path, jamr=False, tokenize=False): 12 | local_tokens = set() 13 | local_graph_tokens = set() 14 | 15 | for amr in read_amr(path, jamr=jamr): 16 | 17 | if tokenize: 18 | assert amr.sentence 19 | tokens, _ = protected_tokenizer(amr.sentence) 20 | 21 | else: 22 | assert amr.tokens, \ 23 | f"Unless --tokenize used, {path} must contain a # ::tok field" 24 | tokens = amr.tokens 25 | 26 | # surface tokens 27 | local_tokens.update(tokens) 28 | # graph tokens 29 | for _, label, _ in amr.edges: 30 | local_graph_tokens.add(label) 31 | local_graph_tokens.update(amr.nodes.values()) 32 | 33 | return local_tokens, local_graph_tokens 34 | 35 | 36 | def main(args): 37 | 38 | summary = collections.defaultdict(list) 39 | 40 | # collect information for all AMR 41 | tokens = set() 42 | graph_tokens = set() 43 | for amr_file in args.in_amrs: 44 | 45 | print('reading {}\n'.format(amr_file)) 46 | txt_toks, amr_toks = get_tokens( 47 | amr_file, jamr=False, tokenize=args.tokenize 48 | ) 49 | tokens = set.union(tokens, txt_toks) 50 | graph_tokens = set.union(graph_tokens, amr_toks) 51 | 52 | o = {} 53 | o['txt'] = len(txt_toks) 54 | o['amr'] = len(amr_toks) 55 | o['success'] = True 56 | 57 | summary[amr_file].append(o) 58 | print(o) 59 | 60 | # graph_tokens.add('') 61 | graph_tokens.add('(') 62 | graph_tokens.add(')') 63 | for tok in special_tokens: 64 | if tok in tokens: 65 | tokens.remove(tok) 66 | if tok in graph_tokens: 67 | graph_tokens.remove(tok) 68 | 69 | # Add special symbols at the beginning 70 | # surface 71 | tokens = special_tokens + sorted(tokens) 72 | # graph 73 | # useful for linearized parse 74 | graph_tokens = special_tokens + sorted(graph_tokens) 75 | 76 | # print summary 77 | print('summary\n-------') 78 | 79 | for k, v in summary.items(): 80 | print(k) 81 | for vv in v: 82 | print(vv) 83 | print('') 84 | 85 | print('writing...') 86 | 87 | # write files 88 | print('found {} text tokens'.format(len(tokens))) 89 | with open(args.out_text, 'w') as f: 90 | for tok in tokens: 91 | f.write(tok + '\n') 92 | print('found {} amr tokens'.format(len(graph_tokens))) 93 | with open(args.out_amr, 'w') as f: 94 | for tok in graph_tokens: 95 | f.write(tok + '\n') 96 | 97 | 98 | if __name__ == '__main__': 99 | import argparse 100 | 101 | # Argument handling 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument( 
104 | "--in-amrs", help="Read AMR files to determine vocabulary.", 105 | nargs='+', required=True) 106 | parser.add_argument( 107 | "--out-text", help="Output text vocab.", 108 | required=True) 109 | parser.add_argument( 110 | "--out-amr", help="Output amr vocab.", 111 | required=True) 112 | parser.add_argument( 113 | "--tokenize", help="Use JAMR-like tokenization instad of # ::tok.", 114 | action='store_true') 115 | args = parser.parse_args() 116 | 117 | print(json.dumps(args.__dict__)) 118 | 119 | main(args) 120 | -------------------------------------------------------------------------------- /src/ibm_neural_aligner/vocab_definitions.py: -------------------------------------------------------------------------------- 1 | class MaskInfo: 2 | unchanged = 0 3 | masked = 1 4 | unchanged_and_predict = 2 5 | 6 | PADDING_IDX = 0 7 | PADDING_TOK = '' 8 | 9 | BOS_IDX = 1 10 | BOS_TOK = '' 11 | 12 | EOS_IDX = 2 13 | EOS_TOK = '' 14 | 15 | special_tokens = [PADDING_TOK, BOS_TOK, EOS_TOK] 16 | 17 | assert special_tokens.index(PADDING_TOK) == PADDING_IDX 18 | assert special_tokens.index(BOS_TOK) == BOS_IDX 19 | assert special_tokens.index(EOS_TOK) == EOS_IDX 20 | -------------------------------------------------------------------------------- /src/transition_amr_parser/__init__.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import warnings 3 | # check for installation of torch-scatter 4 | try: 5 | import torch_scatter 6 | except: 7 | warnings.warn("torch-scatter is either not installed or not properly installed; please check for the appropriate version", UserWarning) 8 | raise Exception("please review README.d instructions on installing the appropriate version of torch-scatter") 9 | # cmd = ["pip", "install", "torch-scatter", "-f", "https://data.pyg.org/whl/torch-1.13.1+cu117.html"] 10 | # print("try downloading torch-scatter") 11 | # subprocess.call(cmd) 12 | 13 | # set this to true to start the debugger on any exception 14 | DEBUG_MODE = False 15 | if DEBUG_MODE: 16 | import sys 17 | import ipdb 18 | import traceback 19 | 20 | def debughook(etype, value, tb): 21 | traceback.print_exception(etype, value, tb) 22 | print() 23 | # post-mortem debugger 24 | ipdb.pm() 25 | sys.excepthook = debughook 26 | -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/transition-amr-parser/6edc0f116ebc906a49b780eae665a9ed860b4f73/src/transition_amr_parser/action_pointer/__init__.py -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/amr_parser.py: -------------------------------------------------------------------------------- 1 | # Standalone AMR parser 2 | 3 | import os 4 | import json 5 | import torch 6 | from transition_amr_parser.model import AMRModel 7 | import transition_amr_parser.utils as utils 8 | from fairseq.models.roberta import RobertaModel 9 | from transition_amr_parser.roberta_utils import extract_features_aligned_to_words 10 | 11 | 12 | class AMRParser(): 13 | 14 | def __init__(self, model_path, roberta_cache_path=None, oracle_stats_path=None, config_path=None, model_use_gpu=False, roberta_use_gpu=False, verbose=False, logger=None): 15 | if not oracle_stats_path: 16 | model_folder = os.path.dirname(model_path) 17 | oracle_stats_path = os.path.join(model_folder, 
"train.rules.json") 18 | assert os.path.isfile(oracle_stats_path), \ 19 | f'Expected train.rules.json in {model_folder}' 20 | if not config_path: 21 | model_folder = os.path.dirname(model_path) 22 | config_path = os.path.join(model_folder, "config.json") 23 | assert os.path.isfile(config_path), \ 24 | f'Expected config.json in {model_folder}' 25 | self.model = self.load_model(model_path, oracle_stats_path, config_path, model_use_gpu) 26 | self.roberta = self.load_roberta(roberta_use_gpu, roberta_cache_path) 27 | self.logger = logger 28 | 29 | def load_roberta(self, roberta_use_gpu, roberta_cache_path=None): 30 | 31 | if not roberta_cache_path: 32 | # Load the Roberta Model from torch hub 33 | roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') 34 | else: 35 | roberta = RobertaModel.from_pretrained(roberta_cache_path, checkpoint_file='model.pt') 36 | roberta.eval() 37 | if roberta_use_gpu: 38 | roberta.cuda() 39 | return roberta 40 | 41 | def load_model(self, model_path, oracle_stats_path, config_path, model_use_gpu): 42 | 43 | oracle_stats = json.load(open(oracle_stats_path)) 44 | config = json.load(open(config_path)) 45 | model = AMRModel( 46 | oracle_stats=oracle_stats, 47 | embedding_dim=config["embedding_dim"], 48 | action_embedding_dim=config["action_embedding_dim"], 49 | char_embedding_dim=config["char_embedding_dim"], 50 | hidden_dim=config["hidden_dim"], 51 | char_hidden_dim=config["char_hidden_dim"], 52 | rnn_layers=config["rnn_layers"], 53 | dropout_ratio=config["dropout_ratio"], 54 | pretrained_dim=config["pretrained_dim"], 55 | use_bert=config["use_bert"], 56 | use_gpu=model_use_gpu, 57 | use_chars=config["use_chars"], 58 | use_attention=config["use_attention"], 59 | use_function_words=config["use_function_words"], 60 | use_function_words_rels=config["use_function_words_rels"], 61 | parse_unaligned=config["parse_unaligned"], 62 | weight_inputs=config["weight_inputs"], 63 | attend_inputs=config["attend_inputs"] 64 | ) 65 | 66 | model.load_state_dict(torch.load(model_path)) 67 | model.eval() 68 | return model 69 | 70 | def get_embeddings(self, tokens): 71 | features = extract_features_aligned_to_words(self.roberta, tokens=tokens, use_all_layers=True, return_all_hiddens=True) 72 | embeddings = [] 73 | for tok in features: 74 | if str(tok) not in ['', '']: 75 | embeddings.append(tok.vector) 76 | embeddings = torch.stack(embeddings).detach().cpu().numpy() 77 | return embeddings 78 | 79 | def parse_sentence(self, tokens): 80 | # The model expects token at the end of the input sentence 81 | if tokens[-1] != "": 82 | tokens.append("") 83 | sent_rep = utils.vectorize_words(self.model, tokens, training=False, gpu=self.model.use_gpu) 84 | bert_emb = self.get_embeddings(tokens) 85 | amr = self.model.parse_sentence(tokens, sent_rep, bert_emb) 86 | return amr 87 | -------------------------------------------------------------------------------- /src/transition_amr_parser/action_pointer/roberta_utils.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import torch 3 | from spacy.tokens import Doc 4 | import copy 5 | from fairseq.models.roberta.alignment_utils import spacy_nlp 6 | from fairseq.data.data_utils import collate_tokens 7 | 8 | 9 | def get_tokens(roberta, word): 10 | return roberta.task.source_dictionary.encode_line(roberta.bpe.encode(word), append_eos=False, add_if_not_exist=False) 11 | 12 | 13 | def get_alignments_and_tokens(roberta, words): 14 | bpe_tokens = [] 15 | alignment_position = 1 16 | 
alignments = [] 17 |     first_word_tokens = get_tokens(roberta, words[0]) 18 |     bpe_tokens.extend(first_word_tokens) 19 |     alignments.append([(alignment_position + i) for i in range(0, len(first_word_tokens))]) 20 |     alignment_position = alignment_position + len(first_word_tokens) 21 | 22 |     for word in words[1:]: 23 |         tokens = get_tokens(roberta, " " + word) 24 |         bpe_tokens.extend(tokens) 25 |         alignments.append([(alignment_position + i) for i in range(0, len(tokens))]) 26 |         alignment_position = alignment_position + len(tokens) 27 | 28 |     final_bpe_tokens = [roberta.task.source_dictionary.index('<s>')] + bpe_tokens + [roberta.task.source_dictionary.index('</s>')] 29 |     return alignments, torch.LongTensor(final_bpe_tokens) 30 | 31 | 32 | def align_features_to_words(roberta, features, alignment): 33 |     """ 34 |     Align given features to words. 35 | 36 |     Args: 37 |         roberta (RobertaHubInterface): RoBERTa instance 38 |         features (torch.Tensor): features to align of shape `(T_bpe x C)` 39 |         alignment: alignment between BPE tokens and words returned by 40 |             func:`align_bpe_to_words`. 41 |     """ 42 |     assert features.dim() == 2 43 | 44 |     bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices) 45 |     assert bpe_counts[0] == 0  # <s> shouldn't be aligned 46 |     denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))]) 47 |     weighted_features = features / denom.unsqueeze(-1) 48 |     output = [weighted_features[0]] 49 |     largest_j = -1 50 |     for bpe_indices in alignment: 51 |         output.append(weighted_features[bpe_indices].sum(dim=0)) 52 |         largest_j = max(largest_j, *bpe_indices) 53 |     for j in range(largest_j + 1, len(features)): 54 |         output.append(weighted_features[j]) 55 |     output = torch.stack(output) 56 |     return output 57 | 58 | 59 | def extract_features_aligned_to_words_batched(model, sentences: list, use_all_layers: bool = True, return_all_hiddens: bool = False) -> torch.Tensor: 60 |     nlp = spacy_nlp() 61 |     bpe_toks = [] 62 |     alignments = [] 63 |     spacy_tokens = [] 64 |     for sentence in sentences: 65 |         toks = sentence.split() 66 |         alignment, bpe_tok = get_alignments_and_tokens(model, toks) 67 |         bpe_toks.append(bpe_tok) 68 |         alignments.append(alignment) 69 |         spacy_tokens.append(toks) 70 | 71 |     bpe_toks_collated = collate_tokens(bpe_toks, pad_idx=1) 72 | 73 |     features = model.extract_features(bpe_toks_collated, return_all_hiddens=return_all_hiddens) 74 |     final_features = sum(features[1:])/(len(features)-1) 75 | 76 |     results = [] 77 |     for bpe_tok, final_feature, alignment, toks in zip(bpe_toks, final_features, alignments, spacy_tokens): 78 |         aligned_feats = align_features_to_words(model, final_feature[0:bpe_tok.shape[0]], alignment) 79 |         doc = Doc( 80 |             nlp.vocab, 81 |             words=['<s>'] + [x for x in toks] + ['</s>'], 82 |         ) 83 |         # bind the current features via a default argument to avoid late binding in the loop 84 |         doc.user_token_hooks['vector'] = lambda token, feats=aligned_feats: feats[token.i] 85 |         results.append(copy.copy(doc)) 86 | 87 |     return results 88 | 89 | 90 | def extract_features_aligned_to_words(model, tokens: list, use_all_layers: bool = True, return_all_hiddens: bool = False) -> torch.Tensor: 91 |     nlp = spacy_nlp() 92 |     alignment, bpe_tok = get_alignments_and_tokens(model, tokens) 93 |     features = model.extract_features(bpe_tok, return_all_hiddens=return_all_hiddens) 94 |     final_features = sum(features[1:])/(len(features)-1) 95 |     final_features = final_features.squeeze(0) 96 |     aligned_feats = align_features_to_words(model, final_features, alignment) 97 |     doc = Doc( 98 |         nlp.vocab, 99 |         words=['<s>'] + [x for x in tokens] + ['</s>'] 100 |     ) 101 |     doc.user_token_hooks['vector'] = lambda token: 
aligned_feats[token.i] 102 |     return doc 103 | -------------------------------------------------------------------------------- /src/transition_amr_parser/add_id_to_amr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def amr_add_id(file_path,file_path_id): 4 | 5 |     with open(file_path_id) as fid1: 6 |         ids_list = [] 7 |         for line in fid1.readlines(): 8 |             if '# ::id ' in line: 9 |                 ids_list.append(line) 10 | 11 | 12 |     with open(file_path) as fid2: 13 |         raw_amr = [] 14 |         ids_idx = 0 15 |         for line in fid2.readlines(): 16 |             if '::tok' in line : 17 |                 raw_amr.append(ids_list[ids_idx]) 18 |                 ids_idx+=1 19 |             raw_amr.append(line) 20 |         assert len(ids_list)==ids_idx 21 | 22 |     with open(file_path.rstrip('.txt')+'_id-added.txt','w') as fid3: 23 |         for line in raw_amr: 24 |             fid3.write(line) 25 | 26 | 27 | 28 | 29 | 30 | if __name__ == '__main__': 31 |     parser = argparse.ArgumentParser( 32 |         description='Add # ::id fields to an AMR file, taking the ids from a second AMR file aligned to the same sentences' 33 |     ) 34 |     # Single input parameters 35 |     parser.add_argument( 36 |         "--in-aligned-amr", 37 |         help="In file containing AMR in penman format AND isi alignments ", 38 |         type=str, 39 |         default='DATA/AMR3.0/aligned/cofill_isi/train.txt' 40 |     ) 41 | 42 |     parser.add_argument( 43 |         "--amr-with-id", 44 |         help="add id to --in-aligned-amr using the file given", 45 |         type=str, 46 |         default='DATA/AMR3.0/aligned/cofill/train.txt' 47 |     ) 48 |     args = parser.parse_args() 49 | 50 |     amr_add_id(args.in_aligned_amr,args.amr_with_id) 51 | -------------------------------------------------------------------------------- /src/transition_amr_parser/add_sentence_amrs_to_file.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | 4 | 5 | 6 | def main(args): 7 | 8 |     tqdm_amrs_str = read_blocks(args.in_amr) 9 | 10 | 11 |     with open(args.out_amr, 'a') as fid: 12 |         for idx, penman_str in enumerate(tqdm_amrs_str): 13 |             fid.write(penman_str+'\n') 14 | 15 | 16 | 17 | 18 | if __name__ == '__main__': 19 |     parser = ArgumentParser() 20 |     parser.add_argument( 21 |         "--in-amr", 22 |         help="In file containing AMR in penman format", 23 |         type=str 24 |     ) 25 |     parser.add_argument( 26 |         "--out-amr", 27 |         help="path to save amr", 28 |         type=str, 29 |     ) 30 |     args = parser.parse_args() 31 |     main(args) -------------------------------------------------------------------------------- /src/transition_amr_parser/clbar.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 International Business Machines 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # This file is standalone and intended to also be used separately from the 16 | # repository, hence the attached license above. 
17 | 18 | import shutil 19 | from collections import Counter 20 | from datetime import datetime 21 | # external module 22 | import numpy as np 23 | 24 | 25 | def red_background(string): 26 |     return "\033[101m%s\033[0m" % string 27 | 28 | 29 | def yellow_font(string): 30 |     return "\033[93m%s\033[0m" % string 31 | 32 | 33 | def green_font(string): 34 |     return "\033[92m%s\033[0m" % string 35 | 36 | 37 | def print_log(module, string): 38 |     """formats printing of log to stdout""" 39 |     timestamp = str(datetime.now()).split('.')[0] 40 |     print(f'{timestamp} [{module}] {string}') 41 | 42 | 43 | def clbar( 44 |     xy=None,  # list of (x, y) tuples or Counter 45 |     x=None, 46 |     y=None, 47 |     ylim=(None, None), 48 |     ncol=None,  # Max number of columns for display (default: terminal width) 49 |     # show only top and bottom values 50 |     topx=None, 51 |     botx=None, 52 |     topy=None, 53 |     boty=None, 54 |     # normalize to sum to 1 55 |     norm=False, 56 |     xfilter=None,  # f(x) returns True to keep this example in the display 57 |     yform=None  # Function receiving a single y value, returns a string 58 | ): 59 |     """Print data structure in command line""" 60 |     # Sanity checks 61 |     if x is None and y is None: 62 |         if isinstance(xy, np.ndarray): 63 |             labels = [f'{i}' for i in range(xy.shape[0])] 64 |             xy = list(zip(labels, list(xy))) 65 |         elif isinstance(xy, Counter): 66 |             xy = [(str(x), y) for x, y in xy.items()] 67 |         else: 68 |             assert isinstance(xy, list), "Expected list of tuples" 69 |             assert isinstance(xy[0], tuple), "Expected list of tuples" 70 |     else: 71 |         assert x is not None and y is not None 72 |         assert isinstance(x, list) 73 |         assert isinstance(y, list) or isinstance(y, np.ndarray) 74 |         assert len(x) == len(list(y)) 75 |         xy = list(zip(x, y)) 76 | 77 |     # normalize 78 |     if norm: 79 |         z = sum([x[1] for x in xy]) 80 |         xy = [(k, v / z) for k, v in xy] 81 |     # show only top x 82 |     if topx is not None: 83 |         xy = sorted(xy, key=lambda x: float(x[0]))[-topx:] 84 |     if botx is not None: 85 |         xy = sorted(xy, key=lambda x: float(x[0]))[:botx] 86 |     if boty is not None: 87 |         xy = sorted(xy, key=lambda x: x[1])[:boty] 88 |     if topy is not None: 89 |         xy = sorted(xy, key=lambda x: x[1])[-topy:] 90 |     # print list of tuples 91 |     # determine variables to fit data to command line 92 |     x_data, y_data = zip(*xy) 93 |     width = max([ 94 |         len(str(x)) if x is not None else len('None') for x in x_data 95 |     ]) 96 |     number_width = max([len(f'{y}') for y in y_data]) 97 |     # max and min values 98 |     if ylim[1] is not None: 99 |         max_y_data = ylim[1] 100 |     else: 101 |         max_y_data = max(y_data) 102 |     if ylim[0] is not None: 103 |         min_y_data = ylim[0] 104 |     else: 105 |         min_y_data = min(y_data) 106 |     # determine scaling factor from screen size 107 |     data_range = max_y_data - min_y_data 108 |     if ncol is None: 109 |         ncol, _ = shutil.get_terminal_size((80, 20)) 110 |     max_size = ncol - width - number_width - 3 111 |     scale = max_size / data_range 112 | 113 |     # plot 114 |     print() 115 |     blank = ' ' 116 |     if yform: 117 |         min_y_data_str = yform(min_y_data) 118 |         print(f'{blank:<{width}}{min_y_data_str}') 119 |     else: 120 |         print(f'{blank:<{width}}{min_y_data}') 121 |     for (x, y) in xy: 122 | 123 |         # Filter example by x 124 |         if xfilter and not xfilter(x): 125 |             continue 126 | 127 |         if y > max_y_data: 128 |             # cropped bars 129 |             num_col = int((ylim[1] - min_y_data) * scale) 130 |             if num_col == 0: 131 |                 bar = '' 132 |             else: 133 |                 half_width = (num_col // 2) 134 |                 if num_col % 2: 135 |                     bar = '\u25A0' * (half_width - 1) 136 |                     bar += '//' 137 |                     bar += '\u25A0' * (half_width - 1) 138 |                 else: 
139 | bar = '\u25A0' * half_width 140 | bar += '//' 141 | bar += '\u25A0' * (half_width - 1) 142 | else: 143 | bar = '\u25A0' * int((y - min_y_data) * scale) 144 | if x is None: 145 | x = 'None' 146 | if yform: 147 | y = yform(y) 148 | print(f'{x:<{width}} {bar} {y}') 149 | else: 150 | print(f'{x:<{width}} {bar} {y}') 151 | print() 152 | -------------------------------------------------------------------------------- /src/transition_amr_parser/force_overlap_actions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from transition_amr_parser.io import read_blocks 3 | import re 4 | 5 | arc_regex = re.compile(r'>[RL]A\((.*),(.*)\)') 6 | 7 | 8 | def decrement_pointers_to_future(action_lists, li, ai, ignored): 9 | 10 | pos = sum([len(alist) for alist in action_lists[:li]]) + ai + 1 11 | for (i,action_list) in enumerate(action_lists): 12 | for (j, action) in enumerate(action_list): 13 | if i > li or ( i==li and j>ai): 14 | if arc_regex.match(action): 15 | (idx, lbl) = arc_regex.match(action).groups() 16 | if int(idx)+ignored >= pos: 17 | idx = str(int(idx) - 1) 18 | action_lists[i][j] = action[:3]+"("+idx+","+lbl+")" 19 | 20 | def sanity_check(actions): 21 | for action in actions: 22 | if arc_regex.match(action): 23 | (idx, lbl) = arc_regex.match(action).groups() 24 | if arc_regex.match(actions[int(idx)]) or actions[int(idx)] in ['SHIFT','ROOT','CLOSE_SENTENCE']: 25 | import ipdb; ipdb.set_trace() 26 | print("*****bad pointer to from " + action + " to " + actions[int(idx)]) 27 | 28 | 29 | def force_overlap(actions, force_actions, start_idx): 30 | 31 | actions_per_token = [] 32 | this_token_actions = [] 33 | for action in actions: 34 | this_token_actions.append(action) 35 | if action in ['SHIFT','CLOSE_SENTENCE']: 36 | actions_per_token.append(this_token_actions) 37 | this_token_actions = [] 38 | 39 | start_action_index = sum([len(acts) for acts in actions_per_token[:start_idx]]) if start_idx else 0 40 | 41 | out_actions = "" 42 | overlap_actions = [] 43 | ignored = 0 44 | for ti in range(start_idx,len(actions_per_token)): 45 | useful_actions = [] 46 | for (ai,action) in enumerate(actions_per_token[ti]): 47 | if arc_regex.match(action): 48 | (idx, lbl) = arc_regex.match(action).groups() 49 | idx = str(int(idx) - start_action_index) 50 | if int(idx) >= 0: 51 | useful_actions.append(action[:3]+"("+idx+","+lbl+")") 52 | else: 53 | decrement_pointers_to_future(actions_per_token,ti,ai,ignored) 54 | ignored += 1 55 | else: 56 | useful_actions.append(action) 57 | overlap_actions.append(useful_actions) 58 | 59 | flat_actions = [] 60 | for actions in overlap_actions: 61 | flat_actions.extend(actions) 62 | sanity_check(flat_actions) 63 | 64 | out_force_actions = overlap_actions 65 | 66 | if force_actions is not None: 67 | out_force_actions.extend(force_actions[len(overlap_actions):]) 68 | 69 | #there can be a sanity check here 70 | return out_force_actions 71 | 72 | 73 | 74 | def force_overlap_all(all_windows, all_actions, all_force_actions, in_widx): 75 | 76 | all_out_force_actions = [] 77 | 78 | fidx = 0 79 | pidx = 0 80 | for (i, _) in enumerate(all_windows): 81 | if len(all_windows[i]) > in_widx: 82 | this_window = all_windows[i][in_widx] 83 | prev_window = all_windows[i][in_widx-1] 84 | actions = all_actions[pidx] 85 | force_actions = all_force_actions[fidx] 86 | 87 | start_idx = this_window[0] - prev_window[0] 88 | 89 | out_force_actions = force_overlap(actions, force_actions, start_idx) 90 | 91 | 
all_out_force_actions.append(out_force_actions) 92 | 93 | fidx += 1 94 | 95 | if len(all_windows[i]) >= in_widx: 96 | pidx += 1 97 | 98 | return all_out_force_actions 99 | 100 | def make_forced_overlap(in_pred, in_force, in_windows, in_widx, out_force): 101 | 102 | fpactions = open(in_pred) 103 | ffactions = open(in_force) 104 | fwindows = open(in_windows) 105 | 106 | all_windows = [eval(line.strip()) for line in fwindows] 107 | all_actions = [ line.strip().split() for line in fpactions ] 108 | all_force_actions = [ eval(line.strip()) for line in ffactions ] 109 | 110 | window_of_interest = in_widx 111 | 112 | ffactions.close() 113 | ffout = open(out_force, 'w') 114 | 115 | if window_of_interest == 0: 116 | return 117 | 118 | all_out_force_actions = force_overlap_all(all_windows, all_actions, all_force_actions, in_widx) 119 | 120 | for force_actions in all_out_force_actions: 121 | ffout.write(str(force_actions) + "\n") 122 | 123 | def main(args): 124 | make_forced_overlap(args.in_pred, args.in_force, args.in_windows, args.in_widx, args.out_force) 125 | 126 | if __name__ == '__main__': 127 | parser = ArgumentParser() 128 | parser.add_argument( 129 | "--in-pred", 130 | help="input pred actions to be forced in next window", 131 | type=str 132 | ) 133 | parser.add_argument( 134 | "--in-force", 135 | help="input force actions to be updated", 136 | type=str 137 | ) 138 | parser.add_argument( 139 | "--in-windows", 140 | help="info about sliding window", 141 | type=str, 142 | ) 143 | parser.add_argument( 144 | "--out-force", 145 | help="output force actions", 146 | type=str, 147 | ) 148 | parser.add_argument( 149 | "--in-widx", 150 | help="index of the window to be updated", 151 | default=1, 152 | type=int, 153 | ) 154 | 155 | args = parser.parse_args() 156 | main(args) 157 | 158 | 159 | -------------------------------------------------------------------------------- /tests/align_mode.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | set -o nounset 5 | # This will ensure that early exit shows tests having failed 6 | function check_tests_passed { 7 |     if [ "$TESTS_PASSED" == "Y" ];then 8 |         printf "[\033[92m OK \033[0m] $0\n" 9 |     else 10 |         printf "[\033[91m FAILED \033[0m] $0\n" 11 |     fi 12 | } 13 | trap check_tests_passed EXIT 14 | TESTS_PASSED="N" 15 | 16 | # python tests/align_mode.py DATA/wiki25/aligned/cofill_isi/train.txt 17 | python tests/align_mode.py DATA/AMR2.0/aligned/cofill/train.txt 18 | 19 | # if we reach here, we are good 20 | TESTS_PASSED="Y" 21 | -------------------------------------------------------------------------------- /tests/all.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | # This will ensure that early exit shows tests having failed 3 | function check_tests_passed { 4 |     if [ "$TESTS_PASSED" == "Y" ];then 5 |         printf "[\033[92m OK \033[0m] $0\n" 6 |     else 7 |         printf "[\033[91m FAILED \033[0m] $0\n" 8 |     fi 9 | } 10 | trap check_tests_passed EXIT 11 | TESTS_PASSED="N" 12 | 13 | # all conventional tests 14 | bash tests/correctly_installed.sh 15 | # small test with 25 sentences 16 | bash tests/minimal_test.sh 17 | # standalone parser 18 | bash tests/standalone.sh 19 | # oracle for wiki25 imperfect due to alignments 20 | bash tests/oracles/amr_o10.sh DATA/wiki25/aligned/cofill_isi/train.txt 21 | # if we reach here, we are good 22 | TESTS_PASSED="Y" 23 | -------------------------------------------------------------------------------- /tests/amr_io.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | # Seems not to be reading 7 | 8 | # passes IO test 9 | python tests/amr_io.py --no-isi \ 10 |     --in-amr DATA/AMR2.0/aligned/cofill_isi/dev.txt \ 11 |     --ignore-errors 'amr2-dev' \ 12 |     # --out-amr tmp.amr 13 | 14 | # passes IO test 15 | python tests/amr_io.py --no-isi \ 16 |     --in-amr DATA/AMR2.0/aligned/cofill_isi/train.txt \ 17 |     --ignore-errors 'amr2-train' \ 18 |     # --out-amr tmp.amr 19 | 20 | # passes IO test 21 | python tests/amr_io.py --no-isi \ 22 |     --in-amr DATA/AMR3.0/aligned/cofill_isi/dev.txt \ 23 |     --ignore-errors 'amr3-dev' \ 24 |     # --out-amr tmp.amr 25 | 26 | # passes IO test 27 | python tests/amr_io.py --no-isi \ 28 |     --in-amr DATA/AMR3.0/aligned/cofill_isi/train.txt \ 29 |     --ignore-errors 'amr3-train' \ 30 |     # --out-amr tmp.amr 31 | 32 | printf "[\033[92m OK \033[0m] $0\n" 33 | -------------------------------------------------------------------------------- /tests/correctly_installed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import subprocess 3 | # from ipdb import set_trace 4 | 5 | 6 | def main(): 7 | 8 |     # Pytorch and CUDA 9 |     passed = True 10 |     print() 11 |     import torch 12 |     print(f'pytorch {torch.__version__}') 13 |     if torch.cuda.is_available(): 14 |         print(f'cuda {torch.version.cuda}') 15 |         # happens when CUDA misconfigured 16 |         assert torch.cuda.device_count(), "0 GPUs found" 17 |         try: 18 |             import apex 19 |             print("Apex installed") 20 |         except ImportError: 21 |             print("Apex not installed") 22 |         if torch.cuda.get_device_capability(0)[0] < 7: 23 |             print("GPU won't support --fp16") 24 | 25 |         # sanity check: try to use CUDA 26 |         import torch 27 |         torch.zeros((100, 100)).cuda() 28 | 29 |     else: 30 |         print("\033[93mNo CUDA available\033[0m") 31 | 32 |     try: 33 |         import smatch 34 |         print("smatch installed") 35 |     except ImportError as e: 36 
| print("\033[93msmatch not installed\033[0m") 37 | sucess = False 38 | 39 | try: 40 | import torch_scatter 41 | print("pytorch-scatter installed") 42 | except ImportError: 43 | print("\033[93mpytorch-scatter not installed\033[0m") 44 | passed = False 45 | 46 | # if torch.cuda.is_available(): 47 | # try: 48 | # import torch_scatter.scatter_cuda 49 | # print("torch_scatter.scatter_cuda works") 50 | # except ImportError: 51 | # print( 52 | # "\033[93mmaybe LD_LIBRARY_PATH unconfigured?, " 53 | # "import torch_scatter.scatter_cuda dies\033[0m" 54 | # ) 55 | # passed = False 56 | 57 | # fairseq 58 | try: 59 | import fairseq 60 | print("fairseq works") 61 | except ImportError: 62 | print("\033[93mfairseq installation failed\033[0m") 63 | passed = False 64 | 65 | # If we get here we passed 66 | if passed: 67 | print(f'[\033[92mOK\033[0m] correctly installed\n') 68 | else: 69 | print(f'[\033[91mFAILED\033[0m] some modules missing\n') 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /tests/correctly_installed.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | trap 'case $? in 7 | 139) echo -e "\033[91mCode segfaulted!\033[0m (probably .cuda())\n";; 8 | esac' EXIT 9 | 10 | python tests/correctly_installed.py 11 | -------------------------------------------------------------------------------- /tests/create_wiki25_mockup.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | set -o nounset 4 | 5 | # This simulates conventional corpora using the 25 wiki sentences 6 | # Create original data 7 | mkdir -p DATA/wiki25/corpora/ 8 | cp DATA/wiki25.jkaln DATA/wiki25/corpora/train.txt 9 | # remove JAMR meta-data, which we do not have in reality 10 | sed -i.bak '/^# ::tok.*/d' DATA/wiki25/corpora/train.txt 11 | sed -i.bak '/^# ::node.*/d' DATA/wiki25/corpora/train.txt 12 | sed -i.bak '/^# ::edge.*/d' DATA/wiki25/corpora/train.txt 13 | sed -i.bak '/^# ::root.*/d' DATA/wiki25/corpora/train.txt 14 | sed -i.bak '/^# ::alignments.*/d' DATA/wiki25/corpora/train.txt 15 | [ ! -f DATA/wiki25/corpora/dev.txt ] \ 16 | && ln -s ./train.txt DATA/wiki25/corpora/dev.txt 17 | [ ! -f DATA/wiki25/corpora/test.txt ] \ 18 | && ln -s ./train.txt DATA/wiki25/corpora/test.txt 19 | 20 | touch DATA/wiki25/corpora/.done 21 | 22 | # Simulate aligned data from wiki25 23 | mkdir -p DATA/wiki25/aligned/cofill_isi/ 24 | [ ! -f DATA/wiki25/aligned/cofill_isi/train.txt ] \ 25 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/train.txt 26 | echo "DATA/wiki25/aligned/cofill_isi/train.txt" 27 | [ ! -f DATA/wiki25/aligned/cofill_isi/dev.txt ] \ 28 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/dev.txt 29 | echo "DATA/wiki25/aligned/cofill_isi/dev.txt" 30 | [ ! 
-f DATA/wiki25/aligned/cofill_isi/test.txt ] \ 31 | && ln -s ../../../wiki25.jkaln DATA/wiki25/aligned/cofill_isi/test.txt 32 | touch DATA/wiki25/aligned/cofill_isi/.done 33 | echo "DATA/wiki25/aligned/cofill_isi/test.txt" 34 | -------------------------------------------------------------------------------- /tests/download_little_prince.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | set -o nounset 4 | 5 | # [ -d DATA/LP/ ] && rm -R DATA/LP 6 | mkdir -p DATA/LP/corpora/ 7 | 8 | # Download data 9 | 10 | if [ ! -f DATA/LP/corpora/dev.txt ];then 11 | wget --no-check-certificate -O DATA/LP/corpora/dev.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-dev.txt 12 | sed '1,2d' DATA/LP/corpora/dev.txt.tmp > DATA/LP/corpora/dev.txt 13 | rm DATA/LP/corpora/dev.txt.tmp 14 | fi 15 | 16 | if [ ! -f DATA/LP/corpora/train.txt ];then 17 | wget --no-check-certificate -O DATA/LP/corpora/train.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-training.txt 18 | sed '1,2d' DATA/LP/corpora/train.txt.tmp > DATA/LP/corpora/train.txt 19 | rm DATA/LP/corpora/train.txt.tmp 20 | fi 21 | 22 | if [ ! -f DATA/LP/corpora/test.txt ];then 23 | wget --no-check-certificate -O DATA/LP/corpora/test.txt.tmp https://amr.isi.edu/download/amr-bank-struct-v1.6-test.txt 24 | sed '1,2d' DATA/LP/corpora/test.txt.tmp > DATA/LP/corpora/test.txt 25 | rm DATA/LP/corpora/test.txt.tmp 26 | fi 27 | -------------------------------------------------------------------------------- /tests/fairseq_data_iterator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test data iterator with e.g. 3 | 4 | . set_environment.sh 5 | 6 | arguments=" 7 | DATA/amr/features/o3+Word100_RoBERTa-base/ 8 | --gen-subset train 9 | --batch-size 128 10 | " 11 | 12 | # do not use @profile 13 | #python tests/fairseq_data_iterator.py $arguments 14 | 15 | # Use @profile 16 | kernprof -l tests/fairseq_data_iterator.py $arguments 17 | python -m line_profiler fairseq_data_iterator.py.lprof 18 | """ 19 | 20 | from fairseq import tasks, utils 21 | from fairseq_ext import options 22 | from fairseq_ext.utils_import import import_user_module 23 | from fairseq.data import data_utils, FairseqDataset 24 | from tqdm import tqdm 25 | 26 | 27 | def get_batch_iterator( 28 | dataset, max_tokens=None, max_sentences=None, max_positions=None, 29 | ignore_invalid_inputs=False, required_batch_size_multiple=1, 30 | seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0, 31 | large_sent_first=False 32 | ): 33 | """ 34 | Get an iterator that yields batches of data from the given dataset. 35 | 36 | Args: 37 | dataset (~fairseq.data.FairseqDataset): dataset to batch 38 | max_tokens (int, optional): max number of tokens in each batch 39 | (default: None). 40 | max_sentences (int, optional): max number of sentences in each 41 | batch (default: None). 42 | max_positions (optional): max sentence length supported by the 43 | model (default: None). 44 | ignore_invalid_inputs (bool, optional): don't raise Exception for 45 | sentences that are too long (default: False). 46 | required_batch_size_multiple (int, optional): require batch size to 47 | be a multiple of N (default: 1). 48 | seed (int, optional): seed for random number generator for 49 | reproducibility (default: 1). 50 | num_shards (int, optional): shard the data iterator into N 51 | shards (default: 1). 52 | shard_id (int, optional): which shard of the data iterator to 53 | return (default: 0). 
54 | num_workers (int, optional): how many subprocesses to use for data 55 | loading. 0 means the data will be loaded in the main process 56 | (default: 0). 57 | epoch (int, optional): the epoch to start the iterator from 58 | (default: 0). 59 | 60 | Returns: 61 | ~fairseq.iterators.EpochBatchIterator: a batched iterator over the 62 | given dataset split 63 | """ 64 | assert isinstance(dataset, FairseqDataset) 65 | 66 | # get indices ordered by example size 67 | with data_utils.numpy_seed(seed): 68 | indices = dataset.ordered_indices() 69 | # invert order to start by bigger ones 70 | if large_sent_first: 71 | indices = indices[::-1] 72 | 73 | # filter examples that are too large 74 | if max_positions is not None: 75 | indices = data_utils.filter_by_size( 76 | indices, dataset.size, max_positions, 77 | raise_exception=(not ignore_invalid_inputs), 78 | ) 79 | 80 | # create mini-batches with given size constraints 81 | batch_sampler = data_utils.batch_by_size( 82 | indices, dataset.num_tokens, max_tokens=max_tokens, 83 | max_sentences=max_sentences, 84 | required_batch_size_multiple=required_batch_size_multiple, 85 | ) 86 | 87 | return batch_sampler 88 | 89 | 90 | def main(args): 91 | 92 | # Load dataset 93 | import_user_module(args) 94 | task = tasks.setup_task(args) 95 | task.load_dataset(args.gen_subset) 96 | dataset = task.dataset(args.gen_subset) 97 | 98 | # Get iterator over batches 99 | batch_index_iterator = get_batch_iterator( 100 | dataset=dataset, 101 | max_tokens=args.max_tokens, 102 | max_sentences=args.max_sentences, 103 | max_positions=None, 104 | ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, 105 | required_batch_size_multiple=args.required_batch_size_multiple, 106 | num_shards=args.num_shards, 107 | shard_id=args.shard_id, 108 | num_workers=args.num_workers, 109 | large_sent_first=False 110 | ) 111 | 112 | # collate batch of sentences into single tensor for all data 113 | for batch_ids in tqdm(batch_index_iterator): 114 | samples = [dataset[i] for i in batch_ids] 115 | dataset.collater(samples) 116 | 117 | 118 | def cli_main(): 119 | parser = options.get_generation_parser() 120 | args = options.parse_args_and_arch(parser) 121 | main(args) 122 | 123 | 124 | if __name__ == '__main__': 125 | cli_main() 126 | -------------------------------------------------------------------------------- /tests/fairseq_data_iterator.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | [ -z "$1" ] || [ -z "$2" ] && \ 5 | echo -e "\nbash $0 \n" && \ 6 | exit 1 7 | features_folder=$1 8 | embeddings_folder=$2 9 | set -o nounset 10 | 11 | # pyinstrument tests/fairseq_data_iterator.py \ 12 | python tests/fairseq_data_iterator.py \ 13 | $features_folder \ 14 | --emb-dir $embeddings_folder \ 15 | --user-dir fairseq_ext \ 16 | --task amr_action_pointer_bart \ 17 | --gen-subset train \ 18 | --max-tokens 3584 \ 19 | --path dummpy.pt 20 | -------------------------------------------------------------------------------- /tests/minimal_test.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 | ELMO_WEIGHTS="DATA/wiki25/aligned/ibm_neural_aligner/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 9 | 10 | if [ -f "$ELMO_WEIGHTS" ]; then 11 | echo "$ELMO_WEIGHTS exists." 
12 | 13 |         # Backup weights because expensive to download. 14 |         mv $ELMO_WEIGHTS ./tmp.elmo 15 | 16 |         # Delete previous runs if they exist. 17 |         rm -Rf DATA/wiki25/* 18 | 19 |         # Restore elmo weights. 20 |         mkdir -p DATA/wiki25/aligned/ibm_neural_aligner 21 |         mv tmp.elmo $ELMO_WEIGHTS 22 |     else 23 |         echo "$ELMO_WEIGHTS does not exist." 24 | 25 |         # Delete previous runs if they exist 26 |         rm -Rf DATA/wiki25/* 27 |     fi 28 | 29 |     # replace code above with less restrictive deletion 30 |     # rm -R -f DATA/wiki25/embeddings 31 |     # rm -R -f DATA/wiki25/features 32 |     # rm -R -f DATA/wiki25/oracles 33 |     # rm -R -f DATA/wiki25/models 34 | 35 |     # simulate completed corpora extraction and alignment 36 |     bash tests/create_wiki25_mockup.sh 37 | 38 | else 39 | 40 |     # custom config mini-test 41 |     config=$1 42 | fi 43 | set -o nounset 44 | 45 | bash run/run_experiment.sh $config 46 | 47 | # check if final result is there 48 | . $config 49 | 50 | if [ -f "${MODEL_FOLDER}seed42/beam10/valid_${DECODING_CHECKPOINT}.wiki.smatch" ];then 51 |     printf "\n[\033[92mOK\033[0m] $0\n" 52 | else 53 |     printf "\n[\033[91mFAILED\033[0m] $0\n" 54 | fi 55 | -------------------------------------------------------------------------------- /tests/minimal_test_lsf.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 |     # Standard mini-test with wiki25 6 |     config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 |     ELMO_WEIGHTS="DATA/wiki25/aligned/ibm_neural_aligner/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 9 | 10 |     if [ -f "$ELMO_WEIGHTS" ]; then 11 |         echo "$ELMO_WEIGHTS exists." 12 | 13 |         # Backup weights because expensive to download. 14 |         mv $ELMO_WEIGHTS ./tmp.elmo 15 | 16 |         # Delete previous runs if they exist. 17 |         rm -Rf DATA/wiki25/* 18 | 19 |         # Restore elmo weights. 20 |         mkdir -p DATA/wiki25/aligned/ibm_neural_aligner 21 |         mv tmp.elmo $ELMO_WEIGHTS 22 |     else 23 |         echo "$ELMO_WEIGHTS does not exist." 24 | 25 |         # Delete previous runs if they exist 26 |         rm -Rf DATA/wiki25/* 27 |     fi 28 | 29 |     # replace code above with less restrictive deletion 30 |     # rm -R -f DATA/wiki25/embeddings 31 |     # rm -R -f DATA/wiki25/features 32 |     # rm -R -f DATA/wiki25/oracles 33 |     # rm -R -f DATA/wiki25/models 34 | 35 |     # simulate completed corpora extraction and alignment 36 |     bash tests/create_wiki25_mockup.sh 37 | 38 | else 39 | 40 |     # custom config mini-test 41 |     config=$1 42 | fi 43 | set -o nounset 44 | 45 | # Run local test 46 | bash run/lsf/run_experiment.sh $config 47 | 48 | # check if final result is there 49 | . $config 50 | 51 | if [ -f "${MODEL_FOLDER}seed42/beam10/valid_${DECODING_CHECKPOINT}.wiki.smatch" ];then 52 |     printf "\n[\033[92mOK\033[0m] $0\n" 53 | else 54 |     printf "\n[\033[91mFAILED\033[0m] $0\n" 55 | fi 56 | -------------------------------------------------------------------------------- /tests/neural_aligner.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 |     # Standard mini-test with wiki25, sampling 6 |     config=configs/wiki25-structured-bart-base-neur-al.sh 7 | 8 | else 9 | 10 |     # custom config mini-test 11 |     config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # Clean-up 20 | [ -d "$ALIGNED_FOLDER" ] && rm -R "$ALIGNED_FOLDER" 21 | mkdir -p "$ALIGNED_FOLDER" 22 | 23 | # Train aligner 24 | bash run/train_aligner.sh $config 25 | 26 | # Align data. 
27 | mkdir -p $ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2 28 | python -u src/ibm_neural_aligner/main.py \ 29 |     --no-jamr \ 30 |     --cuda --allow-cpu \ 31 |     --vocab-text $ALIGN_VOCAB_TEXT \ 32 |     --vocab-amr $ALIGN_VOCAB_AMR \ 33 |     --write-single \ 34 |     --single-input ${AMR_TRAIN_FILE_WIKI}.no_wiki \ 35 |     --single-output $ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2/alignment.trn.out.pred \ 36 |     --cache-dir $ALIGNED_FOLDER \ 37 |     --verbose \ 38 |     --load $ALIGN_MODEL \ 39 |     --load-flags $ALIGN_MODEL_FLAGS \ 40 |     --batch-size 8 \ 41 |     --max-length 0 42 | 43 | # results should be written to the file checked below 44 | if [ -f "$ALIGNED_FOLDER/version_20210709c_exp_0_seed_0_write_amr2/alignment.trn.out.pred" ];then 45 |     printf "\n[\033[92mOK\033[0m] $0\n\n" 46 | else 47 |     printf "\n[\033[91mFAILED\033[0m] $0\n\n" 48 | fi 49 | -------------------------------------------------------------------------------- /tests/oracles/amr_o10.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | [ -z $1 ] && echo "$0 <gold_amr>" && exit 1 5 | gold_amr=$1 6 | set -o nounset 7 | 8 | oracle_folder=DATA/unit_test_$(basename $(dirname $gold_amr))/ 9 | mkdir -p $oracle_folder 10 | 11 | # get actions from oracle 12 | python src/transition_amr_parser/amr_machine.py \ 13 |     --in-aligned-amr $gold_amr \ 14 |     --out-machine-config $oracle_folder/machine_config.json \ 15 |     --out-actions $oracle_folder/train.actions \ 16 |     --out-tokens $oracle_folder/train.tokens \ 17 |     --use-copy 1 \ 18 |     --absolute-stack-positions \ 19 |     # --if-oracle-error stop 20 |     # --reduce-nodes all 21 | 22 | # play actions on state machine 23 | python src/transition_amr_parser/amr_machine.py \ 24 |     --in-machine-config $oracle_folder/machine_config.json \ 25 |     --in-tokens $oracle_folder/train.tokens \ 26 |     --in-actions $oracle_folder/train.actions \ 27 |     --out-amr $oracle_folder/train_oracle.amr 28 | 29 | # score 30 | echo "Computing Smatch (may take long for 1K or more sentences)" 31 | python scripts/smatch_aligner.py \ 32 |     --in-amr $oracle_folder/train_oracle.amr \ 33 |     --in-reference-amr $gold_amr \ 34 |     # --stop-if-different 35 | 36 | printf "[\033[92m OK \033[0m] $0\n" 37 | -------------------------------------------------------------------------------- /tests/oracles/amr_o10_doc.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . 
set_environment.sh 4 | [ -z $1 ] && echo "$0 <gold_amr>" && exit 1 5 | gold_amr=$1 6 | set -o nounset 7 | 8 | oracle_folder=DATA/AMR3.0/oracles/o10_pinitos_doc_v0.4/ 9 | mkdir -p $oracle_folder 10 | NORM=no-merge 11 | #make_doc_amr 12 | python scripts/doc-amr/get_doc_amr_from_sen.py \ 13 |     --in-amr $gold_amr \ 14 |     --coref-fof DATA/AMR3.0/coref/train_coref.fof \ 15 |     --fof-path DATA/AMR3.0/amr_annotation_3.0/ \ 16 |     --norm $NORM \ 17 |     --out-amr $oracle_folder/train_${NORM}.docamr 18 | 19 | # get actions from oracle 20 | python src/transition_amr_parser/amr_machine.py \ 21 |     --in-aligned-amr $oracle_folder/train_${NORM}.docamr \ 22 |     --out-machine-config $oracle_folder/machine_config.json \ 23 |     --out-actions $oracle_folder/train.actions \ 24 |     --out-tokens $oracle_folder/train.tokens \ 25 |     --use-copy 1 \ 26 |     --absolute-stack-positions 27 |     # --reduce-nodes all 28 | 29 | # play actions on state machine 30 | python src/transition_amr_parser/amr_machine.py \ 31 |     --in-machine-config $oracle_folder/machine_config.json \ 32 |     --in-tokens $oracle_folder/train.tokens \ 33 |     --in-actions $oracle_folder/train.actions \ 34 |     --out-amr $oracle_folder/train_oracle_no-merge.amr 35 | 36 | sed 's@\~[0-9]\{1,\}@@g' $oracle_folder/train_oracle_no-merge.amr > $oracle_folder/train_oracle_no-merge.amr.no_isi 37 | sed 's@\~[0-9]\{1,\}@@g' $oracle_folder/train_no-merge.docamr > $oracle_folder/train_no-merge.docamr.no_isi 38 | # score 39 | echo "Computing Smatch (may take long for 1K or more sentences)" 40 | doc-smatch -r 1 --significant 4 --coref-subscore \ 41 |     -f $oracle_folder/train_${NORM}.docamr.no_isi \ 42 |     $oracle_folder/train_oracle_no-merge.amr.no_isi \ 43 | 44 | printf "[\033[92m OK \033[0m] $0\n" 45 | -------------------------------------------------------------------------------- /tests/smatch.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | . set_environment.sh 4 | set -o nounset 5 | 6 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 7 | # :mod 277703234 in dev[97] 8 | python scripts/smatch_aligner.py \ 9 |     --in-amr DATA/AMR2.0/corpora/dev.txt \ 10 |     --in-reference-amr DATA/AMR2.0/corpora/dev.txt \ 11 |     # --stop-if-different 12 | 13 | exit 14 | 15 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 16 | # bolt12_10511_2844.2 ignores :mod "A" 17 | # bolt12_12120_6501.3 ignores b2 :mod 106 18 | # bolt12_12120_6501.4 ignores b :mod 920 19 | # bolt12_12120_6501.5 ignores b :mod 17, b :mod 14 20 | # ... (stopped counting) 21 | python scripts/smatch_aligner.py \ 22 |     --in-amr DATA/AMR2.0/corpora/train.txt \ 23 |     --in-reference-amr DATA/AMR2.0/corpora/train.txt \ 24 |     --stop-if-different 25 | 26 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 27 | # :mod 277703234 in dev[97] 28 | python scripts/smatch_aligner.py \ 29 |     --in-amr DATA/AMR2.0/corpora/dev.txt \ 30 |     --in-reference-amr DATA/AMR2.0/corpora/dev.txt \ 31 |     --stop-if-different 32 | 33 | # DOES NOT PASS Smatch test (due to Smatch read BUGs) 34 | # bolt12_10511_2844.2 ignores :mod "A" 35 | # bolt12_12120_6501.3 ignores b2 :mod 106 36 | # bolt12_12120_6501.4 ignores b :mod 920 37 | # bolt12_12120_6501.5 ignores b :mod 17, b :mod 14 38 | # ... 
(stopped counting) 39 | python scripts/smatch_aligner.py \ 40 | --in-amr DATA/AMR2.0/corpora/train.txt \ 41 | --in-reference-amr DATA/AMR2.0/corpora/train.txt \ 42 | --stop-if-different 43 | -------------------------------------------------------------------------------- /tests/standalone-doc.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/both_doc+sen_trainsliding_ws400x100.sh 7 | 8 | else 9 | 10 | # custom config mini-test 11 | config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # use first seed 20 | seed=$(echo $SEEDS | sed 's@ .*@@g') 21 | # rest, from config 22 | sset=test 23 | 24 | reference_amr=$ORACLE_FOLDER/${sset}_docAMR.docamr 25 | # wiki=$LINKER_CACHE_PATH/${sset}.wiki 26 | checkpoint=${MODEL_FOLDER}seed${seed}/$DECODING_CHECKPOINT 27 | 28 | force_actions=$ORACLE_FOLDER/${sset}.force_actions 29 | # where to put results 30 | FOLDER=${MODEL_FOLDER}seed${seed}/beam${BEAM_SIZE}/ 31 | results_prefix=$FOLDER/${sset}_$DECODING_CHECKPOINT 32 | 33 | # needs data and model 34 | [ ! -f "$checkpoint" ] \ 35 | && echo "Missing $checkpoint" \ 36 | && exit 1 37 | 38 | # prepare unit test folder 39 | # [ -d "$FOLDER" ] && rm -R $FOLDER/ 40 | mkdir -p $FOLDER 41 | 42 | [ ! -f "$reference_amr" ] \ 43 | && echo "Missing $reference_amr" \ 44 | && exit 1 45 | 46 | [ ! -f "$force_actions" ] \ 47 | && echo "Missing $force_actions" \ 48 | && force_actions="" 49 | 50 | # # extract sentences from test 51 | # grep '# ::tok ' $ALIGNED_FOLDER/${sset}.txt \ 52 | # | sed 's@# ::tok @@g' > ${results_prefix}.tokens 53 | # 54 | # # run first seed of model 55 | # echo "amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr" 56 | # amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr 57 | 58 | # extract sentences from test 59 | # grep '# ::tok ' $reference_amr \ 60 | # | sed 's@# ::tok @@g' > ${results_prefix}.sentences 61 | cp $ORACLE_FOLDER/${sset}.en ${results_prefix}.sentences 62 | sed -e 's/[[:space:]]\+/ /g' ${results_prefix}.sentences > ${results_prefix}.sentences_notab 63 | # run first seed of model 64 | cmd="amr-parse --fp16 --beam ${BEAM_SIZE} --batch-size ${BATCH_SIZE} -c $checkpoint -i ${results_prefix}.sentences_notab -o ${results_prefix}.amr --sliding --window-size 400 --window-overlap 100 --in-actions $force_actions" 65 | echo "$cmd" 66 | eval "$cmd" 67 | 68 | # GRAPH POST-PROCESSING 69 | 70 | # if [ "$LINKER_CACHE_PATH" == "" ];then 71 | 72 | # # just copy AMR to wiki AMR 73 | # cp ${results_prefix}.amr ${results_prefix}.wiki.amr 74 | 75 | # # TODO: Unelegant detection of linker method (temporary) 76 | # elif [ -f "${LINKER_CACHE_PATH}/trn.wikis" ];then 77 | 78 | # # Legacy linker 79 | # python scripts/add_wiki.py \ 80 | # ${results_prefix}.amr $wiki $LINKER_CACHE_PATH \ 81 | # > ${results_prefix}.wiki.amr 82 | 83 | # else 84 | 85 | # # BLINK cache 86 | # python scripts/retyper.py \ 87 | # --inputfile ${results_prefix}.amr \ 88 | # --outputfile ${results_prefix}.wiki.amr \ 89 | # --skipretyper \ 90 | # --wikify \ 91 | # --blinkcachepath $LINKER_CACHE_PATH \ 92 | # --blinkthreshold 0.0 93 | 94 | # fi 95 | 96 | ## Change rep of docamr to docAMR for smatch 97 | 98 | echo -e "\n Changing rep of dev/test data to docAMR " 99 | doc-amr \ 100 | --in-doc-amr-pairwise 
${results_prefix}.amr \ 101 | --rep docAMR \ 102 | --pairwise-coref-rel same-as \ 103 | --out-amr ${results_prefix}_docAMR.amr 104 | results_prefix=${results_prefix}_docAMR 105 | 106 | ##### SMATCH evaluation 107 | if [[ "$EVAL_METRIC" == "smatch" ]]; then 108 | 109 | # Smatch evaluation without wiki 110 | 111 | # until smatch is fixed, we need to remove the ISI alignment annotations 112 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.amr > ${results_prefix}.amr.no_isi 113 | 114 | echo "Computing SMATCH between ---" 115 | echo "$reference_amr" 116 | echo "${results_prefix}.amr" 117 | doc-smatch -r 1 --significant 4 \ 118 | -f $reference_amr \ 119 | ${results_prefix}.amr.no_isi \ 120 | | tee ${results_prefix}.smatch 121 | 122 | elif [[ "$EVAL_METRIC" == "wiki.smatch" ]]; then 123 | 124 | # Smatch evaluation without wiki 125 | 126 | # until smatch is fixed, we need to remove the ISI alignment annotations 127 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.wiki.amr > ${results_prefix}.wiki.amr.no_isi 128 | 129 | # compute score 130 | echo "Computing SMATCH between ---" 131 | echo "$reference_amr" 132 | echo "${results_prefix}.wiki.amr" 133 | doc-smatch -r 1 --significant 4 \ 134 | -f $reference_amr \ 135 | ${results_prefix}.wiki.amr.no_isi \ 136 | | tee ${results_prefix}.wiki.smatch 137 | 138 | fi 139 | -------------------------------------------------------------------------------- /tests/standalone.sh: -------------------------------------------------------------------------------- 1 | set -o errexit 2 | set -o pipefail 3 | if [ -z $1 ];then 4 | 5 | # Standard mini-test with wiki25 6 | config=configs/wiki25-structured-bart-base-neur-al-sampling5.sh 7 | 8 | else 9 | 10 | # custom config mini-test 11 | config=$1 12 | fi 13 | . set_environment.sh 14 | set -o nounset 15 | 16 | # load config 17 | . $config 18 | 19 | # use first seed 20 | seed=$(echo $SEEDS | sed 's@ .*@@g') 21 | # rest, from config 22 | sset=test 23 | 24 | reference_amr_wiki=$AMR_TEST_FILE_WIKI 25 | wiki=$LINKER_CACHE_PATH/${sset}.wiki 26 | checkpoint=${MODEL_FOLDER}seed${seed}/$DECODING_CHECKPOINT 27 | 28 | # where to put results 29 | FOLDER=${MODEL_FOLDER}seed${seed}/beam${BEAM_SIZE}/ 30 | results_prefix=$FOLDER/${sset}_$DECODING_CHECKPOINT 31 | 32 | # needs data and model 33 | [ ! -f "$checkpoint" ] \ 34 | && echo "Missing $checkpoint" \ 35 | && exit 1 36 | 37 | # prepare unit test folder 38 | [ -d "$FOLDER" ] && rm -R $FOLDER/ 39 | mkdir -p $FOLDER 40 | 41 | [ ! 
-f "$reference_amr_wiki" ] \ 42 | && echo "Missing $reference_amr_wiki" \ 43 | && exit 1 44 | 45 | # # extract sentences from test 46 | # grep '# ::tok ' $ALIGNED_FOLDER/${sset}.txt \ 47 | # | sed 's@# ::tok @@g' > ${results_prefix}.tokens 48 | # 49 | # # run first seed of model 50 | # echo "amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr" 51 | # amr-parse --beam ${BEAM_SIZE} --batch-size 128 -c $checkpoint -i ${results_prefix}.tokens -o ${results_prefix}.amr 52 | 53 | # extract sentences from test 54 | grep '# ::snt ' $reference_amr_wiki \ 55 | | sed 's@# ::snt @@g' > ${results_prefix}.sentences 56 | 57 | # run first seed of model; --fp16 58 | cmd="amr-parse --beam ${BEAM_SIZE} --batch-size ${BATCH_SIZE} --tokenize -c $checkpoint -i ${results_prefix}.sentences -o ${results_prefix}.amr --out-tokens ${results_prefix}.tokens --out-actions ${results_prefix}.actions" 59 | echo "$cmd" 60 | eval "$cmd" 61 | 62 | # GRAPH POST-PROCESSING 63 | 64 | if [ "$LINKER_CACHE_PATH" == "" ];then 65 | 66 | # just copy AMR to wiki AMR 67 | cp ${results_prefix}.amr ${results_prefix}.wiki.amr 68 | 69 | # TODO: Unelegant detection of linker method (temporary) 70 | elif [ -f "${LINKER_CACHE_PATH}/trn.wikis" ];then 71 | 72 | # Legacy linker 73 | python scripts/add_wiki.py \ 74 | ${results_prefix}.amr $wiki $LINKER_CACHE_PATH \ 75 | > ${results_prefix}.wiki.amr 76 | 77 | else 78 | 79 | # BLINK cache 80 | python scripts/retyper.py \ 81 | --inputfile ${results_prefix}.amr \ 82 | --outputfile ${results_prefix}.wiki.amr \ 83 | --skipretyper \ 84 | --wikify \ 85 | --blinkcachepath $LINKER_CACHE_PATH \ 86 | --blinkthreshold 0.0 87 | 88 | fi 89 | 90 | 91 | ##### SMATCH evaluation 92 | if [[ "$EVAL_METRIC" == "smatch" ]]; then 93 | 94 | # Smatch evaluation without wiki 95 | 96 | # until smatch is fixed, we need to remove the ISI alignment annotations 97 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.amr > ${results_prefix}.amr.no_isi 98 | 99 | echo "Computing SMATCH between ---" 100 | echo "$reference_amr_wiki" 101 | echo "${results_prefix}.amr" 102 | smatch.py -r 10 --significant 4 \ 103 | -f $reference_amr_wiki \ 104 | ${results_prefix}.amr.no_isi \ 105 | | tee ${results_prefix}.smatch 106 | 107 | elif [[ "$EVAL_METRIC" == "wiki.smatch" ]]; then 108 | 109 | # Smatch evaluation without wiki 110 | 111 | # until smatch is fixed, we need to remove the ISI alignment annotations 112 | sed 's@\~[0-9]\{1,\}@@g' ${results_prefix}.wiki.amr > ${results_prefix}.wiki.amr.no_isi 113 | 114 | # compute score 115 | echo "Computing SMATCH between ---" 116 | echo "$reference_amr_wiki" 117 | echo "${results_prefix}.wiki.amr" 118 | smatch.py -r 10 --significant 4 \ 119 | -f $reference_amr_wiki \ 120 | ${results_prefix}.wiki.amr.no_isi \ 121 | | tee ${results_prefix}.wiki.smatch 122 | 123 | fi 124 | -------------------------------------------------------------------------------- /tests/tokenizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from transition_amr_parser.io import protected_tokenizer 4 | from random import shuffle 5 | 6 | 7 | def read_raw_amr(amr_file): 8 | 9 | # AMR file with ::snt and ::tok fields (JAMR) 10 | tokens = [] 11 | sents = [] 12 | with open(amr_file) as fid: 13 | for line in fid: 14 | if line.strip().startswith('# ::snt'): 15 | sents.append(line.split('# ::snt')[-1].strip()) 16 | elif line.strip().startswith('# ::tok'): 17 | tokens.append(line.split('# ::tok')[-1].strip()) 18 | 
assert len(tokens) == len(sents) 19 | return sents, tokens 20 | 21 | 22 | def main(amr_file, do_break=False): 23 | 24 | # # indices to ignore 25 | # ignore_indices =[ 26 | # 384, 385, 973, 1541, 27 | # 669, # 'a 28 | # 865, # 120. 29 | # 1335, # gov't 30 | # 1411, # !!!) 31 | # 1520, # PA. 32 | # ] 33 | ignore_indices = [] 34 | 35 | # read data 36 | sents, tokens = read_raw_amr(amr_file) 37 | 38 | # random order 39 | indices = list(range(len(tokens))) 40 | shuffle(indices) 41 | 42 | # simple tokenizer 43 | count = 0 44 | for index in indices: 45 | new_tokens = ' '.join(protected_tokenizer(sents[index], simple=True)[0]) 46 | if tokens[index] == new_tokens: 47 | count += 1 48 | elif do_break and index not in ignore_indices: 49 | print(index) 50 | print(sents[index]) 51 | print(tokens[index]) 52 | print(new_tokens) 53 | import ipdb; ipdb.set_trace(context=30) 54 | protected_tokenizer(sents[index]) 55 | 56 | perc = count * 100. / len(tokens) 57 | print(f'simple match {count}/{len(tokens)} {perc:.2f} %') 58 | 59 | # JAMR like tokenizer 60 | count = 0 61 | for index in indices: 62 | new_tokens = ' '.join(protected_tokenizer(sents[index])[0]) 63 | 64 | if tokens[index] == new_tokens: 65 | count += 1 66 | elif do_break and index not in ignore_indices: 67 | print(index) 68 | print(sents[index]) 69 | print(tokens[index]) 70 | print(new_tokens) 71 | import ipdb; ipdb.set_trace(context=30) 72 | protected_tokenizer(sents[index]) 73 | 74 | perc = count * 100. / len(tokens) 75 | print(f'JAMR-like match {count}/{len(tokens)} {perc:.2f} %') 76 | 77 | 78 | if __name__ == '__main__': 79 | main(sys.argv[1], do_break=False) 80 | --------------------------------------------------------------------------------
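For reference, below is a minimal usage sketch of the tokenizer exercised by tests/tokenizer.py above. It is not part of the repository files listed here; it only assumes that transition_amr_parser is installed and that protected_tokenizer returns a tuple whose first element is the token list (as the test relies on). The example sentence is purely illustrative.

# Minimal sketch: compare simple vs. JAMR-like tokenization directly.
# Assumption: protected_tokenizer returns (tokens, positions); only the tokens are used here.
from transition_amr_parser.io import protected_tokenizer

sentence = "The boy wants to visit New York City (NYC)."

# simple rule-based tokenization
simple_tokens = protected_tokenizer(sentence, simple=True)[0]
print(' '.join(simple_tokens))

# JAMR-like tokenization, intended to reproduce the # ::tok fields of aligned AMR files
jamr_tokens = protected_tokenizer(sentence)[0]
print(' '.join(jamr_tokens))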