├── .gitignore ├── .gitmodules ├── README.md ├── amr_aligner ├── amr │ ├── __init__.py │ └── aligned.py ├── eager_actions_evaluator.py ├── eager_oracle.py ├── refresh_alignments.py ├── resources │ ├── morphosemantic_links │ │ ├── README.md │ │ ├── extract_morphosemantic.py │ │ └── morphosemantic-links.dic.bz2 │ └── word2vec │ │ └── README.md ├── rule_base_align.py ├── rule_based_aligner │ ├── __init__.py │ ├── aligned_results.py │ ├── match_result.py │ ├── matcher.py │ ├── morphosemantic-links.dic │ ├── stemmer.py │ └── updater.py ├── smatch │ ├── README.md │ ├── __init__.py │ ├── _gain.cc │ ├── _gain.h │ ├── _smatch.cpp │ ├── _smatch.pyx │ ├── amr.py │ ├── api.py │ ├── fast_smatch.py │ ├── setup.py │ ├── smatch-table.py │ └── smatch.py └── system │ ├── __init__.py │ ├── eager │ ├── __init__.py │ ├── oracle.py │ └── state.py │ ├── edge.py │ ├── misc.py │ └── node.py ├── amr_parser ├── CMakeLists.txt ├── cmake │ └── FindEigen3.cmake ├── scripts │ └── eval_eager.sh └── src │ ├── CMakeLists.txt │ ├── corpus.cc │ ├── corpus.h │ ├── ds.cc │ ├── ds.h │ ├── left_to_right │ ├── CMakeLists.txt │ ├── decode │ │ ├── CMakeLists.txt │ │ ├── testing.cc │ │ └── testing.h │ ├── ensemble.cc │ ├── evaluate │ │ ├── CMakeLists.txt │ │ ├── evaluate.cc │ │ └── evaluate.h │ ├── main.cc │ ├── parser │ │ ├── CMakeLists.txt │ │ ├── parser.cc │ │ ├── parser.h │ │ ├── parser_builder.cc │ │ ├── parser_builder.h │ │ ├── parser_eager.cc │ │ ├── parser_eager.h │ │ ├── parser_swap.cc │ │ └── parser_swap.h │ ├── system │ │ ├── CMakeLists.txt │ │ ├── eager.cc │ │ ├── eager.h │ │ ├── state.cc │ │ ├── state.h │ │ ├── swap.cc │ │ ├── swap.h │ │ ├── system.cc │ │ └── system.h │ └── train │ │ ├── CMakeLists.txt │ │ ├── algorithm.h │ │ ├── train.cc │ │ ├── train.h │ │ ├── train_supervised.cc │ │ └── train_supervised.h │ ├── logging.cc │ ├── logging.h │ ├── lstm.cc │ ├── lstm.h │ ├── math_utils.cc │ ├── math_utils.h │ ├── sys_utils.cc │ ├── sys_utils.h │ ├── trainer_utils.cc │ └── trainer_utils.h ├── awesome.md ├── pipeline.sh └── release ├── ldc2014t12 ├── README.md ├── amr-release-1.0-training_fix.patch ├── amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment.bz2 └── amr-release-1.0-training_fix.txt.sd_tok.tamr_alignment.bz2 └── ldc2017t10 ├── README.md ├── amr-release-2.0-amrs-training-bolt.txt.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-cctv.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-dfa_fix.patch ├── amr-release-2.0-amrs-training-dfa_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-dfb_fix.patch ├── amr-release-2.0-amrs-training-dfb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-guidelines_fix.patch ├── amr-release-2.0-amrs-training-guidelines_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-mt09sdl_fix.patch ├── amr-release-2.0-amrs-training-mt09sdl_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-proxy_fix.patch ├── amr-release-2.0-amrs-training-proxy_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 ├── amr-release-2.0-amrs-training-wb_fix.patch ├── amr-release-2.0-amrs-training-wb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 └── amr-release-2.0-amrs-training-xinhua.txt.no_wiki.cdec_tok.tamr_alignment.bz2 /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / 
packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | .static_storage/ 58 | .media/ 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | ## Core latex/pdflatex auxiliary files: 108 | *.aux 109 | *.lof 110 | *.log 111 | *.lot 112 | *.fls 113 | *.out 114 | *.toc 115 | *.fmt 116 | *.fot 117 | *.cb 118 | *.cb2 119 | .*.lb 120 | 121 | ## Intermediate documents: 122 | *.dvi 123 | *.xdv 124 | *-converted-to.* 125 | # these rules might exclude image files for figures etc. 
126 | # *.ps 127 | # *.eps 128 | # *.pdf 129 | 130 | ## Generated if empty string is given at "Please type another file name for output:" 131 | .pdf 132 | 133 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 134 | *.bbl 135 | *.bcf 136 | *.blg 137 | *-blx.aux 138 | *-blx.bib 139 | *.run.xml 140 | 141 | ## Build tool auxiliary files: 142 | *.fdb_latexmk 143 | *.synctex 144 | *.synctex(busy) 145 | *.synctex.gz 146 | *.synctex.gz(busy) 147 | *.pdfsync 148 | 149 | ## Auxiliary and intermediate files from other packages: 150 | # algorithms 151 | *.alg 152 | *.loa 153 | 154 | # achemso 155 | acs-*.bib 156 | 157 | # amsthm 158 | *.thm 159 | 160 | # beamer 161 | *.nav 162 | *.pre 163 | *.snm 164 | *.vrb 165 | 166 | # changes 167 | *.soc 168 | 169 | # cprotect 170 | *.cpt 171 | 172 | # elsarticle (documentclass of Elsevier journals) 173 | *.spl 174 | 175 | # endnotes 176 | *.ent 177 | 178 | # fixme 179 | *.lox 180 | 181 | # feynmf/feynmp 182 | *.mf 183 | *.mp 184 | *.t[1-9] 185 | *.t[1-9][0-9] 186 | *.tfm 187 | 188 | #(r)(e)ledmac/(r)(e)ledpar 189 | *.end 190 | *.?end 191 | *.[1-9] 192 | *.[1-9][0-9] 193 | *.[1-9][0-9][0-9] 194 | *.[1-9]R 195 | *.[1-9][0-9]R 196 | *.[1-9][0-9][0-9]R 197 | *.eledsec[1-9] 198 | *.eledsec[1-9]R 199 | *.eledsec[1-9][0-9] 200 | *.eledsec[1-9][0-9]R 201 | *.eledsec[1-9][0-9][0-9] 202 | *.eledsec[1-9][0-9][0-9]R 203 | 204 | # glossaries 205 | *.acn 206 | *.acr 207 | *.glg 208 | *.glo 209 | *.gls 210 | *.glsdefs 211 | 212 | # gnuplottex 213 | *-gnuplottex-* 214 | 215 | # gregoriotex 216 | *.gaux 217 | *.gtex 218 | 219 | # hyperref 220 | *.brf 221 | 222 | # knitr 223 | *-concordance.tex 224 | # TODO Comment the next line if you want to keep your tikz graphics files 225 | *.tikz 226 | *-tikzDictionary 227 | 228 | # listings 229 | *.lol 230 | 231 | # makeidx 232 | *.idx 233 | *.ilg 234 | *.ind 235 | *.ist 236 | 237 | # minitoc 238 | *.maf 239 | *.mlf 240 | *.mlt 241 | *.mtc[0-9]* 242 | *.slf[0-9]* 243 | *.slt[0-9]* 244 | *.stc[0-9]* 245 | 246 | # minted 247 | _minted* 248 | *.pyg 249 | 250 | # morewrites 251 | *.mw 252 | 253 | # nomencl 254 | *.nlo 255 | 256 | # pax 257 | *.pax 258 | 259 | # pdfpcnotes 260 | *.pdfpc 261 | 262 | # sagetex 263 | *.sagetex.sage 264 | *.sagetex.py 265 | *.sagetex.scmd 266 | 267 | # scrwfile 268 | *.wrt 269 | 270 | # sympy 271 | *.sout 272 | *.sympy 273 | sympy-plots-for-*.tex/ 274 | 275 | # pdfcomment 276 | *.upa 277 | *.upb 278 | 279 | # pythontex 280 | *.pytxcode 281 | pythontex-files-*/ 282 | 283 | # thmtools 284 | *.loe 285 | 286 | # TikZ & PGF 287 | *.dpth 288 | *.md5 289 | *.auxlock 290 | 291 | # todonotes 292 | *.tdo 293 | 294 | # easy-todo 295 | *.lod 296 | 297 | # xindy 298 | *.xdy 299 | 300 | # xypic precompiled matrices 301 | *.xyc 302 | 303 | # endfloat 304 | *.ttt 305 | *.fff 306 | 307 | # Latexian 308 | TSWLatexianTemp* 309 | 310 | ## Editors: 311 | # WinEdt 312 | *.bak 313 | *.sav 314 | 315 | # Texpad 316 | .texpadtmp 317 | 318 | # Kile 319 | *.backup 320 | 321 | # KBibTeX 322 | *~[0-9]* 323 | 324 | # auto folder when using emacs and auctex 325 | ./auto/* 326 | *.el 327 | 328 | # expex forward references with \gathertags 329 | *-tags.tex 330 | 331 | # standalone packages 332 | *.sta 333 | 334 | data/ 335 | align_results/ 336 | reports/ 337 | analysis/ 338 | output/ 339 | parser_l2r* 340 | *.pdf 341 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "amr_parser/dynet"] 2 | path = 
amr_parser/dynet
3 | url = https://github.com/clab/dynet.git
4 | [submodule "amr_parser/dynet_layer"]
5 | path = amr_parser/dynet_layer
6 | url = https://github.com/Oneplus/dynet_layer.git
7 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | tamr
2 | ====
3 | 
4 | A transition-based AMR parser along with an aligner tuned by the parser.
5 | Used in our EMNLP 2018 paper [An AMR Aligner Tuned by Transition-based Parser](https://arxiv.org/pdf/1810.03541.pdf).
6 | 
7 | 
8 | ## Notation
9 | 
10 | In the following sections, we use the following notation:
11 | 
12 | - `${TAMR_HOME}`: the root directory of the project
13 | - `${TAMR_ALIGNER}`: the directory of the AMR aligner, which equals
14 | to `${TAMR_HOME}/amr_aligner`
15 | - `${TAMR_PARSER}`: the directory of the transition-based parser, which equals
16 | to `${TAMR_HOME}/amr_parser`
17 | 
18 | ## Aligner
19 | 
20 | The code for the AMR aligner is under `${TAMR_ALIGNER}`.
21 | 
22 | ### Pre-requisites
23 | 
24 | - python2.7
25 | - JAMR
26 | - nltk
27 | - gensim
28 | - penman
29 | - Cython (optional, for fast_smatch.py)
30 | 
31 | ### Prepare resource
32 | We use `word2vec` for semantic matching. See the [README.md](https://github.com/Oneplus/tamr/tree/master/amr_aligner/resources/word2vec)
33 | for more information about filtering the word vectors.
34 | 
35 | ### Prepare data
36 | Our alignment is built on the JAMR alignment results.
37 | You can get the input data with the following commands:
38 | ```
39 | pushd "$JAMR_HOME" > /dev/null
40 | . scripts/config.sh
41 | scripts/ALIGN.sh < /path/to/your/input/data > /path/to/your/baseline/data
42 | ```
43 | 
44 | ### Run the Aligner
45 | Go into `${TAMR_ALIGNER}` and run the following commands:
46 | 
47 | ```
48 | python rule_base_align.py \
49 |     -verbose \
50 |     -data \
51 |     /path/to/your/baseline/data \
52 |     -output \
53 |     /path/to/your/alignment/data \
54 |     -wordvec \
55 |     /path/to/your/wordvec/data \
56 |     -trials \
57 |     10000 \
58 |     -improve_perfect \
59 |     -morpho_match \
60 |     -semantic_match
61 | ```
62 | 
63 | The quality of an alignment is evaluated by the smatch
64 | score of the graph
65 | it leads to. With `-improve_perfect`, the aligner will
66 | keep updating the alignment even when the baseline alignment
67 | already achieves a smatch score of 1.0.
68 | 
69 | The output alignment is shown as blocks of results in the following format:
70 | ```
71 | id
72 | # ::alignment:
73 | ```
74 | 
75 | **[2018/12/20 update]** the old `replace_comments.py`
76 | did not update the alignment in the `# ::node` fields,
77 | which is used by `eager_oracle.py`. Please use the
78 | `refresh_alignments.py` script to generate new alignment data.
79 | Thanks @jcyk for spotting the bug!
80 | 
81 | After getting the alignment, use the following commands to generate
82 | the new alignment data:
83 | ```
84 | python refresh_alignments.py \
85 |     -lexicon \
86 |     /path/to/your/alignment/data \
87 |     -data \
88 |     /path/to/your/baseline/data \
89 |     > /path/to/your/new/alignment/data
90 | ```
91 | 
92 | You can also use `refresh_alignments.py` to produce an aligned AMR file
93 | for LDC2014T12 from the alignment we release.
94 | 
95 | ## Parser
96 | 
97 | ### Pre-requisites
98 | 
99 | - cmake
100 | - a C++ compiler supporting C++11
101 | - Eigen
102 | 
103 | ### Build
104 | 
105 | Before compiling, you need to fetch the `dynet` and `dynet_layer` submodules with
106 | ```
107 | git submodule init
108 | git submodule update
109 | ```
110 | under `${TAMR_HOME}`.
111 | 
112 | After fetching the submodules, run the following commands.
113 | 
114 | ```
115 | cd amr_parser
116 | mkdir build
117 | cd build
118 | cmake .. -DEIGEN3_INCLUDE_DIR=/path/to/your/eigen/
119 | make
120 | ```
121 | 
122 | The compilation will generate an executable under `${TAMR_PARSER}/bin/`.
123 | 
124 | ### Prepare data
125 | 
126 | After getting your aligned data,
127 | run `${TAMR_ALIGNER}/eager_oracle.py`
128 | to generate the training action file from the alignment:
129 | ```
130 | python eager_oracle.py \
131 |     -mod \
132 |     dump \
133 |     -aligned \
134 |     /path/to/your/new/alignment/data \
135 |     > /path/to/your/actions
136 | ```
137 | 
138 | ### Training the Parser
139 | Train the parser with the following command under `${TAMR_PARSER}`:
140 | ```
141 | ./amr_parser/bin/parser_l2r \
142 |     --dynet-seed \
143 |     1 \
144 |     --train \
145 |     --training_data \
146 |     /path/to/your/new/actions/training/data \
147 |     --devel_data \
148 |     /path/to/your/new/actions/dev/data \
149 |     --test_data \
150 |     /path/to/your/new/actions/test/data \
151 |     --pretrained \
152 |     /path/to/your/embedding/file \
153 |     --model \
154 |     data/little_prince/model \
155 |     --optimizer_enable_eta_decay \
156 |     true \
157 |     --optimizer_enable_clipping \
158 |     true \
159 |     --external_eval \
160 |     ./amr_parser/scripts/eval_eager.sh \
161 |     --devel_gold \
162 |     /path/to/your/new/alignment/dev/data \
163 |     --test_gold \
164 |     /path/to/your/new/alignment/test/data \
165 |     --max_iter \
166 |     1
167 | ```
168 | 
169 | ## Released Alignments
170 | 
171 | ### [LDC2014T12](https://catalog.ldc.upenn.edu/LDC2014T12)
172 | 
173 | You can find our alignment for LDC2014T12 under `${TAMR_HOME}/release/ldc2014t12`.
174 | Since JAMR and CAMR use different tokenization, our release includes
175 | the alignment processed with cdec tokenization and with Stanford tokenization.
176 | 
177 | ### [LDC2017T10](https://catalog.ldc.upenn.edu/LDC2017T10)
178 | 
179 | You can find our alignment for LDC2017T10 under `${TAMR_HOME}/release/ldc2017t10`.
180 | Our release only contains the alignment processed with cdec tokenization.
181 | 
182 | ## Pipeline Script
183 | 
184 | We demonstrate the full process in the `pipeline.sh` script.
185 | 
186 | ## Awesome AMR
187 | 
188 | Our alignment helps other AMR parsers achieve better performance.
189 | We show how to hack several open-source AMR parsers and replace
190 | their alignment with ours in [awesome.md](https://github.com/Oneplus/tamr/blob/master/awesome.md).
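
## Scoring with the Smatch API

For quick sanity checks outside of the full pipeline, you can call the bundled
smatch wrapper (`${TAMR_ALIGNER}/smatch/api.py`) yourself. A minimal sketch,
run from `${TAMR_ALIGNER}` so that `smatch` is importable; the two single-line
AMR strings below are illustrative placeholders, not data from our releases:

```
from smatch.api import smatch, SmatchScorer

gold = '(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-01 :ARG0 b))'
pred = '(w / want-01 :ARG0 (b / boy))'

print(smatch(pred, gold))   # f-score for a single pair

scorer = SmatchScorer()     # corpus-level: accumulate counts pair by pair
scorer.update(pred, gold)
print(scorer.f_score())     # aggregate f-score over all pairs seen so far
```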
191 | 
192 | ## Contact
193 | 
194 | Yijia Liu <>
195 | 
--------------------------------------------------------------------------------
/amr_aligner/amr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/amr/__init__.py
--------------------------------------------------------------------------------
/amr_aligner/eager_actions_evaluator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import argparse
5 | import codecs
6 | from system.node import TokenNode, EntityNode, ConceptNode
7 | from system.eager.state import State
8 | from amr.aligned import AlignmentReader, Alignment
9 | from smatch.api import SmatchScorer
10 | 
11 | 
12 | class Generator(object):
13 |     def __init__(self, verbose=False):
14 |         self.verbose = verbose
15 | 
16 |     def parse(self, align, actions):
17 |         state = State(align)
18 | 
19 |         for action in actions:
20 |             if action[0] == 'SHIFT':
21 |                 state.shift()
22 |             elif action[0] == 'DROP':
23 |                 state.drop()
24 |             elif action[0] == 'REDUCE':
25 |                 state.reduce()
26 |             elif action[0] == 'CACHE':
27 |                 state.cache()
28 |             elif action[0] == 'MERGE':
29 |                 state.merge()
30 |             elif action[0] == 'CONFIRM':
31 |                 if action[2] == '_UNK_':
32 |                     action[2] = state.buffer_[0].name
33 |                 state.confirm(action[2])
34 |             elif action[0] == 'ENTITY':
35 |                 state.entity(action[1], None)
36 |             elif action[0] == 'LEFT':
37 |                 state.left(action[1])
38 |             elif action[0] == 'RIGHT':
39 |                 state.right(action[1])
40 |             elif action[0] == 'NEWNODE':
41 |                 state.add_newnode(ConceptNode(action[1], None, None))
42 |                 state.newnode()
43 |             else:
44 |                 assert False, 'unknown action: {0}'.format(action[0])
45 | 
46 |         return state
47 | 
48 | 
49 | def main():
50 |     cmd = argparse.ArgumentParser(usage='the evaluation script.')
51 |     cmd.add_argument('-gold', help='the path to the gold amr graph.')
52 |     cmd.add_argument('-pred_actions', help='the path to the predicted actions.')
53 |     opt = cmd.parse_args()
54 | 
55 |     reader = AlignmentReader(opt.gold)
56 |     generator = Generator()
57 |     scorer = SmatchScorer()
58 | 
59 |     predict_dataset = codecs.open(opt.pred_actions, 'r', encoding='utf-8').read().strip().split('\n\n')
60 |     for block, predict_data in zip(reader, predict_dataset):
61 |         graph = Alignment(block)
62 |         actions = [line.replace('# ::action\t', '').split('\t')
63 |                    for line in predict_data.splitlines() if line.startswith('# ::action')]
64 |         try:
65 |             state = generator.parse(graph, actions)
66 |             predict_amr_graph = str(state.arcs_).encode('utf-8')
67 |         except:
68 |             # print('{0}'.format(graph.n))
69 |             # print('Failed to parse actions:')
70 |             # for action in actions:
71 |             #     print(' - {0}'.format('\t'.join(action).encode('utf-8')))
72 |             predict_amr_graph = '(a / amr-empty)'
73 |         scorer.update(graph.amr_graph, predict_amr_graph)
74 |     print(scorer.f_score())
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     main()
79 | 
--------------------------------------------------------------------------------
/amr_aligner/eager_oracle.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import sys
5 | import traceback
6 | import argparse
7 | import time
8 | from amr.aligned import Alignment, AlignmentReader
9 | from system.eager.oracle import Oracle
10 | from
smatch.api import smatch 11 | 12 | 13 | def main(): 14 | cmd = argparse.ArgumentParser('Test the program.') 15 | cmd.add_argument('-mod', default='evaluate', choices=('parse', 'evaluate', 'dump'), 16 | help='the running mode. -parse: evaluate the best AMR graph achieved by the alignment ' 17 | '(specified in ::alignment field) and use the resulted graph to replace the original' 18 | 'AMR graph; -evaluate: same as parser without replacement; -dump: dump action file.') 19 | cmd.add_argument('-aligned', help='the path to the filename.') 20 | cmd.add_argument('-verbose', default=False, action='store_true', help='verbose the actions.') 21 | opt = cmd.parse_args() 22 | 23 | align_handler = AlignmentReader(opt.aligned) 24 | parser = Oracle(verbose=opt.verbose) 25 | 26 | for align_block in align_handler: 27 | graph = Alignment(align_block) 28 | try: 29 | actions, state = parser.parse(graph) 30 | 31 | if opt.mod in ('parse', 'evaluate'): 32 | predicted_amr_graph = str(state.arcs_) 33 | f_score = smatch(predicted_amr_graph, graph.amr_graph) 34 | for line in align_block: 35 | if line.startswith('# ::alignments'): 36 | line = line + ' ::parser eager_oracle.py' \ 37 | ' ::smatch {0} ::n_actions {1}'.format(f_score, len(actions)) 38 | if line.startswith('('): 39 | break 40 | print(line.encode('utf-8')) 41 | if opt.mod == 'parse': 42 | print(str(state.arcs_)) 43 | else: 44 | print(graph.amr_graph) 45 | else: 46 | print('# ::id {0}'.format(graph.n)) 47 | for line in align_block: 48 | if line.startswith('# ::tok') or line.startswith('# ::pos'): 49 | print(line.encode('utf-8')) 50 | print('\n'.join(['# ::action {0}'.format(action) for action in actions])) 51 | print() 52 | 53 | if opt.verbose: 54 | print(graph.n, file=sys.stderr) 55 | print('\n'.join(actions), file=sys.stderr, end='\n\n') 56 | except Exception: 57 | print(graph.n, file=sys.stderr) 58 | traceback.print_exc(file=sys.stderr) 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /amr_aligner/refresh_alignments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | import argparse 5 | from amr.aligned import AlignmentReader, Alignment 6 | 7 | 8 | def main(): 9 | cmd = argparse.ArgumentParser('Get the block that contains certain amr graph.') 10 | cmd.add_argument('-lexicon', help='the path to the alignment file.') 11 | cmd.add_argument('-data', help='the path to the alignment file.') 12 | cmd.add_argument('-keep_alignment_in_node', default=False, action='store_true', help='') 13 | opt = cmd.parse_args() 14 | 15 | lexicon = {} 16 | for data in open(opt.lexicon, 'r').read().strip().split('\n\n'): 17 | lines = data.splitlines() 18 | assert len(lines) == 2 19 | lexicon[lines[0].strip()] = lines[1].strip() 20 | 21 | handler = AlignmentReader(opt.data) 22 | for block in handler: 23 | graph = Alignment(block) 24 | new_alignment = lexicon[graph.n] 25 | 26 | graph.alignments = Alignment._parse_alignment([new_alignment]) 27 | graph.refill_alignment() 28 | 29 | for line in block: 30 | if line.startswith('#'): 31 | if line.startswith('# ::alignments'): 32 | print(new_alignment) 33 | else: 34 | if not opt.keep_alignment_in_node and line.startswith('# ::node'): 35 | tokens = line.split() 36 | level = tokens[2] 37 | alignment = graph.get_node_by_level(level).alignment 38 | print('# ::node\t{0}\t{1}\t{2}'.format( 39 | tokens[2], 
tokens[3], '{0}-{1}'.format(alignment[0], alignment[1]) if alignment else ''))
40 |                 else:
41 |                     print(line.encode('utf-8'))
42 | 
43 |         print(graph.amr_graph.encode('utf-8'), end='\n\n')
44 | 
45 | 
46 | if __name__ == "__main__":
47 |     main()
48 | 
--------------------------------------------------------------------------------
/amr_aligner/resources/morphosemantic_links/README.md:
--------------------------------------------------------------------------------
1 | Processed morphosemantic links
2 | ==============================
3 | 
4 | The original [Morphosemantic Links Database](https://wordnet.princeton.edu/download/standoff-files)
5 | contains derivational links connecting noun and verb senses.
6 | In this project, we provide a processed mapping
7 | between a verb and its noun form in `morphosemantic-links.dic`.
8 | We use `extract_morphosemantic.py` to
9 | extract the dictionary from the CSV export of
10 | [morphosemantic-links.xls](http://wordnetcode.princeton.edu/standoff-files/morphosemantic-links.xls).
11 | 
12 | Use `bzip2 -d` to extract the raw text from the compressed `.dic.bz2` file.
13 | 
14 | ### LICENSE
15 | 
16 | ```
17 | WordNet Release 3.0 This software and database is being provided to you, the LICENSEE,
18 | by Princeton University under the following license. By obtaining, using and/or copying
19 | this software and database, you agree that you have read, understood, and will comply
20 | with these terms and conditions.: Permission to use, copy, modify and distribute this
21 | software and database and its documentation for any purpose and without fee or royalty
22 | is hereby granted, provided that you agree to comply with the following copyright notice
23 | and statements, including the disclaimer, and that the same appear on ALL copies of the
24 | software, database and documentation, including modifications that you make for internal
25 | use or for distribution. WordNet 3.0 Copyright 2006 by Princeton University. All rights
26 | reserved. THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES
27 | NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
28 | LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF
29 | MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED
30 | SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS,
31 | COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton University or Princeton
32 | may not be used in advertising or publicity pertaining to distribution of the software
33 | and/or database. Title to copyright in this software, database and any associated
34 | documentation shall at all times remain with Princeton University and LICENSEE agrees
35 | to preserve same.
36 | ```
37 | 
38 | 
--------------------------------------------------------------------------------
/amr_aligner/resources/morphosemantic_links/extract_morphosemantic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | import sys
4 | 
5 | 
6 | def main():
7 |     lexicon = set()
8 |     is_header = True
9 |     for line in open(sys.argv[1], 'r'):
10 |         if is_header:  # skip the CSV header row
11 |             is_header = False
12 |             continue
13 |         tokens = line.strip().split(',')
14 |         verb, noun = tokens[0], tokens[3]
15 |         verb = verb.split('%')[0]  # strip the WordNet sense key, e.g. abduct%2:35:00:: -> abduct
16 |         noun = noun.split('%')[0]
17 |         lexicon.add((verb, noun))
18 |     for verb, noun in lexicon:
19 |         print('{0},{1}'.format(verb, noun))
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 
--------------------------------------------------------------------------------
/amr_aligner/resources/morphosemantic_links/morphosemantic-links.dic.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/resources/morphosemantic_links/morphosemantic-links.dic.bz2
--------------------------------------------------------------------------------
/amr_aligner/resources/word2vec/README.md:
--------------------------------------------------------------------------------
1 | Word2vec for our aligner
2 | ========================
3 | 
4 | We use the [glove.840B.300d](http://nlp.stanford.edu/data/glove.840B.300d.zip) embeddings.
5 | We suggest filtering the embeddings to the words and concepts
6 | (with the word-sense suffix trimmed) that appear in the data.
--------------------------------------------------------------------------------
/amr_aligner/rule_based_aligner/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/rule_based_aligner/__init__.py
--------------------------------------------------------------------------------
/amr_aligner/rule_based_aligner/aligned_results.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | 
4 | class AlignedResults(object):
5 |     def __init__(self, multiple=True):
6 |         self.spans_to_levels = {}
7 |         self.levels_to_spans = {}
8 |         self.multiple = multiple
9 | 
10 |     def add(self, start, end, level, dependent):
11 |         if self.multiple:
12 |             return self._mutualisticly_add(start, end, level, dependent)
13 |         else:
14 |             return self._exclusively_add(start, end, level, dependent)
15 | 
16 |     def _mutualisticly_add(self, start, end, level, dependent):
17 |         added = False
18 |         if (start, end) not in self.spans_to_levels:
19 |             self.spans_to_levels[start, end] = set()
20 |         if (level, dependent) not in self.spans_to_levels[start, end]:
21 |             added = True
22 |             self.spans_to_levels[start, end].add((level, dependent))
23 | 
24 |         if level not in self.levels_to_spans:
25 |             self.levels_to_spans[level] = set()
26 |         self.levels_to_spans[level].add((start, end, dependent))
27 |         return added
28 | 
29 |     def _exclusively_add(self, start, end, level, dependent):
30 |         # first check if the concept is aligned.
31 | if level in self.levels_to_spans: 32 | return False 33 | self.levels_to_spans[level] = {(start, end, dependent)} 34 | added = False 35 | if dependent is not None: 36 | if (start, end) not in self.spans_to_levels: 37 | self.spans_to_levels[start, end] = set() 38 | if (level, dependent) not in self.spans_to_levels[start, end]: 39 | added = True 40 | self.spans_to_levels[start, end].add((level, dependent)) 41 | else: 42 | overlap = False 43 | for new_start, new_end in self.spans_to_levels: 44 | if (start < new_start < end) or (start < new_end < end): 45 | overlap = True 46 | break 47 | if not overlap: 48 | if (start, end) not in self.spans_to_levels: 49 | self.spans_to_levels[start, end] = set() 50 | if (level, dependent) not in self.spans_to_levels[start, end]: 51 | added = True 52 | self.spans_to_levels[start, end].add((level, dependent)) 53 | return added 54 | 55 | def contains(self, level): 56 | return level in self.levels_to_spans 57 | 58 | def get_spans_by_level(self, level): 59 | return set([(start, end) for start, end, _ in self.levels_to_spans.get(level, set())]) 60 | 61 | def get_levels_by_span(self, start, end): 62 | return set([level for level, _ in self.spans_to_levels.get((start, end), set())]) 63 | -------------------------------------------------------------------------------- /amr_aligner/rule_based_aligner/match_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | 4 | 5 | class MatchResult(object): 6 | def __init__(self, level, signature): 7 | self.level = level 8 | self.signature = signature 9 | 10 | def __eq__(self, other): 11 | if isinstance(other, MatchResult): 12 | return self.level == other.level 13 | return False 14 | 15 | def __str__(self): 16 | return '{0}={1}'.format(self.signature, self.level) 17 | 18 | def __repr__(self): 19 | return self.__str__() 20 | 21 | def __hash__(self): 22 | return self.level.__hash__() 23 | 24 | 25 | class EntityMatchResult(MatchResult): 26 | def __init__(self, level, children_levels, signature): 27 | super(EntityMatchResult, self).__init__(level, signature) 28 | self.children_levels = children_levels 29 | 30 | def __str__(self): 31 | return '{0}=({1}, {2})'.format(self.signature, self.level, self.children_levels) 32 | 33 | 34 | class WordMatchResult(MatchResult): 35 | def __init__(self, level, signature='word'): 36 | super(WordMatchResult, self).__init__(level, signature) 37 | 38 | def __eq__(self, other): 39 | if isinstance(other, WordMatchResult) or \ 40 | isinstance(other, FuzzyWordMatchResult) or \ 41 | isinstance(other, SemanticWordMatchResult): 42 | return self.level == other.level 43 | return False 44 | 45 | 46 | class FuzzyWordMatchResult(WordMatchResult): 47 | def __init__(self, level): 48 | super(FuzzyWordMatchResult, self).__init__(level, '(fuzzy)word') 49 | 50 | 51 | class MorphosemanticLinkMatchResult(WordMatchResult): 52 | def __init__(self, level): 53 | super(MorphosemanticLinkMatchResult, self).__init__(level, '(morph)word') 54 | 55 | 56 | class SemanticWordMatchResult(WordMatchResult): 57 | def __init__(self, level): 58 | super(SemanticWordMatchResult, self).__init__(level, '(sem)word') 59 | 60 | 61 | class FuzzySpanMatchResult(MatchResult): 62 | def __init__(self, level): 63 | super(FuzzySpanMatchResult, self).__init__(level, '(fuzzy)span') 64 | 65 | 66 | class NamedEntityMatchResult(EntityMatchResult): 67 | def __init__(self, level, children_levels, signature='entity'): 68 | 
super(NamedEntityMatchResult, self).__init__(level, children_levels, signature) 69 | 70 | def __eq__(self, other): 71 | if isinstance(other, FuzzyNamedEntityMatchResult) or \ 72 | isinstance(other, NamedEntityMatchResult) or \ 73 | isinstance(other, SemanticNamedEntityMatchResult): 74 | return self.level == other.level 75 | return False 76 | 77 | 78 | class FuzzyNamedEntityMatchResult(NamedEntityMatchResult): 79 | def __init__(self, level, children_levels): 80 | super(FuzzyNamedEntityMatchResult, self).__init__(level, children_levels, '(fuzzy)entity') 81 | 82 | 83 | class SemanticNamedEntityMatchResult(NamedEntityMatchResult): 84 | def __init__(self, level, children_levels): 85 | super(SemanticNamedEntityMatchResult, self).__init__(level, children_levels, '(sem)entity') 86 | 87 | 88 | class URLEntityMatchResult(EntityMatchResult): 89 | def __init__(self, level, children_levels): 90 | super(URLEntityMatchResult, self).__init__(level, children_levels, 'url-entity') 91 | 92 | 93 | class OrdinalEntityMatchResult(EntityMatchResult): 94 | def __init__(self, level, children_levels): 95 | super(OrdinalEntityMatchResult, self).__init__(level, children_levels, 'ordinal-entity') 96 | 97 | 98 | class DateEntityMatchResult(MatchResult): 99 | def __init__(self, level, children_levels): 100 | super(DateEntityMatchResult, self).__init__(level, 'date-entity') 101 | self.children_levels = children_levels 102 | 103 | def __str__(self): 104 | return 'date-entity=({0}, {1})'.format(self.level, self.children_levels) 105 | 106 | 107 | class MinusPolarityMatchResult(MatchResult): 108 | def __init__(self, level): 109 | super(MinusPolarityMatchResult, self).__init__(level, 'minus') 110 | 111 | 112 | class EntityTypeMatchResult(MatchResult): 113 | def __init__(self, level): 114 | super(EntityTypeMatchResult, self).__init__(level, 'entity_type') 115 | 116 | 117 | class QuantityMatchResult(MatchResult): 118 | def __init__(self, level): 119 | super(QuantityMatchResult, self).__init__(level, 'quantity') 120 | 121 | 122 | class PersonOfUpdateResult(MatchResult): 123 | def __init__(self, level): 124 | super(PersonOfUpdateResult, self).__init__(level, 'person_of') 125 | 126 | 127 | class PersonUpdateResult(MatchResult): 128 | def __init__(self, level): 129 | super(PersonUpdateResult, self).__init__(level, 'person') 130 | 131 | 132 | class GovernmentOrganizationUpdateResult(MatchResult): 133 | def __init__(self, level): 134 | super(GovernmentOrganizationUpdateResult, self).__init__(level, 'gov_org') 135 | 136 | 137 | class MinusPolarityPrefixesUpdateResult(MatchResult): 138 | def __init__(self, level): 139 | super(MinusPolarityPrefixesUpdateResult, self).__init__(level, 'minus_prefix') 140 | 141 | 142 | class DegreeUpdateResult(MatchResult): 143 | def __init__(self, level): 144 | super(DegreeUpdateResult, self).__init__(level, 'degree') 145 | 146 | 147 | class RelativePositionUpdateResult(MatchResult): 148 | def __init__(self, level): 149 | super(RelativePositionUpdateResult, self).__init__(level, 'relative_position') 150 | 151 | 152 | class HaveOrgRoleUpdateResult(MatchResult): 153 | def __init__(self, level): 154 | super(HaveOrgRoleUpdateResult, self).__init__(level, 'have-org-role-91') 155 | 156 | 157 | class CauseUpdateResult(MatchResult): 158 | def __init__(self, level): 159 | super(CauseUpdateResult, self).__init__(level, 'cause01') 160 | 161 | 162 | class BelocatedAtMatchResult(MatchResult): 163 | def __init__(self, level): 164 | super(BelocatedAtMatchResult, self).__init__(level, 'be-located-91') 165 | 166 | 167 | 
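# NOTE (editorial, illustrative): `MatchResult` defines `__eq__`/`__hash__` on
# `level` alone, so results for the same node produced by different matchers
# collapse when collected into a set, e.g.:
#
#   r1 = WordMatchResult('1.1')
#   r2 = FuzzyWordMatchResult('1.1')
#   assert r1 == r2 and len({r1, r2}) == 1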
class ImperativeUpdateResult(MatchResult): 168 | def __init__(self, level): 169 | super(ImperativeUpdateResult, self).__init__(level, 'imperative') 170 | 171 | 172 | class PossibleUpdateResult(MatchResult): 173 | def __init__(self, level): 174 | super(PossibleUpdateResult, self).__init__(level, 'possible') 175 | 176 | 177 | __all__ = [ 178 | 'EntityTypeMatchResult', 179 | 'QuantityMatchResult', 180 | 'DateEntityMatchResult', 181 | 'URLEntityMatchResult', 182 | 'OrdinalEntityMatchResult', 183 | 'MinusPolarityMatchResult', 184 | 'BelocatedAtMatchResult', 185 | 'FuzzyWordMatchResult', 186 | 'FuzzySpanMatchResult', 187 | 'MorphosemanticLinkMatchResult', 188 | 'SemanticWordMatchResult', 189 | 'NamedEntityMatchResult', 190 | 'FuzzyNamedEntityMatchResult', 191 | 'SemanticNamedEntityMatchResult', 192 | 193 | 'PersonOfUpdateResult', 194 | 'PersonUpdateResult', 195 | 'RelativePositionUpdateResult', 196 | 'GovernmentOrganizationUpdateResult', 197 | 'MinusPolarityPrefixesUpdateResult', 198 | 'DegreeUpdateResult', 199 | 'HaveOrgRoleUpdateResult', 200 | 'CauseUpdateResult', 201 | 'ImperativeUpdateResult', 202 | 'WordMatchResult', 203 | 'PossibleUpdateResult', 204 | ] 205 | -------------------------------------------------------------------------------- /amr_aligner/rule_based_aligner/stemmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | from nltk.stem.wordnet import WordNetLemmatizer 4 | lemmatizer = WordNetLemmatizer() 5 | 6 | 7 | class Stemmer(object): 8 | kMinusPrefix2 = ('un', 'in', 'il', 'im', 'ir', 'il', 'Un', 'In', 'Il', 'Im', 'Ir', 'Il') 9 | kMinusPrefix3 = ('non', 'Non') 10 | 11 | kMonths = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 12 | 'September', 'October', 'November', 'December'] 13 | 14 | kNumbers = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] 15 | 16 | kExceptions = {'.': ('multi-sentence', ), 17 | ';': ('and', 'multi-sentence', ), 18 | ':': ('mean', ), 19 | '!': ('expressive', ), 20 | '..': ('expressive', ), 21 | '...': ('expressive', ), 22 | '....': ('expressive', ), 23 | '?': ('interrogative', ), 24 | '%': ('percentage-entity', ), 25 | '$': ('dollar', ), 26 | 'also': ('include',), 27 | 'anti': ('oppose', 'counter'), 28 | 'but': ('contrast', 'have-concession'), 29 | 'while': ('contrast', ), 30 | 'because': ('cause',), 31 | 'whereby': ('cause', ), 32 | 'if': ('cause', 'interrogative'), 33 | 'by': ('cause', ), 34 | 'for': ('cause', ), 35 | 'so': ('infer', 'cause'), 36 | 'since': ('cause', ), 37 | 'on': ('cause', ), 38 | 'in': ('cause', ), 39 | 'against': ('-', ), 40 | 'no': ('-',), 41 | 'non': ('-', ), 42 | 'not': ('-', ), 43 | 'n\'t': ('-', ), 44 | 'never': ('-', ), 45 | 'yet': ('-', ), 46 | 'neither': ('-', ), 47 | 'of': ('include', 'have-manner', ), 48 | 'might': ('possible', ), 49 | 'may': ('possible', ), 50 | 'maybe': ('possible', ), 51 | 'could': ('possible', ), 52 | 'can': ('possible', ), 53 | 'cant': ('possible', ), 54 | 'cannot': ('possible', ), 55 | 'can\'t': ('possible', ), 56 | 'should': ('recommend', ), 57 | 'who': ('amr-unknown', ), 58 | 'what': ('amr-unknown', ), 59 | 'how': ('amr-unknown', ), 60 | 'as': ('and', 'same', 'contrast',), 61 | 'with': ('and', ), 62 | 'plus': ('and', ), 63 | '-': ('and', ), 64 | 'without': ('-', ), 65 | 'me': ('i', ), 66 | 'my': ('i', ), 67 | 'her': ('she', ), 68 | 'his': ('he', ), 69 | 'him': ('he', ), 70 | 'us': ('we', ), 71 | 'our': ('we', ), 72 | 'ours': ('we', ), 73 | 
'your': ('you', ),
74 |                    'yourself': ('you', ),
75 |                    'these': ('this', ),
76 |                    'those': ('that', ),
77 |                    'o.k.': ('okay', ),
78 |                    'death': ('die',),
79 |                    'deaths': ('die', ),
80 |                    'like': ('resemble', ),
81 |                    'similar': ('resemble', ),
82 |                    'right': ('entitle', ),
83 |                    'rights': ('entitle',),
84 |                    'must': ('obligate',),
85 |                    'etc': ('et-cetera',),
86 |                    'according': ('say', ),}
87 | 
88 |     def __init__(self):
89 |         pass
90 | 
91 |     def stem(self, word, postag):
92 |         ret = set()
93 |         ret.add(word)
94 |         ret.add(word.lower())
95 | 
96 |         # lemmatize
97 |         if postag is not None:
98 |             ret.add(lemmatizer.lemmatize(word.lower(), postag))
99 |         else:
100 |             ret.add(lemmatizer.lemmatize(word.lower(), 'n'))
101 |             ret.add(lemmatizer.lemmatize(word.lower(), 'v'))
102 |             ret.add(lemmatizer.lemmatize(word.lower(), 'a'))
103 |             ret.add(lemmatizer.lemmatize(word.lower(), 's'))
104 | 
105 |         # normalize month
106 |         month_normalized_word = self._normalize_month(word)
107 |         if month_normalized_word is not None:
108 |             ret.add(month_normalized_word)
109 | 
110 |         # normalize number
111 |         number_normalized_word = self._normalize_number(word)
112 |         if number_normalized_word is not None:
113 |             ret.add(number_normalized_word)
114 | 
115 |         # normalize exceptions
116 |         exception_normalized_words = self._normalize_exceptions(word)
117 |         if exception_normalized_words is not None:
118 |             for exception_normalized_word in exception_normalized_words:
119 |                 ret.add(exception_normalized_word)
120 | 
121 |         other_normalized_word = self._normalize_others(word)
122 |         if other_normalized_word is not None:
123 |             ret.add(other_normalized_word)
124 |         return ret
125 | 
126 |     def _normalize_number(self, word):
127 |         if word.lower() in self.kNumbers:
128 |             return str(self.kNumbers.index(word.lower()) + 1)
129 |         elif ',' in word and word.replace(',', '').isdigit():
130 |             return word.replace(',', '')
131 |         return None
132 | 
133 |     def _normalize_month(self, word):
134 |         if word in self.kMonths:
135 |             return str(self.kMonths.index(word) + 1)
136 |         return None
137 | 
138 |     def _normalize_exceptions(self, word):
139 |         if word.lower() in self.kExceptions:
140 |             return self.kExceptions[word.lower()]
141 |         return None
142 | 
143 |     def _normalize_others(self, word):
144 |         if word[:3] in self.kMinusPrefix3:
145 |             return word[3:]
146 |         elif word[:2] in self.kMinusPrefix2:
147 |             return word[2:]
148 |         elif word.endswith('er'):
149 |             return word[:-2]
150 |         elif word.endswith('ers'):
151 |             return word[:-3]
152 |         return None
153 | 
154 | 
--------------------------------------------------------------------------------
/amr_aligner/rule_based_aligner/updater.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | from __future__ import absolute_import
4 | from .stemmer import Stemmer
5 | 
6 | 
7 | class Updater(object):
8 |     def __init__(self):
9 |         pass
10 | 
11 |     def update(self, words, graph, align_results):
12 |         """
13 |         Apply this rule to extend `align_results`; return True if any new alignment was added.
14 |         :param words: list[str]
15 |         :param graph: Alignment
16 |         :param align_results: AlignedResults
17 |         :return:
18 |         """
19 |         raise NotImplementedError
20 | 
21 | 
22 | class EntityTypeUpdater(Updater):
23 |     def __init__(self):
24 |         super(EntityTypeUpdater, self).__init__()
25 | 
26 |     def update(self, words, graph, align_results):
27 |         updated = False
28 |         for node in graph.true_nodes():
29 |             if graph.is_entity(node, consider_alignment=False):
30 |                 # get the :name node
31 |                 edges = filter(lambda e: e.relation == 'name', graph.edges_by_parents[node.level])
32 |                 if
len(edges) > 0: 33 | for start, end in align_results.get_spans_by_level(edges[0].tgt_level): 34 | updated = updated or align_results.add(start, end, node.level, edges[0].tgt_level) 35 | return updated 36 | 37 | 38 | class QuantityUpdater(Updater): 39 | def __init__(self): 40 | super(QuantityUpdater, self).__init__() 41 | 42 | def update(self, words, graph, align_results): 43 | updated = False 44 | for node in graph.true_nodes(): 45 | if not node.name.endswith('-quantity') or node.level not in graph.edges_by_parents: 46 | continue 47 | edges = filter(lambda e: e.relation == 'unit', graph.edges_by_parents[node.level]) 48 | if len(edges) > 0: 49 | for start, end in align_results.get_spans_by_level(edges[0].tgt_level): 50 | updated = updated or align_results.add(start, end, node.level, edges[0].tgt_level) 51 | return updated 52 | 53 | 54 | class PersonOfUpdater(Updater): 55 | def __init__(self): 56 | super(PersonOfUpdater, self).__init__() 57 | 58 | def update(self, words, graph, align_results): 59 | updated = False 60 | for node in graph.true_nodes(): 61 | if node.name not in ('person', 'thing') or node.level not in graph.edges_by_parents: 62 | continue 63 | edges = filter(lambda e: e.relation.endswith('-of'), graph.edges_by_parents[node.level]) 64 | if len(edges) > 0: 65 | for start, end in align_results.get_spans_by_level(edges[0].tgt_level): 66 | updated = updated or align_results.add(start, end, node.level, edges[0].tgt_level) 67 | return updated 68 | 69 | 70 | class PersonUpdater(Updater): 71 | def __init__(self): 72 | super(PersonUpdater, self).__init__() 73 | 74 | def update(self, words, graph, align_results): 75 | updated = False 76 | for node in graph.true_nodes(): 77 | if node.name != 'person' or node.level not in graph.edges_by_parents: 78 | continue 79 | edges = graph.edges_by_parents[node.level] 80 | if len(edges) == 1: 81 | for start, end in align_results.get_spans_by_level(edges[0].tgt_level): 82 | updated = updated or align_results.add(start, end, node.level, edges[0].tgt_level) 83 | return updated 84 | 85 | 86 | class GovernmentOrganizationUpdater(Updater): 87 | def __init__(self): 88 | super(GovernmentOrganizationUpdater, self).__init__() 89 | 90 | def update(self, words, graph, align_results): 91 | updated = False 92 | for edge in graph.edges: 93 | if not edge.relation.endswith('-of') or \ 94 | not edge.relation.startswith('ARG') or \ 95 | edge.src_name != 'government-organization': 96 | continue 97 | for start, end in align_results.get_spans_by_level(edge.tgt_level): 98 | updated = updated or align_results.add(start, end, edge.src_level, edge.tgt_level) 99 | return updated 100 | 101 | 102 | class RelativePositionUpdater(Updater): 103 | def __init__(self): 104 | super(RelativePositionUpdater, self).__init__() 105 | 106 | def update(self, words, graph, align_results): 107 | updated = False 108 | for edge in graph.edges: 109 | if edge.src_name != 'relative-position': 110 | continue 111 | for start, end in align_results.get_spans_by_level(edge.tgt_level): 112 | updated = updated or align_results.add(start, end, edge.src_level, edge.tgt_level) 113 | return updated 114 | 115 | 116 | class MinusPolarityPrefixUpdater(Updater): 117 | def __init__(self): 118 | super(MinusPolarityPrefixUpdater, self).__init__() 119 | 120 | def update(self, words, graph, align_results): 121 | updated = False 122 | for node in graph.true_nodes(): 123 | if node.name != '-': 124 | continue 125 | edges = graph.edges_by_children[node.level] 126 | if len(edges) == 1 and edges[0].relation == 'polarity': 127 
| for start, end in align_results.get_spans_by_level(edges[0].src_level):
128 |                 if start + 1 == end and (words[start][:2] in Stemmer.kMinusPrefix2 or
129 |                                          words[start][:3] in Stemmer.kMinusPrefix3 or
130 |                                          words[start].endswith('less') or
131 |                                          words[start].endswith('nt') or
132 |                                          words[start].endswith('n\'t')):
133 |                     updated = updated or align_results.add(start, end, node.level, edges[0].src_level)
134 |         return updated
135 | 
136 | 
137 | class DegreeUpdater(Updater):
138 |     def __init__(self):
139 |         super(DegreeUpdater, self).__init__()
140 | 
141 |     def update(self, words, graph, align_results):
142 |         updated = False
143 |         for edge in graph.edges:
144 |             if edge.relation != 'degree':
145 |                 continue
146 |             for start, end in align_results.get_spans_by_level(edge.src_level):
147 |                 if start + 1 == end and (words[start].endswith('est') or words[start].endswith('er')):
148 |                     updated = updated or align_results.add(start, end, edge.tgt_level, edge.src_level)
149 |         return updated
150 | 
151 | 
152 | class HaveOrgRoleUpdater(Updater):
153 |     def __init__(self):
154 |         super(HaveOrgRoleUpdater, self).__init__()
155 | 
156 |     def update(self, words, graph, align_results):
157 |         updated = False
158 |         for node in graph.true_nodes():
159 |             if node.name not in ('have-org-role-91', 'have-rel-role-91') or node.level not in graph.edges_by_parents:
160 |                 continue
161 |             edges = [edge for edge in graph.edges_by_parents[node.level] if edge.relation in ('ARG1', 'ARG2')]
162 |             if not edges:
163 |                 continue  # guard: `edge` would otherwise be unbound below
164 |             # prefer the ARG2 child when both ARG1 and ARG2 are present
165 |             edge = edges[1] if (len(edges) == 2 and edges[0].relation != 'ARG2') else edges[0]
166 |             for start, end in align_results.get_spans_by_level(edge.tgt_level):
167 |                 updated = updated or align_results.add(start, end, edge.src_level, edge.tgt_level)
168 |         return updated
169 | 
170 | 
171 | class CauseUpdater(Updater):
172 |     def __init__(self):
173 |         super(CauseUpdater, self).__init__()
174 | 
175 |     def update(self, words, graph, align_results):
176 |         updated = False
177 |         for edge in graph.edges:
178 |             if edge.tgt_name != 'cause-01' or not edge.relation.startswith('ARG') or not edge.relation.endswith('-of'):
179 |                 continue
180 |             for start, end in align_results.get_spans_by_level(edge.src_level):
181 |                 if start + 1 == end:
182 |                     updated = updated or align_results.add(start, end, edge.tgt_level, edge.src_level)
183 |         return updated
184 | 
185 | 
186 | class ImperativeUpdater(Updater):
187 |     def __init__(self):
188 |         super(ImperativeUpdater, self).__init__()
189 | 
190 |     def update(self, words, graph, align_results):
191 |         updated = False
192 |         for edge in graph.edges:
193 |             if edge.tgt_name != 'imperative' or edge.relation != 'mode':
194 |                 continue
195 |             you_level = [e.tgt_level for e in graph.edges_by_parents[edge.src_level] if e.tgt_name == 'you']
196 |             for start, end in align_results.get_spans_by_level(edge.src_level):
197 |                 if start + 1 == end:
198 |                     updated = updated or align_results.add(start, end, edge.tgt_level, edge.src_level)
199 |                     if len(you_level) == 1:
200 |                         updated = updated or align_results.add(start, end, you_level[0], edge.src_level)
201 |         return updated
202 | 
203 | 
204 | class PossibleUpdater(Updater):
205 |     def __init__(self):
206 |         super(PossibleUpdater, self).__init__()
207 | 
208 |     def update(self, words, graph, align_results):
209 |         updated = False
210 |         for edge in graph.edges:
211 |             if edge.src_name == 'possible' and edge.relation == 'domain':
212 |                 # operable => (p / possible :domain (o / operate))
213 |                 for start, end in align_results.get_spans_by_level(edge.tgt_level):
214 |                     if
start + 1 == end and words[start].endswith('ble'): 215 | updated = updated or align_results.add(start, end, edge.src_level, edge.tgt_level) 216 | elif edge.tgt_name == 'possible' and edge.relation == 'mod': 217 | for start, end in align_results.get_spans_by_level(edge.src_level): 218 | if start + 1 == end and words[start].endswith('ble'): 219 | updated = updated or align_results.add(start, end, edge.tgt_level, edge.src_level) 220 | return updated 221 | 222 | 223 | __all__ = [ 224 | 'EntityTypeUpdater', 225 | 'QuantityUpdater', 226 | 'PersonOfUpdater', 227 | 'PersonUpdater', 228 | 'RelativePositionUpdater', 229 | 'MinusPolarityPrefixUpdater', 230 | 'DegreeUpdater', 231 | 'HaveOrgRoleUpdater', 232 | 'GovernmentOrganizationUpdater', 233 | 'CauseUpdater', 234 | 'ImperativeUpdater', 235 | 'PossibleUpdater' 236 | ] -------------------------------------------------------------------------------- /amr_aligner/smatch/README.md: -------------------------------------------------------------------------------- 1 | C++ Implementation of Fast Smatch 2 | ================================= 3 | 4 | We use the *oracle smatch score* 5 | to evaluate each generated alignment, 6 | and we found the original smatch script 7 | greatly slowed down our program. 8 | So we use `Cython` to re-implement the smatch 9 | script. 10 | 11 | ## Compilation 12 | 13 | run `python setup.py build` in the `amr_aligner/smatch` 14 | folder. It will generate a dynamic library `_smatch.so` 15 | under the `build/lib.${arch}-2.7/` folder. 16 | Move the dynamic library into `amr_aligner/smatch` 17 | and it will do the work. 18 | 19 | ## Smatch Version 20 | 21 | 2.0.2 -------------------------------------------------------------------------------- /amr_aligner/smatch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/smatch/__init__.py -------------------------------------------------------------------------------- /amr_aligner/smatch/_gain.cc: -------------------------------------------------------------------------------- 1 | #include "_gain.h" 2 | 3 | int _hash_pair(int x, int y) { 4 | return _HASH_PAIR(x, y); 5 | } 6 | 7 | int _get_0(int x) { 8 | return _GET_0(x); 9 | } 10 | 11 | int _get_1(int x) { 12 | return _GET_1(x); 13 | } 14 | 15 | 16 | int move_gain(MappingType & mapping, 17 | int node_id, 18 | int old_id, 19 | int new_id, 20 | WeightDictType & weight_dict, 21 | int match_num) { 22 | int new_mapping = _HASH_PAIR(node_id, new_id); 23 | int old_mapping = _HASH_PAIR(node_id, old_id); 24 | int saved_id = mapping[node_id]; 25 | 26 | mapping[node_id] = new_id; 27 | int gain = 0; 28 | WeightDictType::const_iterator entry = weight_dict.find(new_mapping); 29 | if (entry != weight_dict.end()) { 30 | for (std::unordered_map::const_iterator key = entry->second.begin(); 31 | key != entry->second.end(); 32 | key ++) { 33 | if (key->first == -1) { 34 | gain += key->second; 35 | } else if (mapping[_GET_0(key->first)] == _GET_1(key->first)) { 36 | gain += key->second; 37 | } 38 | } 39 | } 40 | 41 | mapping[node_id] = saved_id; 42 | entry = weight_dict.find(old_mapping); 43 | if (entry != weight_dict.end()) { 44 | for (std::unordered_map::const_iterator key = entry->second.begin(); 45 | key != entry->second.end(); 46 | key ++) { 47 | if (key->first == -1) { 48 | gain -= key->second; 49 | } else if (mapping[_GET_0(key->first)] == _GET_1(key->first)) { 50 | gain -= key->second; 51 | } 52 | } 53 | } 54 | 
return gain; 55 | } 56 | 57 | int swap_gain(MappingType & mapping, 58 | int node_id1, 59 | int mapping_id1, 60 | int node_id2, 61 | int mapping_id2, 62 | WeightDictType & weight_dict, 63 | int match_num) { 64 | int saved_id1 = mapping[node_id1]; 65 | int saved_id2 = mapping[node_id2]; 66 | int gain = 0; 67 | 68 | int new_mapping1 = _HASH_PAIR(node_id1, mapping_id2); 69 | int new_mapping2 = _HASH_PAIR(node_id2, mapping_id1); 70 | int old_mapping1 = _HASH_PAIR(node_id1, mapping_id1); 71 | int old_mapping2 = _HASH_PAIR(node_id2, mapping_id2); 72 | 73 | if (node_id1 > node_id2) { 74 | std::swap(new_mapping1, new_mapping2); 75 | std::swap(old_mapping1, old_mapping2); 76 | } 77 | 78 | mapping[node_id1] = mapping_id2; 79 | mapping[node_id2] = mapping_id1; 80 | 81 | WeightDictType::const_iterator entry = weight_dict.find(new_mapping1); 82 | if (entry != weight_dict.end()) { 83 | for (std::unordered_map::const_iterator key = entry->second.begin(); 84 | key != entry->second.end(); 85 | key ++) { 86 | if (key->first == -1) { 87 | gain += key->second; 88 | } else if (mapping[_GET_0(key->first)] == _GET_1(key->first)) { 89 | gain += key->second; 90 | } 91 | } 92 | } 93 | 94 | entry = weight_dict.find(new_mapping2); 95 | if (entry != weight_dict.end()) { 96 | for (std::unordered_map::const_iterator key = entry->second.begin(); 97 | key != entry->second.end(); 98 | key ++) { 99 | if (key->first == -1) { 100 | gain += key->second; 101 | continue; 102 | } 103 | int first = _GET_0(key->first); 104 | if (first != node_id1 && mapping[first] == _GET_1(key->first)) { 105 | gain += key->second; 106 | } 107 | } 108 | } 109 | 110 | mapping[node_id1] = saved_id1; 111 | mapping[node_id2] = saved_id2; 112 | 113 | entry = weight_dict.find(old_mapping1); 114 | if (entry != weight_dict.end()) { 115 | for (std::unordered_map::const_iterator key = entry->second.begin(); 116 | key != entry->second.end(); 117 | key ++) { 118 | if (key->first == -1) { 119 | gain -= key->second; 120 | } else if (mapping[_GET_0(key->first)] == _GET_1(key->first)) { 121 | gain -= key->second; 122 | } 123 | } 124 | } 125 | 126 | entry = weight_dict.find(old_mapping2); 127 | if (entry != weight_dict.end()) { 128 | for (std::unordered_map::const_iterator key = entry->second.begin(); 129 | key != entry->second.end(); 130 | key ++) { 131 | if (key->first == -1) { 132 | gain -= key->second; 133 | continue; 134 | } 135 | int first = _GET_0(key->first); 136 | if (first != node_id1 && mapping[first] == _GET_1(key->first)) { 137 | gain -= key->second; 138 | } 139 | } 140 | } 141 | 142 | return gain; 143 | } 144 | -------------------------------------------------------------------------------- /amr_aligner/smatch/_gain.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define _HASH_PAIR(x, y) (((x) << 14) | (y)) 6 | #define _GET_0(x) ((x) >> 14) 7 | #define _GET_1(x) ((x) & ((1 << 14) - 1)) 8 | 9 | typedef std::unordered_map > WeightDictType; 10 | typedef std::vector MappingType; 11 | 12 | int _hash_pair(int x, int y); 13 | 14 | int _get_0(int x); 15 | 16 | int _get_1(int x); 17 | 18 | int move_gain(MappingType & mapping, 19 | int node_id, 20 | int old_id, 21 | int new_id, 22 | WeightDictType & weight_dict, 23 | int match_num); 24 | 25 | int swap_gain(MappingType & mapping, 26 | int node_id1, 27 | int mapping_id1, 28 | int node_id2, 29 | int mapping_id2, 30 | WeightDictType & weight_dict, 31 | int match_num); 32 | 33 | 
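// NOTE (editorial, illustrative): _HASH_PAIR packs a (node, candidate) pair
// into one int by shifting the first id 14 bits to the left, so it is only
// safe while both ids stay below 2^14 = 16384. A worked example:
//   _HASH_PAIR(3, 5) == (3 << 14) | 5 == 49157
//   _GET_0(49157) == 3, _GET_1(49157) == 5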
-------------------------------------------------------------------------------- /amr_aligner/smatch/api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from amr import AMR 4 | try: 5 | from _smatch import get_best_match, compute_f, clear_match_triple_dict 6 | except: 7 | import sys 8 | print('WARN: use slow version of smatch api.', file=sys.stderr) 9 | from smatch import get_best_match, compute_f, clear_match_triple_dict 10 | 11 | 12 | def _smatch(cur_amr1, cur_amr2, n_iter): 13 | clear_match_triple_dict() 14 | 15 | amr1 = AMR.parse_AMR_line(cur_amr1) 16 | amr2 = AMR.parse_AMR_line(cur_amr2) 17 | prefix1 = "a" 18 | prefix2 = "b" 19 | 20 | amr1.rename_node(prefix1) 21 | amr2.rename_node(prefix2) 22 | instance1, attributes1, relation1 = amr1.get_triples() 23 | instance2, attributes2, relation2 = amr2.get_triples() 24 | 25 | best_mapping, best_match_num = get_best_match(instance1, attributes1, relation1, 26 | instance2, attributes2, relation2, 27 | prefix1, prefix2) 28 | 29 | test_triple_num = len(instance1) + len(attributes1) + len(relation1) 30 | gold_triple_num = len(instance2) + len(attributes2) + len(relation2) 31 | return best_match_num, test_triple_num, gold_triple_num 32 | 33 | 34 | def smatch(cur_amr1, cur_amr2, n_iter=5): 35 | best_match_num, test_triple_num, gold_triple_num = _smatch(cur_amr1, cur_amr2, n_iter) 36 | precision, recall, best_f_score = compute_f(best_match_num, test_triple_num, gold_triple_num) 37 | return best_f_score 38 | 39 | 40 | class SmatchScorer(object): 41 | def __init__(self, n_iter=5): 42 | self.total_match_num = 0 43 | self.total_test_num = 0 44 | self.total_gold_num = 0 45 | self.last_match_num = 0 46 | self.last_test_num = 0 47 | self.last_gold_num = 0 48 | self.n_iter = n_iter 49 | 50 | def update(self, cur_amr1, cur_amr2): 51 | best_match_num, test_triple_num, gold_triple_num = _smatch(cur_amr1, cur_amr2, self.n_iter) 52 | self.last_match_num = best_match_num 53 | self.last_test_num = test_triple_num 54 | self.last_gold_num = gold_triple_num 55 | 56 | self.total_match_num += best_match_num 57 | self.total_test_num += test_triple_num 58 | self.total_gold_num += gold_triple_num 59 | 60 | def f_score(self): 61 | return compute_f(self.total_match_num, self.total_test_num, self.total_gold_num)[2] 62 | 63 | def last_f_score(self): 64 | return compute_f(self.last_match_num, self.last_test_num, self.last_gold_num)[2] 65 | 66 | def reset(self): 67 | self.total_match_num = 0 68 | self.total_test_num = 0 69 | self.total_gold_num = 0 70 | -------------------------------------------------------------------------------- /amr_aligner/smatch/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from distutils.core import setup, Extension 3 | from Cython.Build import cythonize 4 | 5 | setup(ext_modules=cythonize(Extension("_smatch", sources=["_smatch.pyx", "_gain.cc"], language="c++"))) 6 | -------------------------------------------------------------------------------- /amr_aligner/system/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/system/__init__.py -------------------------------------------------------------------------------- /amr_aligner/system/eager/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/amr_aligner/system/eager/__init__.py -------------------------------------------------------------------------------- /amr_aligner/system/edge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | from system.node import TokenNode, AttributeNode 4 | 5 | 6 | class Edge(object): 7 | def __init__(self, source_node, relation, target_node): 8 | self.source_node = source_node 9 | self.relation = relation 10 | self.target_node = target_node 11 | 12 | 13 | class EdgeSet(set): 14 | def __init__(self, top): 15 | super(EdgeSet, self).__init__() 16 | self.top = top 17 | 18 | def _traverse_print(self, root, variables, shown, in_const_edge): 19 | children = [] 20 | for edge in self.__iter__(): 21 | if edge.source_node == root: 22 | children.append((edge.relation, edge.target_node)) 23 | children.sort(key=lambda x: (x[0], x[1].name)) 24 | 25 | if len(children) == 0: 26 | if not shown[root]: 27 | shown[root] = True 28 | if isinstance(root, TokenNode): 29 | ret = '"{0}"'.format(self._normalize_entity_token(root.name)) 30 | elif isinstance(root, AttributeNode) or (in_const_edge and self._is_attribute(root.name)): 31 | ret = root.name 32 | else: 33 | ret = '({0} / {1})'.format(variables[root], root.name) 34 | else: 35 | ret = variables[root] 36 | else: 37 | if shown[root]: 38 | ret = '{0}'.format(variables[root]) 39 | else: 40 | shown[root] = True 41 | unnamed_concept = in_const_edge and self._is_attribute(root.name) 42 | if unnamed_concept: 43 | ret = root.name 44 | else: 45 | ret = '({0} / {1}'.format(variables[root], root.name) 46 | for relation, child in children: 47 | ret += ' :{0} {1}'.format(relation, self._traverse_print(child, variables, shown, 48 | self._is_const_relation(relation))) 49 | if not unnamed_concept: 50 | ret += ')' 51 | return ret 52 | 53 | def _get_size(self, root, visited, covered=set()): 54 | if root in visited or root in covered: 55 | return 1 56 | visited.add(root) 57 | tree_size = 0 58 | children = [] 59 | for edge in self.__iter__(): 60 | if edge.source_node == root: 61 | children.append(edge.target_node) 62 | for child in children: 63 | tree_size += self._get_size(child, visited, covered) + 1 64 | return tree_size + 1 65 | 66 | def _print(self): 67 | roots = self._get_roots() 68 | variables = self._get_variables 69 | shown = {node: False for node in variables} 70 | if len(roots) == 1: 71 | return self._traverse_print(roots[0], variables, shown, False) 72 | elif len(roots) > 1: 73 | # return self._traverse_print(self.top, variables, shown) 74 | new_root = roots[0] 75 | for i, root in enumerate(roots[1:]): 76 | self.add(Edge(new_root, 'TOP{0}'.format(i), root)) 77 | return self._traverse_print(new_root, variables, shown, False) 78 | else: 79 | return '(a / amr-empty)' 80 | 81 | def __str__(self): 82 | return self._print() 83 | 84 | def _get_roots(self): 85 | covered = set() 86 | for node in self._get_variables: 87 | if node.name == '_ROOT_': 88 | covered.add(node) 89 | 90 | ret = [] 91 | for edge in self.__iter__(): 92 | if edge.source_node == self.top: 93 | ret.append(edge.target_node) 94 | self._get_size(edge.target_node, covered) 95 | 96 | while True: 97 | max_sz = 0 98 | max_node = None 99 | for node in self._get_variables: 100 | if node not in covered: 101 | visited = set() 102 | sz = self._get_size(node, visited, covered) 103 | if sz > max_sz: 104 | max_node = node 105 | max_sz = sz 
106 |             if max_node is None:
107 |                 break
108 |             ret.append(max_node)
109 |             # mark the whole subtree under max_node as covered before searching for the next root
110 |             self._get_size(max_node, covered)
111 | 
112 |         assert len(covered) == len(self._get_variables)
113 |         return ret
114 | 
115 |     @property
116 |     def _get_variables(self):
117 |         nodes = set()
118 |         for edge in self.__iter__():
119 |             nodes.add(edge.source_node)
120 |             nodes.add(edge.target_node)
121 |         nodes = list(nodes)
122 |         nodes.sort(key=lambda x: x.name)
123 | 
124 |         variables = {}
125 |         variable_name_counts = {}
126 |         for node in nodes:
127 |             shortname = self._shortname(node.name)
128 |             if shortname not in variable_name_counts:
129 |                 variable_name_counts[shortname] = 0
130 |             variable_name_counts[shortname] += 1
131 |             count = variable_name_counts[shortname]
132 |             variables[node] = shortname if count == 1 else (shortname + str(count))
133 |         return variables
134 | 
135 |     @staticmethod
136 |     def _normalize_entity_token(token):
137 |         if token == '"':
138 |             return '_QUOTE_'
139 |         return token
140 | 
141 |     @staticmethod
142 |     def _shortname(token):
143 |         if token[0] == '"':
144 |             return token[1] if len(token) > 1 else 'q'
145 |         return token[0]
146 | 
147 |     @staticmethod
148 |     def _is_attribute(name):
149 |         if name in ('-', 'imperative'):  # polarity
150 |             return True
151 |         return name.isdigit()
152 | 
153 |     @staticmethod
154 |     def _is_const_relation(relation):
155 |         if relation.startswith('op') or \
156 |                 relation in ('month', 'decade', 'polarity', 'day', 'quarter', 'year', 'era', 'century',
157 |                              'timezone', 'polite', 'mode', 'value', 'quant', 'unit', 'range', 'scale'):
158 |             return True
159 |         return False
160 | 
--------------------------------------------------------------------------------
/amr_aligner/system/misc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import unicode_literals
4 | import sys
5 | from datetime import datetime
6 | _DATE_FORMATS = {
7 |     '%y0000': (True, False, False),
8 |     '%y%m00': (True, True, False),
9 |     '%y%m%d': (True, True, True),
10 |     '%Y0000': (True, False, False),
11 |     '%Y%m00': (True, True, False),
12 |     '%d %B %Y': (True, True, True),
13 |     '%d %B': (True, True, False),
14 |     '%d %Y': (True, False, True),
15 |     '%Y%m%d': (True, True, True),
16 |     '%Y-%m-%d': (True, True, True),
17 |     '%m/%d': (False, True, True),
18 |     '%m/%d/%Y': (True, True, True),
19 |     '%m - %d - %Y': (True, True, True),
20 |     '%B %Y': (True, True, False),
21 |     '%B , %Y': (True, True, False),
22 |     '%B %d %Y': (True, True, True),
23 |     '%B %d , %Y': (True, True, True),
24 |     '%B %d': (False, True, True),
25 |     '%B %dst': (False, True, True),
26 |     '%B %dnd': (False, True, True),
27 |     '%B %drd': (False, True, True),
28 |     '%B %dth': (False, True, True),
29 |     '%B': (False, True, False),
30 |     '%Y': (True, False, False),
31 |     '%y': (True, False, False),
32 | }
33 | 
34 | 
35 | def parse_date(expression):
36 |     results = []
37 |     for format_ in _DATE_FORMATS:
38 |         try:
39 |             result = datetime.strptime(expression, format_)
40 |             results.append((result, _DATE_FORMATS[format_]))
41 |         except ValueError:
42 |             continue
43 |     results = list(filter(lambda result: 1900 <= result[0].year < 2100, results))
44 |     if len(results) > 1:  # more than one format matched; arbitrarily keep the first
45 |         return results[0]
46 |     elif len(results) == 1:
47 |         return results[0]
48 |     else:
49 |         return None, (False, False, False)
50 | 
51 | 
52 | def parse_all_dates(expression):
53 |     results = []
54 |     for format_ in _DATE_FORMATS:
55 |         try:
56 |             result = datetime.strptime(expression, format_)
57 |             results.append((result, _DATE_FORMATS[format_]))
58 |         except ValueError:
59 |             continue
60 |     results = list(filter(lambda r: 1900 <= r[0].year < 2100, results))
61 |     return results
62 | 
63 | 
64 | def test():
65 |     for line in open(sys.argv[1], 'r'):
66 |         expression, fields = line.strip().split('|||')
67 |         expression = expression.strip()
68 |         result = parse_date(expression)
69 |         slots = result[1]
70 |         for field in fields.split():
71 |             if field == 'year':
72 |                 assert slots[0]
73 |             if field == 'month':
74 |                 assert slots[1]
75 |             if field == 'day':
76 |                 assert slots[2]
77 |         print('{0} ||| {1} ||| {2}'.format(expression, slots, fields), file=sys.stderr)
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     test()
82 | 
--------------------------------------------------------------------------------
/amr_aligner/system/node.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | 
4 | 
5 | class Node(object):
6 |     def __init__(self, name, type_name, coverage):
7 |         self.name = name
8 |         self.type_name = type_name
9 |         self.coverage = coverage
10 | 
11 |     def get_coverage(self):
12 |         return self.coverage
13 | 
14 |     def get_name(self):
15 |         return self.name
16 | 
17 |     def get_type(self):
18 |         return self.type_name
19 | 
20 | 
21 | class TokenNode(Node):
22 |     def __init__(self, name, coverage):
23 |         super(TokenNode, self).__init__(name, 'token', coverage)
24 | 
25 |     def __str__(self):
26 |         return '"{0}"'.format(self.name)
27 | 
28 | 
29 | class EntityNode(Node):
30 |     def __init__(self, node1, node2):
31 |         self.nodes = [node1, node2]
32 |         coverage = node1.get_coverage() + node2.get_coverage()
33 |         name = '_'.join([node1.name, node2.name])
34 |         super(EntityNode, self).__init__(name, 'entity', coverage)
35 | 
36 |     def add(self, node):
37 |         self.nodes.append(node)
38 |         self.coverage = self.coverage + node.get_coverage()
39 |         self.name = self.name + '_{0}'.format(node.name)
40 | 
41 |     def __str__(self):
42 |         return '"{0}"'.format(self.name)
43 | 
44 | 
45 | class ConceptNode(Node):
46 |     def __init__(self, name, coverage, level=None):
47 |         super(ConceptNode, self).__init__(name, 'concept', coverage)
48 |         self.level = level
49 | 
50 |     def get_level(self):
51 |         return self.level
52 | 
53 |     def __str__(self):
54 |         return self.name
55 | 
56 | 
57 | class AttributeNode(Node):
58 |     def __init__(self, value):
59 |         super(AttributeNode, self).__init__(value, 'attribute', None)
60 | 
61 |     def __str__(self):
62 |         return '={0}'.format(self.name)
63 | 
64 | 
65 | def coverage_match_alignment(coverage, align):
66 |     assert isinstance(coverage, list)
67 |     if len(coverage) == 1:
68 |         return align[0] == coverage[0] and align[1] == align[0] + 1
69 |     else:
70 |         return align[0] == coverage[0] and align[1] == coverage[-1] + 1
71 | 
--------------------------------------------------------------------------------
/amr_parser/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | project(amr_parser)
2 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
3 | 
4 | set(CMAKE_MACOSX_RPATH 1)
5 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
6 | set (EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin)
7 | 
8 | # DYNET uses Eigen which exploits modern CPU architectures. To get the
9 | # best possible performance, the following are recommended:
10 | # 1. use very recent versions of gcc or Clang to build
11 | # 2. use very recent versions of Eigen (ideally the dev version)
12 | # 3. 
try compiler options like -march=native or other architecture 13 | # flags (the compiler does not always make the best configuration 14 | # decisions without help) 15 | 16 | function(find_mkl) 17 | set(MKL_ARCH intel64) 18 | find_path(MKL_INCLUDE_DIR mkl.h 19 | PATHS ${MKL_ROOT} ${MKL_ROOT}/include) 20 | find_library(MKL_CORE_LIB NAMES mkl_intel_lp64 mkl_intel_thread mkl_core 21 | PATHS ${MKL_ROOT} ${MKL_ROOT}/lib/${MKL_ARCH} 22 | DOC "MKL core library path") 23 | 24 | find_library(MKL_COMPILER_LIB NAMES iomp5 libiomp5md 25 | PATHS ${MKL_ROOT} ${MKL_ROOT}/../compiler/lib/${MKL_ARCH} #Windows 26 | ${MKL_ROOT}/../compilers_and_libraries/linux/lib/${MKL_ARCH}_lin #Linux 27 | DOC "MKL compiler lib (for threaded MKL)") 28 | 29 | if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_COMPILER_LIB) 30 | get_filename_component(MKL_CORE_LIB_DIR ${MKL_CORE_LIB} DIRECTORY) 31 | get_filename_component(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB} DIRECTORY) 32 | get_filename_component(MKL_COMPILER_LIB_FILE ${MKL_COMPILER_LIB} NAME) 33 | message(STATUS "Found MKL\n * include: ${MKL_INCLUDE_DIR},\n * core library dir: ${MKL_CORE_LIB_DIR},\n * compiler library: ${MKL_COMPILER_LIB}") 34 | 35 | # Due to a conflict with /MT and /MD, MSVC needs mkl_intel_lp64 linked last, or we can change individual 36 | # projects to use /MT (mkl_intel_lp64 linked with /MT, default MSVC projects use /MD), or we can instead 37 | # link to the DLL versions. For now I'm opting for this solution which seems to work with projects still 38 | # at their default /MD. Linux build requires the mkl_intel_lp64 to be linked first. So...: 39 | if(MSVC) 40 | set(LIBS ${LIBS} mkl_intel_thread mkl_core mkl_intel_lp64 ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE) 41 | else() 42 | set(LIBS ${LIBS} mkl_intel_lp64 mkl_intel_thread mkl_core ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE) 43 | endif() 44 | include_directories(${MKL_INCLUDE_DIR}) 45 | link_directories(${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR}) 46 | set(MKL_LINK_DIRS ${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR} PARENT_SCOPE) # Keeping this for python build 47 | else() 48 | message(FATAL_ERROR "Failed to find MKL in path: ${MKL_ROOT} (Did you set MKL_ROOT properly?)") 49 | endif() 50 | endfunction() 51 | 52 | ######## Cross-compiler, cross-platform options 53 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_FAST_MATH") 54 | if (MKL OR MKL_ROOT) 55 | find_mkl() # sets include/lib directories and sets ${LIBS} needed for linking 56 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_USE_MKL_ALL") 57 | endif() 58 | 59 | 60 | ######## Platform-specific options 61 | if(WIN32) 62 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX") # Disable min/max macros in windef.h 63 | endif() 64 | 65 | ######## Compiler-specific options 66 | if(MSVC) 67 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP") # -Wall produces 20k warnings 68 | else() 69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -Wno-missing-braces -std=c++11 -Ofast -g -march=native") 70 | endif() 71 | 72 | enable_testing() 73 | 74 | function(find_cudnn) 75 | set(CUDNN_ROOT "" CACHE PATH "CUDNN root path") 76 | find_path(CUDNN_INCLUDE_DIRS cudnn.h 77 | PATHS ${CUDNN_ROOT} 78 | ${CUDNN_ROOT}/include 79 | DOC "CUDNN include path") 80 | find_library(CUDNN_LIBRARIES NAMES libcudnn.so 81 | PATHS ${CUDNN_ROOT} 82 | ${CUDNN_ROOT}/lib 83 | ${CUDNN_ROOT}/lib64 84 | DOC "CUDNN library path") 85 | if(CUDNN_INCLUDE_DIRS AND CUDNN_LIBRARIES) 86 | set(CUDNN_FOUND TRUE PARENT_SCOPE) 87 | message(STATUS "Found CUDNN (include: 
${CUDNN_INCLUDE_DIRS}, library: ${CUDNN_LIBRARIES})") 88 | mark_as_advanced(CUDNN_INCLUDE_DIRS CUDNN_LIBRARIES) 89 | else() 90 | MESSAGE(FATAL_ERROR "Failed to find CUDNN in path: ${CUDNN_ROOT} (Did you set CUDNN_ROOT properly?)") 91 | endif() 92 | endfunction() 93 | 94 | # look for Boost 95 | if(DEFINED ENV{BOOST_ROOT}) 96 | set(Boost_NO_SYSTEM_PATHS ON) 97 | get_filename_component(Boost_INCLUDE_DIR "${Boost_INCLUDE_DIR}" REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 98 | endif() 99 | set(Boost_REALPATH ON) 100 | message("-- Boost dir is " ${Boost_INCLUDE_DIR}) 101 | if (MSVC) 102 | find_package(Boost COMPONENTS program_options regex serialization REQUIRED) 103 | else() 104 | add_definitions (-DBOOST_LOG_DYN_LINK) 105 | find_package(Boost COMPONENTS program_options regex serialization log_setup log thread system REQUIRED) 106 | endif() 107 | include_directories(${Boost_INCLUDE_DIR}) 108 | if(MSVC) 109 | # Boost does auto-linking when using a compiler like Microsoft Visual C++, we just need to help it find the libraries 110 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LIBPATH:${Boost_LIBRARY_DIRS}") 111 | set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /LIBPATH:${Boost_LIBRARY_DIRS}") 112 | else() 113 | set(LIBS ${LIBS} ${Boost_LIBRARIES}) 114 | endif() 115 | # trouble shooting: 116 | # if boost library cannot be found, in addition to install boost library 117 | # check if environment variables are set 118 | # 119 | # to set boost root and its library root in environment variable, use 120 | # for example 121 | # echo "export BOOST_LIBRARYDIR=/usr/local/lib" >> ~/.bashrc 122 | # echo "export BOOST_ROOT=/cygdrive/d/tools/boost_1_58_0/boost_1_58_0" >> ~/.bashrc 123 | # then run source ~/.bashrc to have those environment variable effective immediately 124 | if (NOT DEFINED DYNET_DEBUG_LEVEL) 125 | set(DYNET_DEBUG_LEVEL 1) 126 | endif() 127 | add_definitions(-DDYNET_DEBUG_LEVEL=${DYNET_DEBUG_LEVEL}) 128 | 129 | if(BACKEND) 130 | message("-- BACKEND: ${BACKEND}") 131 | else() 132 | message("-- BACKEND not specified, defaulting to eigen.") 133 | set(BACKEND "eigen") 134 | endif() 135 | 136 | if(BACKEND MATCHES "^eigen$") 137 | set(WITH_EIGEN_BACKEND 1) 138 | elseif(BACKEND MATCHES "^cuda$") 139 | set(WITH_CUDA_BACKEND 1) 140 | else() 141 | message(SEND_ERROR "BACKEND must be eigen or cuda") 142 | endif() 143 | 144 | if (WITH_CUDA_BACKEND) 145 | find_package(CUDA REQUIRED) 146 | set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT}) 147 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 148 | #list(APPEND CUDA_LIBRARIES /usr/lib64/libpthread.so) 149 | MESSAGE("CUDA_LIBRARIES: ${CUDA_LIBRARIES}") 150 | list(REMOVE_ITEM CUDA_LIBRARIES -lpthread) 151 | set(LIBS ${LIBS} ${CUDA_LIBRARIES}) 152 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_HAS_CUDA_FP16 -DEIGEN_USE_GPU") 153 | #find_cudnn() 154 | #include_directories(SYSTEM ${CUDNN_INCLUDE_DIRS}) 155 | endif() 156 | 157 | # look for Eigen 158 | find_package(Eigen3 REQUIRED) 159 | get_filename_component(EIGEN3_INCLUDE_DIR "${EIGEN3_INCLUDE_DIR}" REALPATH BASE_DIR "${CMAKE_BINARY_DIR}") 160 | message("-- Eigen dir is " ${EIGEN3_INCLUDE_DIR}) 161 | include_directories(${EIGEN3_INCLUDE_DIR}) 162 | 163 | FIND_PACKAGE(Threads REQUIRED) 164 | set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) 165 | 166 | #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) 167 | #set(source_directory ${PROJECT_SOURCE_DIR}/src) 168 | 169 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dynet) 170 | 
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dynet_layer) 171 | 172 | add_subdirectory(dynet/dynet) 173 | set (DYNET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}}/dynet) 174 | add_subdirectory(dynet_layer/dynet_layer) 175 | add_subdirectory(src) 176 | -------------------------------------------------------------------------------- /amr_parser/cmake/FindEigen3.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find Eigen3 lib 2 | # 3 | # This module supports requiring a minimum version, e.g. you can do 4 | # find_package(Eigen3 3.1.2) 5 | # to require version 3.1.2 or newer of Eigen3. 6 | # 7 | # Once done this will define 8 | # 9 | # EIGEN3_FOUND - system has eigen lib with correct version 10 | # EIGEN3_INCLUDE_DIR - the eigen include directory 11 | # EIGEN3_VERSION - eigen version 12 | 13 | # Copyright (c) 2006, 2007 Montel Laurent, 14 | # Copyright (c) 2008, 2009 Gael Guennebaud, 15 | # Copyright (c) 2009 Benoit Jacob 16 | # Redistribution and use is allowed according to the terms of the 2-clause BSD license. 17 | 18 | if(NOT Eigen3_FIND_VERSION) 19 | if(NOT Eigen3_FIND_VERSION_MAJOR) 20 | set(Eigen3_FIND_VERSION_MAJOR 2) 21 | endif(NOT Eigen3_FIND_VERSION_MAJOR) 22 | if(NOT Eigen3_FIND_VERSION_MINOR) 23 | set(Eigen3_FIND_VERSION_MINOR 91) 24 | endif(NOT Eigen3_FIND_VERSION_MINOR) 25 | if(NOT Eigen3_FIND_VERSION_PATCH) 26 | set(Eigen3_FIND_VERSION_PATCH 0) 27 | endif(NOT Eigen3_FIND_VERSION_PATCH) 28 | 29 | set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}") 30 | endif(NOT Eigen3_FIND_VERSION) 31 | 32 | macro(_eigen3_check_version) 33 | file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header) 34 | 35 | string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}") 36 | set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}") 37 | string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}") 38 | set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}") 39 | string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}") 40 | set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}") 41 | 42 | set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION}) 43 | if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) 44 | set(EIGEN3_VERSION_OK FALSE) 45 | else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) 46 | set(EIGEN3_VERSION_OK TRUE) 47 | endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) 48 | 49 | if(NOT EIGEN3_VERSION_OK) 50 | 51 | message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, " 52 | "but at least version ${Eigen3_FIND_VERSION} is required") 53 | endif(NOT EIGEN3_VERSION_OK) 54 | endmacro(_eigen3_check_version) 55 | 56 | if (EIGEN3_INCLUDE_DIR) 57 | 58 | # in cache already 59 | _eigen3_check_version() 60 | set(EIGEN3_FOUND ${EIGEN3_VERSION_OK}) 61 | 62 | else (EIGEN3_INCLUDE_DIR) 63 | 64 | find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library 65 | PATHS 66 | ${CMAKE_INSTALL_PREFIX}/include 67 | ${KDE4_INCLUDE_DIR} 68 | PATH_SUFFIXES eigen3 eigen 69 | ) 70 | 71 | if(EIGEN3_INCLUDE_DIR) 72 | _eigen3_check_version() 73 | endif(EIGEN3_INCLUDE_DIR) 74 | 75 | include(FindPackageHandleStandardArgs) 76 | find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK) 77 | 78 | 
mark_as_advanced(EIGEN3_INCLUDE_DIR)
79 | 
80 | endif(EIGEN3_INCLUDE_DIR)
81 | 
82 | 
--------------------------------------------------------------------------------
/amr_parser/scripts/eval_eager.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Evaluation for the eager system
4 | #
5 | # It converts the predicted actions into an AMR graph and calls smatch
6 | # to evaluate its score
7 | #
8 | # Usage:
9 | #
10 | # bash eval_eager.sh predict-action gold-AMR
11 | #
12 | BASEDIR=$(dirname "$0")/../../amr_aligner
13 | python ${BASEDIR}/eager_actions_evaluator.py -pred_actions $1 -gold $2
14 | 
--------------------------------------------------------------------------------
/amr_parser/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library (common
2 |   corpus.cc
3 |   corpus.h
4 |   ds.cc
5 |   ds.h
6 |   logging.cc
7 |   logging.h
8 |   math_utils.cc
9 |   math_utils.h
10 |   sys_utils.cc
11 |   sys_utils.h
12 |   trainer_utils.cc
13 |   trainer_utils.h
14 |   lstm.h
15 |   lstm.cc)
16 | 
17 | target_link_libraries (common ${Boost_LIBRARIES})
18 | add_subdirectory (left_to_right)
--------------------------------------------------------------------------------
/amr_parser/src/corpus.cc:
--------------------------------------------------------------------------------
1 | #include "corpus.h"
2 | #include 
3 | #include 
4 | #include 
5 | #include 
6 | #include "logging.h"
7 | #include 
8 | #include 
9 | #include 
10 | 
11 | const char* Corpus::UNK = "_UNK_";
12 | const char* Corpus::SPAN = "_SPAN_";
13 | const char* Corpus::BAD0 = "_BAD0_";
14 | const char* Corpus::ROOT = "_ROOT_";
15 | 
16 | Corpus::Corpus() : n_train(0), n_devel(0), n_test(0) {
17 | 
18 | }
19 | 
20 | void Corpus::load_training_data(const std::string& filename) {
21 |   _INFO << "Corpus:: reading training data from: " << filename;
22 | 
23 |   word_map.insert(Corpus::ROOT);
24 |   word_map.insert(Corpus::UNK);
25 |   // word_map.insert(Corpus::SPAN);
26 |   pos_map.insert(Corpus::ROOT);
27 |   pos_map.insert(Corpus::UNK);
28 |   char_map.insert(Corpus::UNK);
29 |   action_map.insert("CONFIRM");
30 |   action_map.insert(Corpus::UNK);
31 | 
32 |   confirm_map[word_map.get(Corpus::UNK)] = Alphabet();
33 |   confirm_map[word_map.get(Corpus::UNK)].insert(Corpus::UNK);
34 | 
35 |   std::ifstream in(filename);
36 |   BOOST_ASSERT_MSG(in, "Corpus:: failed to open the training file.");
37 | 
38 |   n_train = 0;
39 |   std::string data;
40 |   std::string line;
41 |   while (std::getline(in, line)) {
42 |     boost::algorithm::trim(line);
43 |     if (line.empty()) {
44 |       parse_data(data, training_inputs[n_train], training_actions[n_train], true);
45 |       data = "";
46 |       ++n_train;
47 |     } else {
48 |       data += (line + "\n");
49 |     }
50 |   }
51 |   if (!data.empty()) {
52 |     parse_data(data, training_inputs[n_train], training_actions[n_train], true);
53 |     ++n_train;
54 |   }
55 |   _INFO << "Corpus:: loaded " << n_train << " training sentences.";
56 | }
57 | 
58 | void Corpus::load_devel_data(const std::string& filename) {
59 |   _INFO << "Corpus:: reading development data from: " << filename;
60 |   BOOST_ASSERT_MSG(word_map.size() > 1,
61 |                    "Corpus:: ROOT and UNK should be inserted before loading devel data.");
62 | 
63 |   std::ifstream in(filename);
64 |   BOOST_ASSERT_MSG(in, "Corpus:: failed to open the devel file.");
65 | 
66 |   n_devel = 0;
67 |   std::string data;
68 |   std::string line;
69 |   while (std::getline(in, line)) {
70 |     boost::algorithm::trim(line);
71 |     if (line.empty()) {
72 |       parse_data(data, devel_inputs[n_devel], devel_actions[n_devel], false);
73 |       data = "";
74 |       ++n_devel;
75 |     } else {
76 |       data += (line + "\n");
77 |     }
78 |   }
79 |   if (!data.empty()) {
80 |     parse_data(data, devel_inputs[n_devel], devel_actions[n_devel], false);
81 |     ++n_devel;
82 |   }
83 |   _INFO << "Corpus:: loaded " << n_devel << " development sentences.";
84 | }
85 | 
86 | void Corpus::load_test_data(const std::string & filename) {
87 |   _INFO << "Corpus:: reading test data from: " << filename;
88 |   BOOST_ASSERT_MSG(word_map.size() > 1,
89 |                    "Corpus:: ROOT and UNK should be inserted before loading test data.");
90 | 
91 |   std::ifstream in(filename);
92 |   BOOST_ASSERT_MSG(in, "Corpus:: failed to open the test file.");
93 | 
94 |   n_test = 0;
95 |   std::string data;
96 |   std::string line;
97 |   while (std::getline(in, line)) {
98 |     boost::algorithm::trim(line);
99 |     if (line.empty()) {
100 |       parse_data(data, test_inputs[n_test], test_actions[n_test], false);
101 |       data = "";
102 |       ++n_test;
103 |     } else {
104 |       data += (line + "\n");
105 |     }
106 |   }
107 |   if (!data.empty()) {
108 |     parse_data(data, test_inputs[n_test], test_actions[n_test], false);
109 |     ++n_test;
110 |   }
111 |   _INFO << "Corpus:: loaded " << n_test << " test sentences.";
112 | }
113 | 
114 | void Corpus::parse_data(const std::string& data,
115 |                         InputUnits& input_units,
116 |                         ActionUnits& action_units,
117 |                         bool train) {
118 |   std::stringstream S(data);
119 |   std::string line;
120 | 
121 |   input_units.clear();
122 |   action_units.clear();
123 | 
124 |   while (std::getline(S, line)) {
125 |     std::vector<std::string> tokens;
126 |     boost::algorithm::trim(line);
127 |     boost::algorithm::split(tokens, line, boost::is_any_of(" \t"), boost::token_compress_on);
128 | 
129 |     if (tokens[1] == "::tok") {
130 |       for (int i = 2; i < tokens.size(); i++) {
131 |         input_units.push_back(InputUnit());
132 |         if (train) {
133 |           unsigned wid = word_map.insert(tokens[i]);
134 |           input_units[i - 2].wid = wid;
135 |           input_units[i - 2].aux_wid = wid;
136 |           input_units[i - 2].w_str = tokens[i];
137 |           for (int j = 0; j < tokens[i].size(); ++j) {
138 |             unsigned c_id = char_map.insert(std::string(1, tokens[i][j]));
139 |             input_units[i - 2].c_id.push_back(c_id);
140 |           }
141 |         } else {
142 |           unsigned wid = (word_map.contains(tokens[i])) ? word_map.get(tokens[i]) : word_map.get(UNK);
143 |           input_units[i - 2].wid = wid;
144 |           input_units[i - 2].aux_wid = wid;
145 |           input_units[i - 2].w_str = tokens[i];
146 |           for (int j = 0; j < tokens[i].size(); ++j) {
147 |             unsigned c_id = (char_map.contains(std::string(1, tokens[i][j]))) ? char_map.get(std::string(1, tokens[i][j])) : char_map.get(UNK);
148 |             input_units[i - 2].c_id.push_back(c_id);
149 |           }
150 |         }
151 |       }
152 |     } else if (tokens[1] == "::pos") {
153 |       for (int i = 2; i < tokens.size(); i++) {
154 |         if (train) {
155 |           unsigned pid = pos_map.insert(tokens[i]);
156 |           input_units[i - 2].pid = pid;
157 |         } else {
158 |           unsigned pid = (pos_map.contains(tokens[i])) ? 
pos_map.get(tokens[i]) : pos_map.get(UNK); 159 | input_units[i - 2].pid = pid; 160 | } 161 | } 162 | } else if (tokens[1] == "::action") { 163 | std::string action = tokens[2]; 164 | for (int i = 3; i < tokens.size(); i++) { 165 | action += "\t" + tokens[i]; 166 | } 167 | ActionUnit action_unit = ActionUnit(action, tokens[2]); 168 | if (tokens[2] == "CONFIRM") { 169 | action_unit.action_name = "CONFIRM"; 170 | } else { 171 | action_unit.action_name = action; 172 | } 173 | 174 | if (train) { 175 | std::vector terms; 176 | boost::algorithm::split(terms, action, boost::is_any_of(" \t"), boost::token_compress_on); 177 | if (terms[0] == "CONFIRM") { 178 | unsigned wid = (word_map.contains(terms[1])) ? word_map.get(terms[1]) : word_map.get(UNK); 179 | if (wid == word_map.get(UNK)) { 180 | action_unit.idx = 0; 181 | } else { 182 | if (confirm_map.find(wid) == confirm_map.end()) { 183 | confirm_map[wid] = Alphabet(); 184 | confirm_map[wid].insert(word_map.get(wid)); 185 | } 186 | action_unit.idx = confirm_map[wid].insert(terms[2]); 187 | } 188 | } else if (terms[0] == "NEWNODE") { 189 | unsigned nid = node_map.insert(terms[1]); 190 | action_unit.idx = nid; 191 | } else if (terms[0] == "LEFT" || terms[0] == "RIGHT") { 192 | unsigned rid = rel_map.insert(terms[1]); 193 | action_unit.idx = rid; 194 | } else if (terms[0] == "ENTITY") { 195 | unsigned eid = entity_map.insert(terms[1]); 196 | action_unit.idx = eid; 197 | } 198 | unsigned aid = action_map.insert(action_unit.action_name); 199 | action_unit.aid = aid; 200 | } else { 201 | unsigned aid = (action_map.contains(action_unit.action_name)) ? action_map.get(action_unit.action_name) : action_map.get(UNK); 202 | action_unit.aid = aid; 203 | } 204 | action_units.push_back(action_unit); 205 | } 206 | } 207 | InputUnit input_unit; 208 | input_unit.wid = word_map.get(ROOT); 209 | input_unit.pid = pos_map.get(ROOT); 210 | input_unit.aux_wid = word_map.get(ROOT); 211 | input_unit.w_str = ROOT; 212 | input_units.push_back(input_unit); 213 | } 214 | 215 | unsigned Corpus::get_or_add_word(const std::string& word) { 216 | return word_map.insert(word); 217 | } 218 | 219 | void Corpus::stat() { 220 | _INFO << "Corpus:: # of words = " << word_map.size(); 221 | _INFO << "Corpus:: # of pos = " << pos_map.size(); 222 | } 223 | 224 | void Corpus::get_vocabulary_and_singletons() { 225 | std::map counter; 226 | for (auto& payload : training_inputs) { 227 | for (auto& item : payload.second) { 228 | vocab.insert(item.wid); 229 | ++counter[item.wid]; 230 | } 231 | } 232 | for (auto& payload : counter) { 233 | if (payload.second == 1) { singleton.insert(payload.first); } 234 | } 235 | } 236 | 237 | void load_pretrained_word_embedding(const std::string& embedding_file, 238 | unsigned pretrained_dim, 239 | std::unordered_map >& pretrained, 240 | Corpus& corpus) { 241 | pretrained[corpus.get_or_add_word(Corpus::BAD0)] = std::vector(pretrained_dim, 0.f); 242 | pretrained[corpus.get_or_add_word(Corpus::UNK)] = std::vector(pretrained_dim, 0.f); 243 | _INFO << "Main:: Loading from " << embedding_file << " with " << pretrained_dim << " dimensions."; 244 | std::ifstream ifs(embedding_file); 245 | BOOST_ASSERT_MSG(ifs, "Failed to load embedding file."); 246 | std::string line; 247 | // get the header in word2vec styled embedding. 
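// For reference, a word2vec-style text embedding file starts with a
// header line "<vocab_size> <dim>" and then holds one word plus <dim>
// float values per line, e.g. (illustrative values only):
//   400000 100
//   the 0.418 0.24968 -0.41242 ...
// The header line is consumed once below and discarded.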
248 |   std::getline(ifs, line);
249 |   std::vector<float> v(pretrained_dim, 0.f);
250 |   std::string word;
251 |   while (std::getline(ifs, line)) {
252 |     std::istringstream iss(line);
253 |     iss >> word;
254 |     // ideally, we should check that each row actually carries pretrained_dim values.
255 |     for (unsigned i = 0; i < pretrained_dim; ++i) { iss >> v[i]; }
256 |     unsigned id = corpus.get_or_add_word(word);
257 |     pretrained[id] = v;
258 |   }
259 | }
--------------------------------------------------------------------------------
/amr_parser/src/corpus.h:
--------------------------------------------------------------------------------
1 | #ifndef RLPARSER_CORPUS_H
2 | #define RLPARSER_CORPUS_H
3 | 
4 | #include 
5 | #include 
6 | #include 
7 | #include "ds.h"
8 | #include 
9 | 
10 | struct InputUnit {
11 |   unsigned wid;
12 |   unsigned aux_wid;
13 |   unsigned pid;
14 |   std::vector<unsigned> c_id;
15 | 
16 |   std::string w_str;
17 | 
18 |   friend class boost::serialization::access;
19 |   template <class Archive>
20 |   void serialize(Archive& ar, const unsigned version) {
21 |     ar & wid;
22 |     ar & pid;
23 |     ar & c_id;
24 |     ar & aux_wid;
25 |   }
26 | };
27 | 
28 | struct ActionUnit {
29 |   std::string a_str;
30 |   std::string action_name;
31 |   unsigned aid;
32 |   unsigned idx;  // payload index for the CONFIRM, NEWNODE, LEFT/RIGHT and ENTITY ops
33 | 
34 |   ActionUnit(std::string a_str, std::string action_name) : a_str(a_str), action_name(action_name) {}
35 |   friend class boost::serialization::access;
36 |   template <class Archive>
37 |   void serialize(Archive& ar, const unsigned version) {
38 |     ar & aid;
39 |     ar & idx;
40 |   }
41 | };
42 | 
43 | typedef std::vector<InputUnit> InputUnits;
44 | typedef std::vector<ActionUnit> ActionUnits;
45 | 
46 | struct Corpus {
47 |   const static char* UNK;
48 |   const static char* SPAN;
49 |   const static char* BAD0;
50 |   const static char* ROOT;
51 | 
52 |   unsigned n_train;
53 |   unsigned n_devel;
54 |   unsigned n_test;
55 | 
56 |   Alphabet word_map;
57 |   Alphabet pos_map;
58 |   Alphabet action_map;
59 |   Alphabet char_map;
60 |   Alphabet node_map;
61 |   Alphabet rel_map;
62 |   Alphabet entity_map;
63 | 
64 |   std::unordered_map<unsigned, Alphabet> confirm_map;
65 | 
66 |   std::unordered_map<unsigned, InputUnits> training_inputs;
67 |   std::unordered_map<unsigned, ActionUnits> training_actions;
68 |   std::unordered_map<unsigned, InputUnits> devel_inputs;
69 |   std::unordered_map<unsigned, ActionUnits> devel_actions;
70 |   std::unordered_map<unsigned, InputUnits> test_inputs;
71 |   std::unordered_map<unsigned, ActionUnits> test_actions;
72 | 
73 |   std::set<unsigned> vocab;
74 |   std::set<unsigned> singleton;
75 | 
76 |   Corpus();
77 | 
78 |   void load_training_data(const std::string& filename);
79 | 
80 |   void load_devel_data(const std::string& filename);
81 | 
82 |   void load_test_data(const std::string& filename);
83 | 
84 |   void parse_data(const std::string& data,
85 |                   InputUnits& input_units,
86 |                   ActionUnits& action_units,
87 |                   bool train);
88 | 
89 |   void get_vocabulary_and_singletons();
90 | 
91 |   unsigned get_or_add_word(const std::string& word);
92 |   void stat();
93 | };
94 | 
95 | void load_pretrained_word_embedding(const std::string& embedding_file,
96 |                                     unsigned pretrained_dim,
97 |                                     std::unordered_map<unsigned, std::vector<float> >& pretrained,
98 |                                     Corpus& corpus);
99 | 
100 | #endif  // end for RLPARSER_CORPUS_H
101 | 
--------------------------------------------------------------------------------
/amr_parser/src/ds.cc:
--------------------------------------------------------------------------------
1 | #include "ds.h"
2 | #include "logging.h"
3 | #include 
4 | #include 
5 | 
6 | Alphabet::Alphabet() : max_id(0), freezed(false), in_order(true) {
7 | 
8 | }
9 | 
10 | void Alphabet::freeze() {
11 |   freezed = true;  // once frozen, insert() refuses to add new entries
12 | }
13 | 
14 | unsigned Alphabet::size() const {
15 |   return max_id;
16 | }
17 | 
18 | unsigned Alphabet::get(const 
std::string& str) const { 19 | const auto found = str_to_id.find(str); 20 | if (found == str_to_id.end()) { 21 | _ERROR << "Alphabet :: str[\"" << str << "\"] not found!"; 22 | abort(); 23 | } 24 | return found->second; 25 | } 26 | 27 | std::string Alphabet::get(unsigned id) const { 28 | const auto found = id_to_str.find(id); 29 | if (found == id_to_str.end()) { 30 | _ERROR << "Alphabet :: id[" << id << "] not found!"; 31 | abort(); 32 | } 33 | return found->second; 34 | } 35 | 36 | bool Alphabet::contains(const std::string& str) const { 37 | const auto found = str_to_id.find(str); 38 | return (found != str_to_id.end()); 39 | } 40 | 41 | bool Alphabet::contains(unsigned id) const { 42 | const auto found = id_to_str.find(id); 43 | return (found != id_to_str.end()); 44 | } 45 | 46 | unsigned Alphabet::insert(const std::string& str) { 47 | BOOST_ASSERT_MSG(freezed == false, "Corpus::Insert should not insert into freezed alphabet."); 48 | if (contains(str)) { 49 | return get(str); 50 | } 51 | 52 | str_to_id[str] = max_id; 53 | id_to_str[max_id] = str; 54 | max_id++; 55 | return max_id - 1; 56 | } 57 | 58 | 59 | unsigned Alphabet::insert(const std::string& str, unsigned id) { 60 | _ERROR << "not implemented!"; 61 | abort(); 62 | } 63 | -------------------------------------------------------------------------------- /amr_parser/src/ds.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_DS_H 2 | #define RLPARSER_DS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | struct Alphabet { 9 | typedef std::unordered_map StringToIdMap; 10 | typedef std::unordered_map IdToStringMap; 11 | 12 | unsigned max_id; 13 | StringToIdMap str_to_id; 14 | IdToStringMap id_to_str; 15 | bool freezed; 16 | bool in_order; 17 | 18 | Alphabet(); 19 | 20 | void freeze(); 21 | unsigned size() const; 22 | unsigned get(const std::string& str) const; 23 | std::string get(unsigned id) const; 24 | bool contains(const std::string& str) const; 25 | bool contains(unsigned id) const; 26 | unsigned insert(const std::string& str); 27 | unsigned insert(const std::string& str, unsigned id); 28 | }; 29 | 30 | struct HashVector : public std::vector { 31 | bool operator == (const HashVector& other) const { 32 | if (size() != other.size()) { return false; } 33 | for (unsigned i = 0; i < size(); ++i) { 34 | if (at(i) != other.at(i)) { return false; } 35 | } 36 | return true; 37 | } 38 | }; 39 | 40 | 41 | namespace std { 42 | template<> 43 | struct hash { 44 | std::size_t operator()(const HashVector& values) const { 45 | size_t seed = 0; 46 | boost::hash_range(seed, values.begin(), values.end()); 47 | return seed; 48 | } 49 | }; 50 | } 51 | 52 | #endif // end for RLPARSER_DS_H -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/left_to_right/ ${PROJECT_SOURCE_DIR}/src/left_to_right/s2a/) 2 | 3 | add_subdirectory (parser) 4 | add_subdirectory (train) 5 | add_subdirectory (decode) 6 | add_subdirectory (evaluate) 7 | add_subdirectory (system) 8 | 9 | add_executable (parser_l2r main.cc) 10 | 11 | target_link_libraries (parser_l2r 12 | parser_l2r_system 13 | parser_l2r_parser 14 | parser_l2r_train 15 | parser_l2r_decode 16 | parser_l2r_evaluate 17 | dynet 18 | dynet_layer 19 | common 20 | ${LIBS}) 21 | 22 | add_executable (ensemble ensemble.cc) 23 | 24 | 
target_link_libraries (ensemble 25 | parser_l2r_system 26 | parser_l2r_parser 27 | parser_l2r_train 28 | parser_l2r_decode 29 | parser_l2r_evaluate 30 | dynet 31 | dynet_layer 32 | common 33 | ${LIBS}) 34 | 35 | if(UNIX AND NOT APPLE) 36 | target_link_libraries (parser_l2r rt) 37 | endif() 38 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/decode/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/left_to_right/) 2 | 3 | add_library (parser_l2r_decode 4 | testing.cc 5 | testing.h) 6 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/decode/testing.cc: -------------------------------------------------------------------------------- 1 | #include "testing.h" 2 | #include "logging.h" 3 | #include 4 | 5 | 6 | po::options_description Tester::get_options() { 7 | po::options_description cmd("Testing model options"); 8 | cmd.add_options() 9 | ("test_model_path", po::value(), "The path to the model") 10 | ("test_target", po::value()->default_value("train"), "The evaluation target.") 11 | ("test_mode", po::value()->default_value("decision_acc"), "The mode of testing [decision_acc, pred_detail].") 12 | ("test_num_tests", po::value()->default_value(1), "The number of tests run on each instance.") 13 | ; 14 | return cmd; 15 | } -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/decode/testing.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTING_H 2 | #define TESTING_H 3 | 4 | #include "parser/parser.h" 5 | 6 | struct Tester { 7 | /*enum TEST_TARGET { kTrain, kDevelopment }; 8 | TEST_TARGET test_target; 9 | bool enable_decision_acc_test; 10 | bool enable_pred_detail_test; 11 | 12 | Parser* parser; 13 | unsigned n_tests;*/ 14 | 15 | static po::options_description get_options(); 16 | 17 | /*Tester(const po::variables_map& conf, Parser* parser_); 18 | 19 | void test(const po::variables_map& conf, 20 | Corpus& corpus, 21 | const std::string& model_name);*/ 22 | }; 23 | 24 | #endif // end for TESTING_H -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/evaluate/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/left_to_right/) 2 | 3 | add_library (parser_l2r_evaluate evaluate.cc evaluate.h) 4 | 5 | target_link_libraries (parser_l2r_evaluate parser_l2r_parser) 6 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/evaluate/evaluate.cc: -------------------------------------------------------------------------------- 1 | #include "evaluate.h" 2 | #include "logging.h" 3 | #include "sys_utils.h" 4 | #include 5 | #include 6 | 7 | float evaluate(const po::variables_map & conf, 8 | Corpus & corpus, 9 | Parser & parser, 10 | const std::string & output, 11 | bool devel) { 12 | auto t_start = std::chrono::high_resolution_clock::now(); 13 | unsigned kUNK = corpus.get_or_add_word(Corpus::UNK); 14 | 15 | std::ofstream ofs(output); 16 | parser.inactivate_training(); 17 | 18 | unsigned n = (devel ? corpus.n_devel : corpus.n_test); 19 | std::unordered_map & inputs = (devel ? 
corpus.devel_inputs : corpus.test_inputs); 20 | 21 | for (unsigned sid = 0; sid < n; ++sid) { 22 | 23 | ofs << "# ::tok"; 24 | for (unsigned i = 0; i < inputs[sid].size() - 1; ++i) { //except for _ROOT_ 25 | ofs << " " << inputs[sid][i].w_str; 26 | } 27 | ofs << std::endl; 28 | 29 | InputUnits& input_units = inputs[sid]; 30 | 31 | for (InputUnit& u : input_units) { 32 | if (!corpus.vocab.count(u.wid)) { u.wid = kUNK; } 33 | } 34 | dynet::ComputationGraph cg; 35 | ActionUnits output; 36 | 37 | unsigned len = input_units.size(); 38 | State state(len); 39 | 40 | parser.new_graph(cg); 41 | 42 | parser.initialize(cg, input_units, state); 43 | unsigned n_actions = 0; 44 | while (!state.terminated() && n_actions++ < 500) { 45 | // collect all valid actions. 46 | std::vector valid_actions; 47 | parser.sys.get_valid_actions(state, valid_actions); 48 | //std::cerr << valid_actions.size() << std::endl; 49 | 50 | std::vector scores = dynet::as_vector(cg.get_value(parser.get_scores())); 51 | 52 | auto payload = Parser::get_best_action(scores, valid_actions); 53 | unsigned best_a = payload.first; 54 | unsigned best_c = 0; 55 | //if CONFIRM 56 | if (best_a == 0) { 57 | unsigned wid = 0; 58 | if (conf["system"].as() == "swap") { 59 | wid = state.stack.back().first; 60 | } else if (conf["system"].as() == "eager") { 61 | wid = state.buffer.back().first; 62 | } else { 63 | BOOST_ASSERT_MSG(false, "Illegal System"); 64 | } 65 | 66 | std::vector confirm_scores = dynet::as_vector(cg.get_value(parser.get_confirm_values(wid))); 67 | float best_score = -1e9f; 68 | for (unsigned i = 0; i < confirm_scores.size(); i++) { 69 | if (confirm_scores[i] > best_score) { 70 | best_score = confirm_scores[i]; 71 | best_c = i; 72 | } 73 | } 74 | //std::cerr << "# ::action\t" << "CONFIRM\t" << 75 | // corpus.word_map.get(wid) << "\t"; 76 | ofs << "# ::action\t" 77 | << "CONFIRM\t" 78 | << (corpus.word_map.contains(wid) ? corpus.word_map.get(wid) : std::string("_UNK_")) 79 | << "\t"; 80 | if (corpus.confirm_map.find(wid) == corpus.confirm_map.end()) { 81 | //std::cerr << corpus.word_map.get(wid) << std::endl; 82 | ofs << (corpus.word_map.contains(wid) ? corpus.word_map.get(wid) : std::string("_UNK_")) << std::endl; 83 | } else { 84 | //std::cerr << corpus.confirm_map[wid].get(best_c) << std::endl; 85 | ofs << corpus.confirm_map[wid].get(best_c) << std::endl; 86 | } 87 | 88 | 89 | } else { 90 | //std::cerr << "# ::action\t" << parser.sys.action_map.get(best_a) << std::endl; 91 | ofs << "# ::action\t" << parser.sys.action_map.get(best_a) << std::endl; 92 | } 93 | parser.perform_action(best_a, cg, state); 94 | } 95 | 96 | for (InputUnit& u : input_units) { u.wid = u.aux_wid; } 97 | 98 | ofs << std::endl; 99 | 100 | //ofs && confirm 101 | } 102 | ofs.close(); 103 | auto t_end = std::chrono::high_resolution_clock::now(); 104 | float f_score = execute_and_get_result(conf["external_eval"].as() + 105 | " " + 106 | (devel ? 
107 | conf["devel_gold"].as() : conf["test_gold"].as()) + 108 | " " + 109 | output); 110 | _INFO << "Evaluate:: Smatch " << f_score << " [" << corpus.n_devel << 111 | " sents in " << std::chrono::duration(t_end - t_start).count() << " ms]"; 112 | return f_score; 113 | } 114 | 115 | float evaluate_oracle(const po::variables_map & conf, 116 | Corpus & corpus, 117 | Parser & parser, 118 | const std::string & output, 119 | bool devel) { 120 | auto t_start = std::chrono::high_resolution_clock::now(); 121 | unsigned kUNK = corpus.get_or_add_word(Corpus::UNK); 122 | 123 | std::ofstream ofs(output); 124 | parser.inactivate_training(); 125 | 126 | unsigned n = (devel ? corpus.n_devel : corpus.n_test); 127 | std::unordered_map & inputs = (devel ? corpus.devel_inputs : corpus.test_inputs); 128 | std::unordered_map & actions = (devel ? corpus.devel_actions : corpus.test_actions); 129 | 130 | for (unsigned sid = 0; sid < n; ++sid) { 131 | 132 | ofs << "# ::tok"; 133 | for (unsigned i = 0; i < inputs[sid].size() - 1; ++i) { //except for _ROOT_ 134 | ofs << " " << inputs[sid][i].w_str; 135 | } 136 | ofs << std::endl; 137 | 138 | InputUnits& input_units = inputs[sid]; 139 | ActionUnits & parse_units = actions[sid]; 140 | 141 | for (InputUnit& u : input_units) { 142 | if (!corpus.vocab.count(u.wid)) { u.wid = kUNK; } 143 | } 144 | dynet::ComputationGraph cg; 145 | ActionUnits output; 146 | 147 | unsigned len = input_units.size(); 148 | State state(len); 149 | 150 | parser.new_graph(cg); 151 | 152 | parser.initialize(cg, input_units, state); 153 | unsigned n_actions = 0; 154 | while (!state.terminated() && n_actions++ < 500) { 155 | // collect all valid actions. 156 | std::vector valid_actions; 157 | parser.sys.get_valid_actions(state, valid_actions); 158 | //std::cerr << valid_actions.size() << std::endl; 159 | 160 | std::vector scores = dynet::as_vector(cg.get_value(parser.get_scores())); 161 | 162 | auto payload = Parser::get_best_action(scores, valid_actions); 163 | unsigned best_a = payload.first; 164 | unsigned best_c = 0; 165 | //if CONFIRM 166 | if (best_a == 0) { 167 | unsigned wid = 0; 168 | if (conf["system"].as() == "swap") { 169 | wid = state.stack.back().first; 170 | } else if (conf["system"].as() == "eager") { 171 | wid = state.buffer.back().first; 172 | } else { 173 | BOOST_ASSERT_MSG(false, "Illegal System"); 174 | } 175 | 176 | std::vector confirm_scores = dynet::as_vector(cg.get_value(parser.get_confirm_values(wid))); 177 | float best_score = -1e9f; 178 | for (unsigned i = 0; i < confirm_scores.size(); i++) { 179 | if (confirm_scores[i] > best_score) { 180 | best_score = confirm_scores[i]; 181 | best_c = i; 182 | } 183 | } 184 | //std::cerr << "# ::action\t" << "CONFIRM\t" << 185 | // corpus.word_map.get(wid) << "\t"; 186 | ofs << "# ::action\t" 187 | << "CONFIRM\t" 188 | << (corpus.word_map.contains(wid) ? corpus.word_map.get(wid) : std::string("_UNK_")) 189 | << "\t"; 190 | if (corpus.confirm_map.find(wid) == corpus.confirm_map.end()) { 191 | //std::cerr << corpus.word_map.get(wid) << std::endl; 192 | ofs << (corpus.word_map.contains(wid) ? 
corpus.word_map.get(wid) : std::string("_UNK_")) << std::endl; 193 | } else { 194 | //std::cerr << corpus.confirm_map[wid].get(best_c) << std::endl; 195 | ofs << corpus.confirm_map[wid].get(best_c) << std::endl; 196 | } 197 | 198 | 199 | } else { 200 | //std::cerr << "# ::action\t" << parser.sys.action_map.get(best_a) << std::endl; 201 | ofs << "# ::action\t" << parser.sys.action_map.get(best_a) << std::endl; 202 | } 203 | best_a = parse_units[n_actions].aid; 204 | parser.perform_action(best_a, cg, state); 205 | } 206 | 207 | for (InputUnit& u : input_units) { u.wid = u.aux_wid; } 208 | 209 | ofs << std::endl; 210 | 211 | //ofs && confirm 212 | } 213 | ofs.close(); 214 | auto t_end = std::chrono::high_resolution_clock::now(); 215 | float f_score = execute_and_get_result(conf["external_eval"].as() + 216 | " " + 217 | (devel ? 218 | conf["devel_gold"].as() : conf["test_gold"].as()) + 219 | " " + 220 | output); 221 | _INFO << "Evaluate:: Smatch " << f_score << " [" << corpus.n_devel << 222 | " sents in " << std::chrono::duration(t_end - t_start).count() << " ms]"; 223 | return f_score; 224 | } 225 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/evaluate/evaluate.h: -------------------------------------------------------------------------------- 1 | #ifndef EVALUATE_H 2 | #define EVALUATE_H 3 | 4 | #include 5 | #include 6 | #include "corpus.h" 7 | #include "parser/parser.h" 8 | #include 9 | 10 | namespace po = boost::program_options; 11 | 12 | float evaluate(const po::variables_map & conf, 13 | Corpus & corpus, 14 | Parser & parser, 15 | const std::string& output, 16 | bool devel); 17 | 18 | float evaluate_oracle(const po::variables_map & conf, 19 | Corpus & corpus, 20 | Parser & parser, 21 | const std::string& output, 22 | bool devel); 23 | 24 | 25 | #endif // end for EVALUATE_H -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "dynet/init.h" 5 | #include "corpus.h" 6 | #include "logging.h" 7 | #include "sys_utils.h" 8 | #include "trainer_utils.h" 9 | #include "parser/parser_builder.h" 10 | #include "system/swap.h" 11 | #include "system/eager.h" 12 | #include "train/algorithm.h" 13 | #include "evaluate/evaluate.h" 14 | #include "decode/testing.h" 15 | #include 16 | #include 17 | 18 | namespace po = boost::program_options; 19 | 20 | void init_command_line(int argc, char* argv[], po::variables_map& conf) { 21 | po::options_description general("Transition-based AMR parser."); 22 | general.add_options() 23 | ("train,t", "Use to specify to perform training.") 24 | ("architecture", po::value()->default_value("eager"), "The architecture [swap, eager].") 25 | ("algorithm", po::value()->default_value("supervised"), 26 | "The choice of reinforcement learning algorithm [supervised]") 27 | ("training_data,T", po::value(), "The path to the training data.") 28 | ("devel_data,d", po::value(), "The path to the development data.") 29 | ("test_data,e", po::value(), "The path to the test data.") 30 | ("pretrained,w", po::value(), "The path to the word embedding.") 31 | ("devel_gold", po::value(), "The path to the development data.") 32 | ("test_gold", po::value(), "The path to the test data.") 33 | ("model,m", po::value(), "The path to the model.") 34 | ("system", po::value()->default_value("eager"), "The transition system [swap, eager].") 35 | 
("unk_strategy,o", po::value()->default_value(1), "The unknown word strategy.") 36 | ("unk_prob,u", po::value()->default_value(0.2f), "The probability for replacing the training word.") 37 | ("layers", po::value()->default_value(2), "The number of layers in LSTM.") 38 | ("word_dim", po::value()->default_value(100), "Word dim") 39 | ("pos_dim", po::value()->default_value(20), "POS dim, set it as 0 to disable POS.") 40 | ("pretrained_dim", po::value()->default_value(100), "Pretrained input dimension.") 41 | ("char_dim", po::value()->default_value(50), "Character input dimension.") 42 | ("newnode_dim", po::value()->default_value(100), "Newnode embedding dimension.") 43 | ("action_dim", po::value()->default_value(20), "The dimension for action.") 44 | ("relation_dim", po::value()->default_value(32), "The dimension for relation.") 45 | ("entity_dim", po::value()->default_value(32), "The dimension for entity.") 46 | ("label_dim", po::value()->default_value(20), "The dimension for label.") 47 | ("lstm_input_dim", po::value()->default_value(100), "The dimension for lstm input.") 48 | ("hidden_dim", po::value()->default_value(100), "The dimension for hidden unit.") 49 | ("dropout", po::value()->default_value(0.f), "The dropout rate.") 50 | ("reward_type", po::value()->default_value("local"), 51 | "The type of reward [local, local0p10, local00n1, global, global_norm, global_maxout].") 52 | ("batch_size", po::value()->default_value(1), "The size of batch.") 53 | ("gamma", po::value()->default_value(1.f), "The gamma, reward discount factor.") 54 | ("max_iter", po::value()->default_value(10), "The maximum number of iteration.") 55 | ("report_stops", po::value()->default_value(100), "The reporting stops") 56 | ("report_reward", "Use to specify to report reward and q-value in evaluation.") 57 | ("evaluate_oracle", "Use to specify use oracle.") 58 | ("evaluate_stops", po::value()->default_value(2500), "The evaluation stops") 59 | ("evaluate_skips", po::value()->default_value(0), "skip evaluation on the first n round.") 60 | ("external_eval", po::value()->default_value("python -u ../scripts/eval.py"), "config the path for evaluation script") 61 | ("lambda", po::value()->default_value(0.f), "The L2 regularizer, should not set in --dynet-l2.") 62 | ("output", po::value(), "The path to the output file.") 63 | ("beam_size", po::value(), "The beam size.") 64 | ("random_seed", po::value()->default_value(7743), "The value of random seed.") 65 | ("verbose,v", "Details logging.") 66 | ("help,h", "show help information") 67 | ; 68 | 69 | po::options_description optimizer = get_optimizer_options(); 70 | po::options_description supervise = SupervisedTrainer::get_options(); 71 | po::options_description test = Tester::get_options(); 72 | 73 | po::options_description cmd("Allowed options"); 74 | cmd.add(general) 75 | .add(optimizer) 76 | .add(supervise) 77 | .add(test) 78 | ; 79 | 80 | po::store(po::parse_command_line(argc, argv, cmd), conf); 81 | if (conf.count("help")) { 82 | std::cerr << cmd << std::endl; 83 | exit(1); 84 | } 85 | init_boost_log(conf.count("verbose") > 0); 86 | if (!conf.count("training_data")) { 87 | std::cerr << "Please specify --training_data (-T), even in test" << std::endl; 88 | exit(1); 89 | } 90 | } 91 | 92 | int main(int argc, char** argv) { 93 | dynet::initialize(argc, argv, false); 94 | std::cerr << "command:"; 95 | for (int i = 0; i < argc; ++i) { std::cerr << ' ' << argv[i]; } 96 | std::cerr << std::endl; 97 | 98 | po::variables_map conf; 99 | init_command_line(argc, argv, conf); 100 | 101 
| dynet::rndeng = new std::mt19937(conf["random_seed"].as()); 102 | 103 | std::string model_name; 104 | if (conf.count("train")) { 105 | if (conf.count("model")) { 106 | model_name = conf["model"].as(); 107 | _INFO << "Main:: write parameters to: " << model_name; 108 | } else { 109 | std::string prefix("parser_l2r"); 110 | prefix = prefix + "." + conf["algorithm"].as(); 111 | model_name = get_model_name(conf, prefix); 112 | _INFO << "Main:: write parameters to: " << model_name; 113 | } 114 | } else { 115 | model_name = conf["model"].as(); 116 | _INFO << "Main:: evaluating model from: " << model_name; 117 | } 118 | 119 | Corpus corpus; 120 | corpus.load_training_data(conf["training_data"].as()); 121 | corpus.stat(); 122 | 123 | corpus.get_vocabulary_and_singletons(); 124 | 125 | std::unordered_map> pretrained; 126 | if (conf.count("pretrained")) { 127 | load_pretrained_word_embedding(conf["pretrained"].as(), 128 | conf["pretrained_dim"].as(), 129 | pretrained, corpus); 130 | } 131 | _INFO << "Main:: after loading pretrained embedding, size(vocabulary)=" << corpus.word_map.size(); 132 | 133 | dynet::ParameterCollection model; 134 | TransitionSystem* sys = nullptr; 135 | 136 | std::string system_name = conf["system"].as(); 137 | if (system_name == "swap") { 138 | sys = new Swap(corpus.action_map, corpus.node_map, corpus.rel_map, corpus.entity_map); 139 | } else if (system_name == "eager") { 140 | sys = new Eager(corpus.action_map, corpus.node_map, corpus.rel_map, corpus.entity_map); 141 | } else { 142 | _ERROR << "Main:: Unknown transition system: " << system_name; 143 | exit(1); 144 | } 145 | _INFO << "Main:: transition system: " << system_name; 146 | 147 | Parser* parser = ParserBuilder().build(conf, model, (*sys), corpus, pretrained); 148 | 149 | _INFO << "Main:: char_map unk id: " << corpus.char_map.get(corpus.UNK); 150 | 151 | corpus.load_devel_data(conf["devel_data"].as()); 152 | _INFO << "Main:: after loading development data, size(vocabulary)=" << corpus.word_map.size(); 153 | 154 | if (conf.count("test_data")) { 155 | corpus.load_test_data(conf["test_data"].as()); 156 | _INFO << "Main:: after loading test data, size(vocabulary)=" << corpus.word_map.size(); 157 | } 158 | 159 | std::string output; 160 | if (conf.count("output")) { 161 | output = conf["output"].as(); 162 | } else { 163 | int pid = portable_getpid(); 164 | #ifdef _MSC_VER 165 | output = "parser_l2r.evaluator." + boost::lexical_cast(pid); 166 | #else 167 | output = "/tmp/parser_l2r.evaluator." 
+ boost::lexical_cast(pid); 168 | #endif 169 | } 170 | _INFO << "Main:: write tmp file to: " << output; 171 | 172 | if (conf.count("train")) { 173 | const std::string algorithm = conf["algorithm"].as(); 174 | _INFO << "Main:: algorithm: " << algorithm; 175 | if (algorithm == "supervised" || algorithm == "sup") { 176 | SupervisedTrainer trainer(conf, parser); 177 | trainer.train(conf, corpus, model_name, output); 178 | }/* else if (algorithm == "testing") { 179 | Tester tester(conf, parser); 180 | tester.test(conf, corpus, model_name); 181 | } else { 182 | _ERROR << "Main:: Unknown RL algorithm."; 183 | }*/ 184 | } 185 | 186 | dynet::load_dynet_model(model_name, (&model)); 187 | float dev_f, test_f; 188 | if (conf.count("evaluate_oracle")) { 189 | dev_f = evaluate_oracle(conf, corpus, (*parser), output, true); 190 | test_f = evaluate_oracle(conf, corpus, (*parser), output, false); 191 | } else { 192 | dev_f = evaluate(conf, corpus, (*parser), output, true); 193 | test_f = evaluate(conf, corpus, (*parser), output, false); 194 | } 195 | _INFO << "Final score: dev: " << dev_f << ", test: " << test_f; 196 | 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src/ ${PROJECT_SOURCE_DIR}/src/left_to_right/) 2 | 3 | add_library (parser_l2r_parser 4 | parser.cc 5 | parser.h 6 | parser_swap.cc 7 | parser_swap.h 8 | parser_eager.cc 9 | parser_eager.h 10 | parser_builder.cc 11 | parser_builder.h) 12 | 13 | target_link_libraries (parser_l2r_parser parser_l2r_system) 14 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/parser.cc: -------------------------------------------------------------------------------- 1 | #include "parser.h" 2 | #include "dynet/expr.h" 3 | #include "corpus.h" 4 | #include "logging.h" 5 | #include 6 | #include 7 | 8 | std::pair Parser::get_best_action(const std::vector& scores, 9 | const std::vector& valid_actions) { 10 | unsigned best_a = valid_actions[0]; 11 | float best_score = scores[best_a]; 12 | //! should use next valid action. 
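//  `scores` holds one unnormalized value per action id, but only the ids listed
//  in `valid_actions` may be chosen, so the argmax below scans that subset
//  (index 0 seeds best_a/best_score, hence the loop starts at 1).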
13 | for (unsigned i = 1; i < valid_actions.size(); ++i) { 14 | unsigned a = valid_actions[i]; 15 | if (best_score < scores[a]) { 16 | best_a = a; 17 | best_score = scores[a]; 18 | } 19 | } 20 | return std::make_pair(best_a, best_score); 21 | } 22 | 23 | dynet::Expression Parser::get_scores() { 24 | return get_a_values(); 25 | } 26 | 27 | void Parser::initialize(dynet::ComputationGraph & cg, 28 | const InputUnits & input, 29 | State & state) { 30 | initialize_state(input, state); 31 | initialize_parser(cg, input); 32 | } 33 | 34 | void Parser::initialize_state(const InputUnits & input, State & state) { 35 | unsigned len = input.size(); 36 | state.buffer.resize(len); 37 | for (unsigned i = 0; i < len; ++i) { state.buffer[len - i - 1] = std::make_pair(input[i].wid, 0); } 38 | state.buffer[0].second = 2; //Corpus::ROOT; 39 | } -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/parser.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_LEFT_TO_RIGHT_S2A_PARSER_H 2 | #define RLPARSER_LEFT_TO_RIGHT_S2A_PARSER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "corpus.h" 8 | #include "system/state.h" 9 | #include "system/system.h" 10 | #include "dynet/expr.h" 11 | 12 | namespace po = boost::program_options; 13 | 14 | struct Parser { 15 | dynet::ParameterCollection& model; 16 | TransitionSystem& sys; 17 | std::string system_name; 18 | 19 | Parser(dynet::ParameterCollection & m, 20 | TransitionSystem& s, 21 | const std::string & sys_name) : 22 | model(m), sys(s), system_name(sys_name){} 23 | 24 | virtual Parser* copy_architecture(dynet::Model& new_model) = 0; 25 | virtual void activate_training() = 0; 26 | virtual void inactivate_training() = 0; 27 | virtual void new_graph(dynet::ComputationGraph& cg) = 0; 28 | virtual std::vector get_params() = 0; 29 | 30 | void initialize(dynet::ComputationGraph& cg, 31 | const InputUnits& input, 32 | State& state); 33 | 34 | void initialize_state(const InputUnits& input, 35 | State& state); 36 | 37 | virtual void initialize_parser(dynet::ComputationGraph& cg, 38 | const InputUnits& input) = 0; 39 | 40 | virtual void perform_action(const unsigned& action, 41 | dynet::ComputationGraph& cg, 42 | State& state) = 0; 43 | 44 | static std::pair get_best_action(const std::vector& scores, 45 | const std::vector& valid_actions); 46 | 47 | /// Get the un-softmaxed scores from the LSTM-parser. 
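/// This simply forwards to the subclass's get_a_values() and returns one raw
/// logit per transition action; callers normalize themselves (e.g. with
/// dynet::pickneglogsoftmax during training, or a plain softmax at test time).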
48 | dynet::Expression get_scores(); 49 | 50 | virtual dynet::Expression get_confirm_values(unsigned wid) = 0; 51 | virtual dynet::Expression get_a_values() = 0; 52 | }; 53 | 54 | #endif // end for RLPARSER_LEFT_TO_RIGHT_S2A_PARSER_H 55 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/parser_builder.cc: -------------------------------------------------------------------------------- 1 | #include "parser_swap.h" 2 | #include "parser_eager.h" 3 | #include "parser_builder.h" 4 | #include "logging.h" 5 | 6 | po::options_description ParserBuilder::get_options() { 7 | po::options_description cmd("Parser settings."); 8 | return cmd; 9 | } 10 | 11 | Parser * ParserBuilder::build(const po::variables_map& conf, 12 | dynet::ParameterCollection & model, 13 | TransitionSystem& sys, 14 | const Corpus& corpus, 15 | const std::unordered_map>& pretrained) { 16 | std::string system_name = conf["system"].as(); 17 | Parser* parser = nullptr; 18 | std::string arch_name = conf["architecture"].as(); 19 | if (arch_name == "swap") { 20 | parser = new ParserSwap(model, 21 | corpus.vocab.size() + 10, 22 | conf["word_dim"].as(), 23 | corpus.pos_map.size() + 10, 24 | conf["pos_dim"].as(), 25 | corpus.word_map.size() + 1, 26 | conf["pretrained_dim"].as(), 27 | corpus.char_map.size() + 1, 28 | conf["char_dim"].as(), 29 | sys.num_actions(), 30 | conf["action_dim"].as(), 31 | sys.node_map.size(), 32 | conf["lstm_input_dim"].as(), 33 | sys.rel_map.size(), 34 | conf["relation_dim"].as(), 35 | sys.entity_map.size(), 36 | conf["entity_dim"].as(), 37 | conf["layers"].as(), 38 | conf["lstm_input_dim"].as(), 39 | conf["hidden_dim"].as(), 40 | system_name, 41 | sys, 42 | pretrained, 43 | corpus.confirm_map, 44 | corpus.char_map); 45 | } else if (arch_name == "eager") { 46 | parser = new ParserEager(model, 47 | corpus.vocab.size() + 10, 48 | conf["word_dim"].as(), 49 | corpus.pos_map.size() + 10, 50 | conf["pos_dim"].as(), 51 | corpus.word_map.size() + 1, 52 | conf["pretrained_dim"].as(), 53 | corpus.char_map.size() + 1, 54 | conf["char_dim"].as(), 55 | sys.num_actions(), 56 | conf["action_dim"].as(), 57 | sys.node_map.size(), 58 | conf["lstm_input_dim"].as(), 59 | sys.rel_map.size(), 60 | conf["relation_dim"].as(), 61 | sys.entity_map.size(), 62 | conf["entity_dim"].as(), 63 | conf["layers"].as(), 64 | conf["lstm_input_dim"].as(), 65 | conf["hidden_dim"].as(), 66 | system_name, 67 | sys, 68 | pretrained, 69 | corpus.confirm_map, 70 | corpus.char_map); 71 | } else { 72 | _ERROR << "Main:: Unknown architecture name: " << arch_name; 73 | } 74 | _INFO << "Main:: architecture: " << arch_name; 75 | return parser; 76 | } 77 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/parser_builder.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_BUILDER_H 2 | #define PARSER_BUILDER_H 3 | 4 | #include 5 | #include "parser.h" 6 | #include "dynet/model.h" 7 | #include 8 | 9 | namespace po = boost::program_options; 10 | 11 | struct ParserBuilder { 12 | static po::options_description get_options(); 13 | static Parser* build(const po::variables_map& conf, 14 | dynet::ParameterCollection& model, 15 | TransitionSystem& sys, 16 | const Corpus& corpus, 17 | const std::unordered_map>& pretrained); 18 | }; 19 | #endif // end for PARSER_BUILDER_H -------------------------------------------------------------------------------- 
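
Both architectures are wired up through the same call; a minimal sketch of the intended use (assuming `conf`, `corpus`, `sys`, and `pretrained` are already populated as in main.cc above):

    dynet::ParameterCollection model;
    Parser* parser = ParserBuilder::build(conf, model, *sys, corpus, pretrained);
    // conf["architecture"] selects ParserSwap or ParserEager; any other name only
    // logs an error, so build() returns nullptr and the caller should check for it.
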
/amr_parser/src/left_to_right/parser/parser_eager.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_EAGER_H 2 | #define PARSER_EAGER_H 3 | 4 | #include "parser.h" 5 | #include "lstm.h" 6 | #include "dynet_layer/layer.h" 7 | #include 8 | #include 9 | #include 10 | 11 | namespace po = boost::program_options; 12 | 13 | struct ParserEager : public Parser { 14 | struct TransitionSystemFunction { 15 | virtual void perform_action(const unsigned& action, 16 | dynet::ComputationGraph& cg, 17 | std::vector& stack, 18 | std::vector& buffer, 19 | std::vector& deque, 20 | dynet::RNNBuilder& a_lstm, dynet::RNNPointer& a_pointer, 21 | dynet::RNNBuilder& s_lstm, dynet::RNNPointer& s_pointer, 22 | dynet::RNNBuilder& q_lstm, dynet::RNNPointer& q_pointer, 23 | dynet::RNNBuilder& d_lstm, dynet::RNNPointer& d_pointer, 24 | dynet::Expression& act_expr, 25 | const Alphabet & action_map, 26 | const Alphabet & node_map, 27 | SymbolEmbedding & node_emb, 28 | const Alphabet & rel_map, 29 | SymbolEmbedding & rel_emb, 30 | const Alphabet & entity_map, 31 | SymbolEmbedding & entity_emb, 32 | DenseLayer & confirm_layer, 33 | Merge3Layer & merge_parent, 34 | Merge3Layer & merge_child, 35 | Merge2Layer & merge_token, 36 | Merge2Layer & merge_entity) = 0; 37 | dynet::Expression get_arg_emb(const std::string & a_str, const Alphabet & arg_map, SymbolEmbedding & arg_emb); 38 | }; 39 | 40 | struct EagerFunction : public TransitionSystemFunction { 41 | void perform_action(const unsigned& action, 42 | dynet::ComputationGraph& cg, 43 | std::vector& stack, 44 | std::vector& buffer, 45 | std::vector& deque, 46 | dynet::RNNBuilder& a_lstm, dynet::RNNPointer& a_pointer, 47 | dynet::RNNBuilder& s_lstm, dynet::RNNPointer& s_pointer, 48 | dynet::RNNBuilder& q_lstm, dynet::RNNPointer& q_pointer, 49 | dynet::RNNBuilder& d_lstm, dynet::RNNPointer& d_pointer, 50 | dynet::Expression& act_expr, 51 | const Alphabet & action_map, 52 | const Alphabet & node_map, 53 | SymbolEmbedding & node_emb, 54 | const Alphabet & rel_map, 55 | SymbolEmbedding & rel_emb, 56 | const Alphabet & entity_map, 57 | SymbolEmbedding & entity_emb, 58 | DenseLayer & confirm_layer, 59 | Merge3Layer & merge_parent, 60 | Merge3Layer & merge_child, 61 | Merge2Layer & merge_token, 62 | Merge2Layer & merge_entity) override; 63 | }; 64 | 65 | LSTMBuilder s_lstm; 66 | LSTMBuilder q_lstm; 67 | LSTMBuilder a_lstm; 68 | LSTMBuilder d_lstm; 69 | 70 | BiLSTMBuilder c_lstm; 71 | 72 | SymbolEmbedding word_emb; 73 | SymbolEmbedding pos_emb; 74 | SymbolEmbedding preword_emb; 75 | SymbolEmbedding act_emb; 76 | SymbolEmbedding char_emb; 77 | SymbolEmbedding node_emb; 78 | SymbolEmbedding rel_emb; 79 | SymbolEmbedding entity_emb; 80 | 81 | Merge3Layer merge_input; // merge (pos, pretained, char_emb) 82 | Merge4Layer merge; // merge (s_lstm, q_lstm, a_lstm, d_lstm) 83 | Merge3Layer merge_parent; // merge (parent, rel, child) -> parent 84 | Merge3Layer merge_child; // merge (parent, rel, child) -> child 85 | Merge2Layer merge_token; // merge (A, B) -> AB 86 | Merge2Layer merge_entity; // merge (AB, entity_label) -> X 87 | DenseLayer scorer; // Q / A value scorer. 88 | DenseLayer confirm_layer; 89 | 90 | 91 | Alphabet char_map; 92 | 93 | std::unordered_map confirm_scorer; //confirm scorer. 
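// Keyed by word id: confirm_map appears to give each word its own inventory of
// candidate concepts, and confirm_scorer the matching per-word output layer that
// get_confirm_values(wid) uses to score a CONFIRM decision for that word.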
94 | std::unordered_map confirm_map; 95 | 96 | dynet::Expression confirm_to_one; 97 | 98 | dynet::Parameter p_action_start; // start of action 99 | dynet::Parameter p_buffer_guard; // end of buffer 100 | dynet::Parameter p_stack_guard; // end of stack 101 | dynet::Parameter p_deque_guard; // end of deque 102 | dynet::Expression action_start; 103 | dynet::Expression buffer_guard; 104 | dynet::Expression stack_guard; 105 | dynet::Expression deque_guard; 106 | 107 | /// state machine 108 | dynet::RNNPointer s_pointer; 109 | dynet::RNNPointer q_pointer; 110 | dynet::RNNPointer a_pointer; 111 | dynet::RNNPointer d_pointer; 112 | std::vector stack; 113 | std::vector buffer; 114 | std::vector deque; 115 | 116 | bool trainable; 117 | /// The reference 118 | TransitionSystemFunction* sys_func; 119 | const std::unordered_map>& pretrained; 120 | 121 | /// The Configurations: useful for other models. 122 | unsigned size_w, dim_w, size_p, dim_p, size_t, dim_t, size_c, dim_c, size_a, dim_a, size_n, dim_n, size_r, dim_r, size_e, dim_e; 123 | unsigned n_layers, dim_lstm_in, dim_hidden; 124 | 125 | explicit ParserEager(dynet::ParameterCollection & m, 126 | unsigned size_w, // 127 | unsigned dim_w, // word size, word dim 128 | unsigned size_p, // 129 | unsigned dim_p, // pos size, pos dim 130 | unsigned size_t, // 131 | unsigned dim_t, // pword size, pword dim 132 | unsigned size_c, // 133 | unsigned dim_c, // char size, char dim 134 | unsigned size_a, // 135 | unsigned dim_a, // act size, act dim 136 | unsigned size_n, // 137 | unsigned dim_n, // newnode size, newnode dim 138 | unsigned size_r, 139 | unsigned dim_r, // rel size, rel dim 140 | unsigned size_e, 141 | unsigned dim_e, // entity size, entity dim 142 | unsigned n_layers, 143 | unsigned dim_lstm_in, 144 | unsigned dim_hidden, 145 | const std::string& system_name, 146 | TransitionSystem& system, 147 | const std::unordered_map>& pretrained, 148 | const std::unordered_map & confirm_map, 149 | const Alphabet & char_map); 150 | 151 | Parser* copy_architecture(dynet::Model& new_model) override; 152 | void activate_training() override; 153 | void inactivate_training() override; 154 | void new_graph(dynet::ComputationGraph& cg) override; 155 | std::vector get_params() override; 156 | 157 | void initialize_parser(dynet::ComputationGraph& cg, 158 | const InputUnits& input) override; 159 | 160 | void perform_action(const unsigned& action, 161 | dynet::ComputationGraph& cg, 162 | State& state) override; 163 | 164 | /// Get the un-softmaxed scores from the LSTM-parser. 
165 | dynet::Expression get_confirm_values(unsigned wid) override; 166 | dynet::Expression get_a_values() override; 167 | }; 168 | 169 | #endif // end for PARSER_H 170 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/parser/parser_swap.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_FULL_H 2 | #define PARSER_FULL_H 3 | 4 | #include "parser.h" 5 | #include "lstm.h" 6 | #include "dynet_layer/layer.h" 7 | #include 8 | #include 9 | #include 10 | 11 | namespace po = boost::program_options; 12 | 13 | struct ParserSwap : public Parser { 14 | struct TransitionSystemFunction { 15 | virtual void perform_action(const unsigned& action, 16 | dynet::ComputationGraph& cg, 17 | std::vector& stack, 18 | std::vector& buffer, 19 | dynet::RNNBuilder& a_lstm, dynet::RNNPointer& a_pointer, 20 | dynet::RNNBuilder& s_lstm, dynet::RNNPointer& s_pointer, 21 | dynet::RNNBuilder& q_lstm, dynet::RNNPointer& q_pointer, 22 | dynet::Expression& act_expr, 23 | const Alphabet & action_map, 24 | const Alphabet & node_map, 25 | SymbolEmbedding & node_emb, 26 | const Alphabet & rel_map, 27 | SymbolEmbedding & rel_emb, 28 | const Alphabet & entity_map, 29 | SymbolEmbedding & entity_emb, 30 | DenseLayer & confirm_layer, 31 | Merge3Layer & merge_parent, 32 | Merge3Layer & merge_child, 33 | Merge2Layer & merge_token, 34 | Merge2Layer & merge_entity) = 0; 35 | dynet::Expression get_arg_emb(const std::string & a_str, const Alphabet & arg_map, SymbolEmbedding & arg_emb); 36 | }; 37 | 38 | struct SwapFunction : public TransitionSystemFunction { 39 | void perform_action(const unsigned& action, 40 | dynet::ComputationGraph& cg, 41 | std::vector& stack, 42 | std::vector& buffer, 43 | dynet::RNNBuilder& a_lstm, dynet::RNNPointer& a_pointer, 44 | dynet::RNNBuilder& s_lstm, dynet::RNNPointer& s_pointer, 45 | dynet::RNNBuilder& q_lstm, dynet::RNNPointer& q_pointer, 46 | dynet::Expression& act_expr, 47 | const Alphabet & action_map, 48 | const Alphabet & node_map, 49 | SymbolEmbedding & node_emb, 50 | const Alphabet & rel_map, 51 | SymbolEmbedding & rel_emb, 52 | const Alphabet & entity_map, 53 | SymbolEmbedding & entity_emb, 54 | DenseLayer & confirm_layer, 55 | Merge3Layer & merge_parent, 56 | Merge3Layer & merge_child, 57 | Merge2Layer & merge_token, 58 | Merge2Layer & merge_entity) override; 59 | }; 60 | 61 | LSTMBuilder s_lstm; 62 | LSTMBuilder q_lstm; 63 | LSTMBuilder a_lstm; 64 | BiLSTMBuilder c_lstm; 65 | 66 | SymbolEmbedding word_emb; 67 | SymbolEmbedding pos_emb; 68 | SymbolEmbedding preword_emb; 69 | SymbolEmbedding act_emb; 70 | SymbolEmbedding char_emb; 71 | SymbolEmbedding node_emb; 72 | SymbolEmbedding rel_emb; 73 | SymbolEmbedding entity_emb; 74 | 75 | Merge3Layer merge_input; // merge (pos, pretained, char_emb) 76 | Merge3Layer merge; // merge (s_lstm, q_lstm, a_lstm) 77 | Merge3Layer merge_parent; // merge (parent, rel, child) -> parent 78 | Merge3Layer merge_child; // merge (parent, rel, child) -> child 79 | Merge2Layer merge_token; // merge (A, B) -> AB 80 | Merge2Layer merge_entity; // merge (AB, entity_label) -> X 81 | DenseLayer scorer; // Q / A value scorer. 82 | DenseLayer confirm_layer; 83 | 84 | Alphabet char_map; 85 | std::unordered_map confirm_scorer; //confirm scorer. 
86 | std::unordered_map confirm_map; 87 | 88 | dynet::Expression confirm_to_one; 89 | 90 | dynet::Parameter p_action_start; // start of action 91 | dynet::Parameter p_buffer_guard; // end of buffer 92 | dynet::Parameter p_stack_guard; // end of stack 93 | dynet::Expression action_start; 94 | dynet::Expression buffer_guard; 95 | dynet::Expression stack_guard; 96 | 97 | /// state machine 98 | dynet::RNNPointer s_pointer; 99 | dynet::RNNPointer q_pointer; 100 | dynet::RNNPointer a_pointer; 101 | std::vector stack; 102 | std::vector buffer; 103 | 104 | bool trainable; 105 | /// The reference 106 | TransitionSystemFunction* sys_func; 107 | const std::unordered_map>& pretrained; 108 | 109 | /// The Configurations: useful for other models. 110 | unsigned size_w, dim_w, size_p, dim_p, size_t, dim_t, size_c, dim_c, size_a, dim_a, size_n, dim_n, size_r, dim_r, size_e, dim_e; 111 | unsigned n_layers, dim_lstm_in, dim_hidden; 112 | 113 | explicit ParserSwap(dynet::ParameterCollection& m, 114 | unsigned size_w, // 115 | unsigned dim_w, // word size, word dim 116 | unsigned size_p, // 117 | unsigned dim_p, // pos size, pos dim 118 | unsigned size_t, // 119 | unsigned dim_t, // pword size, pword dim 120 | unsigned size_c, // 121 | unsigned dim_c, // char size, char dim 122 | unsigned size_a, // 123 | unsigned dim_a, // act size, act dim 124 | unsigned size_n, // 125 | unsigned dim_n, // newnode size, newnode dim 126 | unsigned size_r, 127 | unsigned dim_r, // rel size, rel dim 128 | unsigned size_e, 129 | unsigned dim_e, // entity size, entity dim 130 | unsigned n_layers, 131 | unsigned dim_lstm_in, 132 | unsigned dim_hidden, 133 | const std::string& system_name, 134 | TransitionSystem& system, 135 | const std::unordered_map>& pretrained, 136 | const std::unordered_map & confirm_map, 137 | const Alphabet & char_map); 138 | 139 | Parser* copy_architecture(dynet::Model& new_model) override; 140 | void activate_training() override; 141 | void inactivate_training() override; 142 | void new_graph(dynet::ComputationGraph& cg) override; 143 | std::vector get_params() override; 144 | 145 | void initialize_parser(dynet::ComputationGraph& cg, 146 | const InputUnits& input) override; 147 | 148 | void perform_action(const unsigned& action, 149 | dynet::ComputationGraph& cg, 150 | State& state) override; 151 | 152 | /// Get the un-softmaxed scores from the LSTM-parser. 
153 | dynet::Expression get_confirm_values(unsigned wid) override; 154 | dynet::Expression get_a_values() override; 155 | }; 156 | 157 | #endif // end for PARSER_H 158 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/left_to_right/) 2 | 3 | add_library (parser_l2r_system 4 | swap.cc 5 | swap.h 6 | eager.cc 7 | eager.h 8 | system.cc 9 | system.h 10 | state.h 11 | state.cc) 12 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/eager.cc: -------------------------------------------------------------------------------- 1 | #include "eager.h" 2 | #include "logging.h" 3 | #include "corpus.h" 4 | #include 5 | #include 6 | 7 | Eager::Eager(const Alphabet & action_map, const Alphabet & node_map, const Alphabet & rel_map, const Alphabet & entity_map) : 8 | TransitionSystem(action_map, node_map, rel_map, entity_map) { 9 | n_actions = action_map.size(); 10 | _INFO << "TransitionSystem:: show action names:"; 11 | for (const auto& x : action_map.str_to_id) { 12 | _INFO << "- " << x.first; 13 | } 14 | } 15 | 16 | std::string Eager::name(unsigned id) const { 17 | BOOST_ASSERT_MSG(id < action_map.size(), "id in illegal range"); 18 | return action_map.get(id); 19 | } 20 | 21 | unsigned Eager::num_actions() const { return n_actions; } 22 | 23 | void Eager::perform_action(State & state, const unsigned & action) { 24 | std::string action_type = get_action_type(action, action_map); 25 | if (action_type == "SHIFT") { 26 | shift_unsafe(state); 27 | } else if (action_type == "CONFIRM") { 28 | confirm_unsafe(state); 29 | } else if (action_type == "MERGE") { 30 | merge_unsafe(state); 31 | } else if (action_type == "ENTITY") { 32 | entity_unsafe(state); 33 | } else if (action_type == "NEWNODE") { 34 | unsigned nid = get_action_arg1(node_map, action); 35 | newnode_unsafe(state, nid); 36 | } else if (action_type == "REDUCE") { 37 | reduce_unsafe(state); 38 | } else if (action_type == "DROP") { 39 | drop_unsafe(state); 40 | } else if (action_type == "CACHE") { 41 | cache_unsafe(state); 42 | } else if (action_type == "LEFT") { 43 | unsigned rid = get_action_arg1(rel_map, action); 44 | la_unsafe(state, rid); 45 | } else if (action_type == "RIGHT") { 46 | unsigned rid = get_action_arg1(rel_map, action); 47 | ra_unsafe(state, rid); 48 | } else { 49 | BOOST_ASSERT_MSG(false, "Illegal Action"); 50 | } 51 | } 52 | 53 | void Eager::get_valid_actions(const State & state, 54 | std::vector& valid_actions) { 55 | valid_actions.clear(); 56 | for (unsigned a = 0; a < n_actions; ++a) { 57 | //if (!is_valid_action(state, action_names[a])) { continue; } 58 | if (!is_valid_action(state, a)) { continue; } 59 | valid_actions.push_back(a); 60 | } 61 | BOOST_ASSERT_MSG(valid_actions.size() > 0, "There should be one or more valid action."); 62 | } 63 | 64 | void Eager::shift_unsafe(State & state) const { 65 | while (state.deque.size() > 0) { 66 | state.stack.push_back(state.deque.back()); 67 | state.deque.pop_back(); 68 | } 69 | state.stack.push_back(state.buffer.back()); 70 | state.buffer.pop_back(); 71 | } 72 | 73 | void Eager::confirm_unsafe(State & state) const { 74 | state.buffer[state.buffer.size() - 1] = std::make_pair(state.new_amr_node(), 2); 75 | } 76 | 77 | void Eager::reduce_unsafe(State & state) const { 78 | 
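// REDUCE discards the (confirmed) stack top; validity requires stack.back().second > 1.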
state.stack.pop_back(); 79 | } 80 | 81 | void Eager::merge_unsafe(State & state) const { 82 | state.buffer.pop_back(); 83 | state.buffer[state.buffer.size() - 1].second = 1; 84 | state.buffer[state.buffer.size() - 1].first = 1; 85 | // state.buffer[state.buffer.size() - 1].first = -1; 86 | } 87 | 88 | void Eager::entity_unsafe(State & state) const { 89 | state.buffer[state.buffer.size() - 1] = std::make_pair(state.new_amr_node(), 2); 90 | } 91 | 92 | void Eager::newnode_unsafe(State & state, const unsigned & node) const { 93 | state.buffer.push_back(std::make_pair(state.new_amr_node(), state.buffer.back().second + 1)); 94 | state.buffer[state.buffer.size() - 2].second += 5; 95 | } 96 | 97 | void Eager::drop_unsafe(State & state) const { 98 | state.buffer.pop_back(); 99 | } 100 | 101 | void Eager::cache_unsafe(State & state) const { 102 | state.deque.push_back(state.stack.back()); 103 | state.stack.pop_back(); 104 | } 105 | 106 | void Eager::la_unsafe(State & state, const unsigned & rel) const { 107 | unsigned u = state.buffer.back().first; 108 | unsigned v = state.stack.back().first; 109 | state.existing_edges.insert({ u, rel }); 110 | } 111 | 112 | void Eager::ra_unsafe(State& state, const unsigned & rel) const { 113 | unsigned u = state.stack.back().first; 114 | unsigned v = state.buffer.back().first; 115 | state.existing_edges.insert({ u, rel }); 116 | } 117 | 118 | 119 | std::string Eager::get_action_type(const unsigned & idx, const Alphabet & action_map) { 120 | std::string action = action_map.get(idx); 121 | std::vector terms; 122 | boost::algorithm::split(terms, action, boost::is_any_of(" \t"), boost::token_compress_on); 123 | return terms[0]; 124 | } 125 | 126 | bool Eager::is_valid_action(const State& state, const unsigned& action) const { 127 | std::string action_type = get_action_type(action, action_map); 128 | if (action_type == "_UNK_") { 129 | return false; 130 | } else if (action_type == "SHIFT") { 131 | return state.buffer.size() > 0 && state.buffer.back().second > 1; 132 | } else if (action_type == "CONFIRM") { 133 | return state.buffer.size() > 0 && state.buffer.back().second < 2; 134 | } else if (action_type == "MERGE") { 135 | return state.buffer.size() > 1 && state.buffer.back().second < 2 && state.buffer[state.buffer.size() - 2].second == 0; 136 | } else if (action_type == "ENTITY") { 137 | return state.buffer.size() > 0 && state.buffer.back().second < 2; 138 | } else if (action_type == "REDUCE") { 139 | return state.stack.size() > 0 && state.stack.back().second > 1; 140 | } else if (action_type == "DROP") { 141 | return state.buffer.size() > 0 && state.buffer.back().second == 0; 142 | } else if (action_type == "CACHE") { 143 | return state.buffer.size() > 0 && state.stack.size() > 0; 144 | } else if (action_type == "NEWNODE") { 145 | return state.buffer.size() > 0 && state.buffer.back().second > 1 && state.buffer.back().second <= 5; 146 | } else if (action_type == "LEFT" || action_type == "RIGHT") { 147 | if (state.stack.size() < 1 || state.stack.back().second < 2 || state.buffer.size() < 1 || state.buffer.back().second < 2) { 148 | return false; 149 | } 150 | unsigned u = state.stack.back().first; 151 | unsigned v = state.buffer.back().first; 152 | if (action_type == "LEFT") { 153 | std::swap(u, v); 154 | } 155 | std::vector terms; 156 | std::string a_str = action_map.get(action); 157 | boost::algorithm::split(terms, a_str, boost::is_any_of(" \t"), boost::token_compress_on); 158 | unsigned rid = rel_map.get(terms[1]); 159 | return state.existing_edges.find({ u, rid 
}) == state.existing_edges.end(); 160 | } else { 161 | BOOST_ASSERT_MSG(false, "Illegal Action"); 162 | } 163 | return true; 164 | } 165 |
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/eager.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_LEFT_TO_RIGHT_EAGER_H 2 | #define RLPARSER_LEFT_TO_RIGHT_EAGER_H 3 | 4 | #include "system.h" 5 | 6 | struct Eager : public TransitionSystem { 7 | unsigned n_actions; 8 | 9 | Eager(const Alphabet & action_map, 10 | const Alphabet & node_map, 11 | const Alphabet & rel_map, 12 | const Alphabet & entity_map); 13 | 14 | std::string name(unsigned id) const override; 15 | 16 | unsigned num_actions() const override; 17 | 18 | void perform_action(State& state, const unsigned& action) override; 19 | 20 | void get_valid_actions(const State& state, 21 | std::vector<unsigned>& valid_actions) override; 22 | 23 | bool is_valid_action(const State& state, const unsigned& act) const override; 24 | 25 | void shift_unsafe(State& state) const; 26 | 27 | void confirm_unsafe(State & state) const; 28 | 29 | void merge_unsafe(State& state) const; 30 | 31 | void entity_unsafe(State & state) const; 32 | 33 | void reduce_unsafe(State& state) const; 34 | 35 | void drop_unsafe(State& state) const; 36 | 37 | void cache_unsafe(State& state) const; 38 | 39 | void la_unsafe(State & state, const unsigned & rel) const; 40 | 41 | void ra_unsafe(State& state, const unsigned & rel) const; 42 | 43 | void newnode_unsafe(State& state, const unsigned & node) const; 44 | 45 | static std::string get_action_type(const unsigned& action, const Alphabet & action_map); 46 | 47 | }; 48 | 49 | #endif // end for RLPARSER_LEFT_TO_RIGHT_EAGER_H
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/state.cc: -------------------------------------------------------------------------------- 1 | #include "state.h" 2 | 3 | 4 | State::State(unsigned n) : num_nodes(0) { 5 | } 6 | 7 | unsigned State::new_amr_node() { 8 | return num_nodes++; 9 | } 10 | 11 | bool State::terminated() { 12 | return stack.empty() && buffer.empty(); 13 | } 14 |
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/state.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_LEFT_TO_RIGHT_STATE_H 2 | #define RLPARSER_LEFT_TO_RIGHT_STATE_H 3 | 4 | #include <set> 5 | #include <vector> 6 | 7 | struct State { 8 | static const unsigned MAX_N_WORDS = 1024; 9 | 10 | std::vector<std::pair<unsigned, unsigned>> stack; // each item is (word-or-node id, status flag); as used by the transition code: 0 = raw token, 1 = merged token, >= 2 = concept node 11 | std::vector<std::pair<unsigned, unsigned>> buffer; 12 | std::vector<std::pair<unsigned, unsigned>> deque; 13 | std::vector<unsigned> aux; 14 | 15 | std::set<std::vector<unsigned>> existing_edges; // {head node, relation id} keys, used to block duplicate (head, relation) edges 16 | 17 | unsigned num_nodes; 18 | 19 | State(unsigned n); 20 | 21 | unsigned new_amr_node(); 22 | 23 | bool terminated(); 24 | }; 25 | 26 | 27 | #endif // end for RLPARSER_LEFT_TO_RIGHT_STATE_H
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/swap.cc: -------------------------------------------------------------------------------- 1 | #include "swap.h" 2 | #include "logging.h" 3 | #include "corpus.h" 4 | #include <boost/algorithm/string.hpp> 5 | 6 | Swap::Swap(const Alphabet & action_map, const Alphabet & node_map, const Alphabet & rel_map, const Alphabet & entity_map) : 7 | TransitionSystem(action_map, node_map, rel_map, entity_map) { 8 | n_actions = action_map.size(); 9 | _INFO << "TransitionSystem:: show action names:"; 10 |
for (const auto& x : action_map.str_to_id) { 11 | _INFO << "- " << x.first; 12 | } 13 | } 14 | 15 | std::string Swap::name(unsigned id) const { 16 | BOOST_ASSERT_MSG(id < action_map.size(), "id in illegal range"); 17 | return action_map.get(id); 18 | } 19 | 20 | unsigned Swap::num_actions() const { return n_actions; } 21 | 22 | void Swap::perform_action(State & state, const unsigned & action) { 23 | std::string action_type = get_action_type(action, action_map); 24 | if (action_type == "SHIFT") { 25 | shift_unsafe(state); 26 | } else if (action_type == "CONFIRM") { 27 | confirm_unsafe(state); 28 | } else if (action_type == "REDUCE") { 29 | reduce_unsafe(state); 30 | } else if (action_type == "MERGE") { 31 | merge_unsafe(state); 32 | } else if (action_type == "ENTITY") { 33 | entity_unsafe(state); 34 | } else if (action_type == "NEWNODE") { 35 | unsigned nid = get_action_arg1(node_map, action); 36 | newnode_unsafe(state, nid); 37 | } else if (action_type == "SWAP") { 38 | swap_unsafe(state); 39 | } else if (action_type == "LEFT") { 40 | unsigned rid = get_action_arg1(rel_map, action); 41 | la_unsafe(state, rid); 42 | } else if (action_type == "RIGHT") { 43 | unsigned rid = get_action_arg1(rel_map, action); 44 | ra_unsafe(state, rid); 45 | } else { 46 | BOOST_ASSERT_MSG(false, "Illegal Action"); 47 | } 48 | } 49 | 50 | void Swap::get_valid_actions(const State & state, 51 | std::vector& valid_actions) { 52 | valid_actions.clear(); 53 | for (unsigned a = 0; a < n_actions; ++a) { 54 | //if (!is_valid_action(state, action_names[a])) { continue; } 55 | if (!is_valid_action(state, a)) { continue; } 56 | valid_actions.push_back(a); 57 | } 58 | BOOST_ASSERT_MSG(valid_actions.size() > 0, "There should be one or more valid action."); 59 | } 60 | 61 | void Swap::shift_unsafe(State & state) const { 62 | state.stack.push_back(state.buffer.back()); 63 | state.buffer.pop_back(); 64 | } 65 | 66 | void Swap::confirm_unsafe(State & state) const { 67 | state.stack[state.stack.size() - 1] = std::make_pair(state.new_amr_node(), 2); 68 | } 69 | 70 | void Swap::reduce_unsafe(State & state) const { 71 | state.stack.pop_back(); 72 | } 73 | 74 | void Swap::merge_unsafe(State & state) const { 75 | state.stack.pop_back(); 76 | state.stack[state.stack.size() - 1].second = 1; 77 | } 78 | 79 | void Swap::entity_unsafe(State & state) const { 80 | state.stack[state.stack.size() - 1] = std::make_pair(state.new_amr_node(), 2); 81 | } 82 | 83 | void Swap::newnode_unsafe(State & state, const unsigned & node) const { 84 | state.stack.push_back(std::make_pair(state.new_amr_node(), state.stack.back().second + 1)); 85 | state.stack[state.stack.size() - 2].second += 5; 86 | } 87 | 88 | void Swap::swap_unsafe(State & state) const { 89 | auto j = state.stack.back(); state.stack.pop_back(); 90 | auto i = state.stack.back(); state.stack.pop_back(); 91 | state.stack.push_back(j); 92 | state.buffer.push_back(i); 93 | } 94 | 95 | void Swap::la_unsafe(State & state, const unsigned & rel) const { 96 | unsigned u = state.stack[state.stack.size() - 2].first; 97 | unsigned v = state.stack.back().first; 98 | state.existing_edges.insert({ u, rel }); 99 | } 100 | 101 | void Swap::ra_unsafe(State& state, const unsigned & rel) const { 102 | unsigned u = state.stack.back().first; 103 | unsigned v = state.stack[state.stack.size() - 2].first; 104 | state.existing_edges.insert({ u, rel }); 105 | } 106 | 107 | 108 | std::string Swap::get_action_type(const unsigned & idx, const Alphabet & action_map) { 109 | std::string action = action_map.get(idx); 110 | 
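// Action names are stored as whitespace-separated strings ("TYPE" or "TYPE arg",
// cf. TransitionSystem::get_action_arg1), so the type is the first token after splitting.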
std::vector terms; 111 | boost::algorithm::split(terms, action, boost::is_any_of(" \t"), boost::token_compress_on); 112 | return terms[0]; 113 | } 114 | 115 | bool Swap::is_valid_action(const State& state, const unsigned& action) const { 116 | std::string action_type = get_action_type(action, action_map); 117 | if (action_type == "_UNK_") { 118 | return false; 119 | } else if (action_type == "SHIFT") { 120 | return state.buffer.size() > 0; 121 | } else if (action_type == "CONFIRM") { 122 | return state.stack.size() > 0 && state.stack.back().second == 0; 123 | } else if (action_type == "REDUCE") { 124 | return state.stack.size() > 0; 125 | } else if (action_type == "MERGE") { 126 | return state.stack.size() > 1 && state.stack.back().second < 2 && state.stack[state.stack.size() - 2].second == 0; 127 | } else if (action_type == "ENTITY") { 128 | return state.stack.size() > 0 && state.stack.back().second < 2; 129 | } else if (action_type == "NEWNODE") { 130 | return state.stack.size() > 0 && state.stack.back().second > 1 && state.stack.back().second <= 5; 131 | } else if (action_type == "SWAP") { 132 | return state.stack.size() > 1 && state.stack.back().second > 1 && state.stack[state.stack.size() - 2].second > 1; 133 | } else if (action_type == "LEFT" || action_type == "RIGHT") { 134 | if (state.stack.size() <= 1 || state.stack.back().second < 2 || state.stack[state.stack.size() - 2].second < 2) { 135 | return false; 136 | } 137 | unsigned u = state.stack.back().first; 138 | unsigned v = state.stack[state.stack.size() - 2].first; 139 | if (action_type == "LEFT") { 140 | std::swap(u, v); 141 | } 142 | std::vector terms; 143 | std::string a_str = action_map.get(action); 144 | boost::algorithm::split(terms, a_str, boost::is_any_of(" \t"), boost::token_compress_on); 145 | unsigned rid = rel_map.get(terms[1]); 146 | return state.existing_edges.find({ u, rid }) == state.existing_edges.end(); 147 | } else { 148 | BOOST_ASSERT_MSG(false, "Illegal Action"); 149 | } 150 | return true; 151 | } 152 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/swap.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_LEFT_TO_RIGHT_SWAP_H 2 | #define RLPARSER_LEFT_TO_RIGHT_SWAP_H 3 | 4 | #include "system.h" 5 | 6 | struct Swap : public TransitionSystem { 7 | unsigned n_actions; 8 | 9 | Swap(const Alphabet & action_map, const Alphabet & node_map, const Alphabet & rel_map, const Alphabet & entity_map); 10 | 11 | std::string name(unsigned id) const override; 12 | 13 | unsigned num_actions() const override; 14 | 15 | void perform_action(State& state, const unsigned& action) override; 16 | 17 | void get_valid_actions(const State& state, 18 | std::vector& valid_actions) override; 19 | 20 | bool is_valid_action(const State& state, const unsigned& act) const override; 21 | 22 | void shift_unsafe(State& state) const; 23 | void confirm_unsafe(State & state) const; 24 | void reduce_unsafe(State& state) const; 25 | void merge_unsafe(State& state) const; 26 | void entity_unsafe(State & state) const; 27 | void newnode_unsafe(State& state, const unsigned & node) const; 28 | void swap_unsafe(State& state) const; 29 | void la_unsafe(State & state, const unsigned & rel) const; 30 | void ra_unsafe(State& state, const unsigned & rel) const; 31 | 32 | static std::string get_action_type(const unsigned& action, const Alphabet & action_map); 33 | 34 | }; 35 | 36 | #endif // end for RLPARSER_LEFT_TO_RIGHT_SWAP_H 
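
For orientation, a transition system is driven by a loop like the sketch below (hypothetical setup: the Alphabets are assumed to be populated from a Corpus, and a real run scores the candidates with the parser instead of taking the first one):

    State state(len);
    // fill state.buffer right to left from the input, as Parser::initialize_state does
    Swap sys(action_map, node_map, rel_map, entity_map);
    std::vector<unsigned> valid;
    while (!state.terminated()) {
      sys.get_valid_actions(state, valid);   // legal subset of [0, num_actions())
      unsigned a = valid.front();            // placeholder policy; see Parser::get_best_action
      sys.perform_action(state, a);
    }
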
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/system.cc: -------------------------------------------------------------------------------- 1 | #include "system.h" 2 | #include "logging.h" 3 | #include 4 | #include 5 | 6 | 7 | unsigned TransitionSystem::get_action_arg1(const Alphabet & map, const unsigned &action) { 8 | std::vector terms; 9 | std::string a_str = action_map.get(action); 10 | boost::algorithm::split(terms, a_str, boost::is_any_of(" \t"), boost::token_compress_on); 11 | return map.get(terms[1]); 12 | } -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/system/system.h: -------------------------------------------------------------------------------- 1 | #ifndef RLPARSER_LEFT_TO_RIGHT_SYSTEM_H 2 | #define RLPARSER_LEFT_TO_RIGHT_SYSTEM_H 3 | 4 | #include 5 | #include "state.h" 6 | #include "corpus.h" 7 | 8 | struct TransitionSystem { 9 | enum REWARD { kLocal, kGlobal, kGlobalMaxout }; 10 | REWARD reward_type; 11 | 12 | Alphabet action_map; 13 | Alphabet node_map; 14 | Alphabet rel_map; 15 | Alphabet entity_map; 16 | 17 | TransitionSystem(const Alphabet & action_map, 18 | const Alphabet & node_map, 19 | const Alphabet & rel_map, 20 | const Alphabet & entity_map) : 21 | action_map(action_map), node_map(node_map), rel_map(rel_map), entity_map(entity_map) {} 22 | 23 | unsigned get_action_arg1(const Alphabet & map, const unsigned & action); 24 | 25 | virtual std::string name(unsigned id) const = 0; 26 | 27 | virtual unsigned num_actions() const = 0; 28 | 29 | virtual void perform_action(State& state, const unsigned& action) = 0; 30 | 31 | virtual bool is_valid_action(const State& state, const unsigned& act) const = 0; 32 | 33 | virtual void get_valid_actions(const State& state, std::vector& valid_actions) = 0; 34 | }; 35 | 36 | #endif // end for RLPARSER_LEFT_TO_RIGHT_SYSTEM_H 37 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories (${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/left_to_right/) 2 | 3 | add_library (parser_l2r_train 4 | train.cc 5 | train.h 6 | train_supervised.cc 7 | train_supervised.h 8 | algorithm.h) 9 | 10 | target_link_libraries (parser_l2r_train parser_l2r_parser) 11 | -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/algorithm.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAIN_ALGORITHM_H 2 | #define TRAIN_ALGORITHM_H 3 | 4 | #include "train/train_supervised.h" 5 | 6 | #endif // end for TRAIN_ALGORITHM -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/train.cc: -------------------------------------------------------------------------------- 1 | #include "train.h" 2 | #include "logging.h" 3 | #include "evaluate/evaluate.h" 4 | 5 | Trainer::Trainer(const po::variables_map & conf) { 6 | gamma = conf["gamma"].as(); 7 | _INFO << "RL:: gamma = " << gamma; 8 | 9 | lambda_ = conf["lambda"].as(); 10 | _INFO << "RL:: lambda = " << lambda_; 11 | } 12 | 13 | void Trainer::eval(const po::variables_map& conf, 14 | const std::string & output, 15 | const std::string & model_name, 16 | float & current_best, 17 | Corpus & corpus, 18 | Parser & parser, 19 | bool update_and_save) { 
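// Evaluate on the development set; when a new best is reached, save the model
// and also report the corresponding test-set score.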
20 | float f = evaluate(conf, corpus, parser, output, true); 21 | if (update_and_save && f > current_best) { 22 | current_best = f; 23 | dynet::save_dynet_model(model_name, (&(parser.model))); 24 | f = evaluate(conf, corpus, parser, output, false); 25 | _INFO << "Trainer:: new best record achieved " << current_best << ", test: " << f; 26 | } 27 | } 28 | 29 | dynet::Expression Trainer::l2(Parser & parser, unsigned n) { 30 | std::vector<dynet::Expression> reg; 31 | for (auto e : parser.get_params()) { reg.push_back(dynet::squared_norm(e)); } 32 | return (0.5 * n) * lambda_ * dynet::sum(reg); 33 | } 34 |
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/train.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAIN_H 2 | #define TRAIN_H 3 | 4 | #include <boost/program_options.hpp> 5 | #include "parser/parser.h" 6 | #include "corpus.h" 7 | namespace po = boost::program_options; 8 | 9 | struct Trainer { 10 | float gamma; 11 | float lambda_; 12 | 13 | Trainer(const po::variables_map& conf); 14 | 15 | void eval(const po::variables_map& conf, 16 | const std::string & output, 17 | const std::string & model_name, 18 | float & current_best, 19 | Corpus & corpus, 20 | Parser & parser, 21 | bool update_and_save = true); 22 | 23 | void eval(const po::variables_map& conf, 24 | const std::string & output, 25 | const std::string & model_name, 26 | float & current_best, 27 | Corpus & corpus, 28 | Parser & parser, 29 | Parser & parser2, 30 | bool update_and_save = true); 31 | 32 | dynet::Expression l2(Parser & parser, unsigned n); 33 | }; 34 | 35 | #endif // end for TRAIN_H
-------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/train_supervised.cc: -------------------------------------------------------------------------------- 1 | #include "trainer_utils.h" 2 | #include "train_supervised.h" 3 | #include "logging.h" 4 | #include "evaluate/evaluate.h" 5 | 6 | po::options_description SupervisedTrainer::get_options() { 7 | po::options_description cmd("Supervised options"); 8 | cmd.add_options() 9 | ("supervised_oracle", po::value<std::string>()->default_value("static"), "The type of oracle in supervised learning [static|dynamic|pseudo_dynamic].") 10 | ("supervised_objective", po::value<std::string>()->default_value("crossentropy"), "The learning objective [crossentropy|rank|bipartite_rank].") 11 | ("supervised_do_pretrain_iter", po::value<unsigned>()->default_value(1), "The number of pretrain iterations on the dynamic oracle.") 12 | ("supervised_do_explore_prob", po::value<float>()->default_value(0.9), "The probability of exploration.") 13 | ("supervised_pseudo_oracle_model", po::value<std::string>(), "The path to the pseudo dynamic oracle model; required in pseudo_dynamic mode.") 14 | ; 15 | return cmd; 16 | } 17 | 18 | SupervisedTrainer::SupervisedTrainer(const po::variables_map& conf, Parser * p) : 19 | Trainer(conf), 20 | parser(p), 21 | pseudo_dynamic_oracle(nullptr), 22 | pseudo_dynamic_oracle_model(nullptr) { 23 | if (conf["supervised_oracle"].as<std::string>() == "static") { 24 | oracle_type = kStatic; 25 | } else { 26 | _ERROR << "Unknown oracle: " << conf["supervised_oracle"].as<std::string>(); 27 | } 28 | 29 | if (conf["supervised_objective"].as<std::string>() == "crossentropy") { 30 | objective_type = kCrossEntropy; 31 | } else if (conf["supervised_objective"].as<std::string>() == "rank") { 32 | objective_type = kRank; 33 | } else { 34 | objective_type = kBipartieRank; 35 | } 36 | lambda_ = conf["lambda"].as<float>(); 37 | _INFO << "SUP:: learning objective " << conf["supervised_objective"].as<std::string>();
38 | 39 | system = conf["system"].as(); 40 | } 41 | 42 | void SupervisedTrainer::train(const po::variables_map& conf, 43 | Corpus& corpus, 44 | const std::string& name, 45 | const std::string& output) { 46 | dynet::ParameterCollection& model = parser->model; 47 | _INFO << "SUP:: start lstm-parser supervised training."; 48 | 49 | dynet::Trainer* trainer = get_trainer(conf, model); 50 | // unsigned kUNK = corpus.get_or_add_word(Corpus::UNK); 51 | unsigned max_iter = conf["max_iter"].as(); 52 | 53 | float llh = 0.f; 54 | float llh_in_batch = 0.f; 55 | float best_f = 0.f; 56 | 57 | std::vector order; 58 | get_orders(corpus, order); 59 | float n_train = order.size(); 60 | 61 | unsigned logc = 0; 62 | // unsigned unk_strategy = conf["unk_strategy"].as(); 63 | // float unk_prob = conf["unk_prob"].as(); 64 | unsigned report_stops = conf["report_stops"].as(); 65 | unsigned evaluate_stops = conf["evaluate_stops"].as(); 66 | unsigned evaluate_skips = conf["evaluate_skips"].as(); 67 | float eta0 = trainer->learning_rate; 68 | 69 | _INFO << "SUP:: will stop after " << max_iter << " iterations."; 70 | for (unsigned iter = 0; iter < max_iter; ++iter) { 71 | llh = 0; 72 | _INFO << "SUP:: start training iteration #" << iter << ", shuffled."; 73 | std::shuffle(order.begin(), order.end(), (*dynet::rndeng)); 74 | 75 | for (unsigned sid : order) { 76 | _TRACE << "sid=" << sid; 77 | InputUnits& input_units = corpus.training_inputs[sid]; 78 | const ActionUnits& parse_units = corpus.training_actions[sid]; 79 | //random_replace_singletons(unk_strategy, unk_prob, corpus.singleton, kUNK, input_units); 80 | 81 | float lp; 82 | 83 | lp = train_on_one_full_tree(input_units, parse_units, trainer, iter); 84 | 85 | llh += lp; 86 | llh_in_batch += lp; 87 | //restore_singletons(unk_strategy, input_units); 88 | 89 | ++logc; 90 | if (logc % report_stops == 0) { 91 | float epoch = (float(logc) / n_train); 92 | _INFO << "SUP:: iter #" << iter << " (epoch " << epoch << ") loss " << llh_in_batch; 93 | llh_in_batch = 0.f; 94 | } 95 | if (iter >= evaluate_skips && logc % evaluate_stops == 0) { 96 | eval(conf, output, name, best_f, corpus, *parser); 97 | } 98 | } 99 | 100 | _INFO << "SUP:: end of iter #" << iter << " loss " << llh; 101 | eval(conf, output, name, best_f, corpus, *parser); 102 | 103 | update_trainer(conf, eta0, float(iter), trainer); 104 | trainer->status(); 105 | } 106 | 107 | delete trainer; 108 | } 109 | 110 | float SupervisedTrainer::train_on_one_full_tree(const InputUnits& input_units, 111 | const ActionUnits& action_units, 112 | dynet::Trainer* trainer, 113 | unsigned iter) { 114 | dynet::ComputationGraph cg; 115 | parser->activate_training(); 116 | parser->new_graph(cg); 117 | 118 | std::vector loss; 119 | 120 | unsigned len = input_units.size(); 121 | //for (int i = 0; i < len; i++) { 122 | // std::cerr << input_units[i].w_str << " "; 123 | //} 124 | //std::cerr << std::endl; 125 | State state(len); 126 | parser->initialize(cg, input_units, state); 127 | 128 | unsigned illegal_action = parser->sys.num_actions(); 129 | unsigned n_actions = 0; 130 | while (!state.terminated()) { 131 | // collect all valid actions. 
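// With the static oracle the gold action is read directly from action_units; the
// model's scores are still computed so the chosen objective can turn them into a
// loss. Note that worst_gold_action is initialized but never reassigned below, so
// the bipartite-rank branch never contributes a loss term here (with a single
// gold action per step it would coincide with best_gold_action anyway).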
132 | std::vector valid_actions; 133 | parser->sys.get_valid_actions(state, valid_actions); 134 | 135 | dynet::Expression score_exprs = parser->get_scores(); 136 | std::vector scores = dynet::as_vector(cg.get_value(score_exprs)); 137 | unsigned action = 0; 138 | 139 | unsigned best_gold_action = illegal_action; 140 | unsigned worst_gold_action = illegal_action; 141 | unsigned best_non_gold_action = illegal_action; 142 | 143 | best_gold_action = action_units[n_actions].aid; 144 | //std::cerr << action_units[n_actions].a_str << std::endl; 145 | action = action_units[n_actions].aid; 146 | 147 | if (objective_type == kRank || objective_type == kBipartieRank) { 148 | float best_non_gold_action_score = -1e10; 149 | for (unsigned i = 0; i < valid_actions.size(); ++i) { 150 | unsigned act = valid_actions[i]; 151 | if (act != best_gold_action && (scores[act] > best_non_gold_action_score)) { 152 | best_non_gold_action = act; 153 | best_non_gold_action_score = scores[act]; 154 | } 155 | } 156 | } 157 | 158 | if (objective_type == kCrossEntropy) { 159 | loss.push_back(dynet::pickneglogsoftmax(score_exprs, best_gold_action)); 160 | } else if (objective_type == kRank) { 161 | if (best_gold_action != illegal_action && best_non_gold_action != illegal_action) { 162 | loss.push_back(dynet::pairwise_rank_loss( 163 | dynet::pick(score_exprs, best_gold_action), 164 | dynet::pick(score_exprs, best_non_gold_action) 165 | )); 166 | } 167 | } else { 168 | if (worst_gold_action != illegal_action && best_non_gold_action != illegal_action) { 169 | loss.push_back(dynet::pairwise_rank_loss( 170 | dynet::pick(score_exprs, worst_gold_action), 171 | dynet::pick(score_exprs, best_non_gold_action) 172 | )); 173 | } 174 | } 175 | 176 | //CONFIRM 177 | if (action == 0 && best_gold_action == 0) { 178 | dynet::Expression confirm_scores_expr; 179 | if (system == "eager") { 180 | confirm_scores_expr = parser->get_confirm_values(state.buffer.back().first); 181 | } else if (system == "swap") { 182 | confirm_scores_expr = parser->get_confirm_values(state.stack.back().first); 183 | } else { 184 | BOOST_ASSERT_MSG(false, "Illegal System"); 185 | } 186 | //std::cerr << confirm_scores_expr.dim()[0] << " " << confirm_scores_expr.dim()[1] << std::endl; 187 | //std::cerr << "~" << action_units[n_actions].idx << " " << state.stack.back().first << " " << state.stack.back().second << std::endl; 188 | //std::cerr << action_units[n_actions].idx << std::endl; 189 | loss.push_back(dynet::pickneglogsoftmax(confirm_scores_expr, action_units[n_actions].idx)); 190 | //std::cerr << action_units[n_actions].idx << std::endl; 191 | } 192 | 193 | parser->perform_action(action, cg, state); 194 | n_actions++; 195 | } 196 | float ret = 0.f; 197 | if (loss.size() > 0) { 198 | std::vector all_params = parser->get_params(); 199 | std::vector reg; 200 | for (auto e : all_params) { reg.push_back(dynet::squared_norm(e)); } 201 | dynet::Expression l = dynet::sum(loss) + 0.5 * loss.size() * lambda_ * dynet::sum(reg); 202 | ret = dynet::as_scalar(cg.incremental_forward(l)); 203 | cg.backward(l); 204 | trainer->update(); 205 | } 206 | return ret; 207 | } -------------------------------------------------------------------------------- /amr_parser/src/left_to_right/train/train_supervised.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAIN_SUPERVISED_H 2 | #define TRAIN_SUPERVISED_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "parser/parser.h" 8 | #include "dynet/training.h" 9 | #include "train.h" 10 | 11 | 
namespace po = boost::program_options; 12 | 13 | struct SupervisedTrainer : public Trainer { 14 | enum ORACLE_TYPE { kStatic, kDynamic, kPseudoDynamic }; 15 | enum OBJECTIVE_TYPE { kCrossEntropy, kRank, kBipartieRank }; 16 | ORACLE_TYPE oracle_type; 17 | OBJECTIVE_TYPE objective_type; 18 | Parser* parser; 19 | Parser* pseudo_dynamic_oracle; 20 | dynet::Model* pseudo_dynamic_oracle_model; 21 | float do_pretrain_iter; 22 | float do_explore_prob; 23 | std::string system; 24 | 25 | 26 | static po::options_description get_options(); 27 | 28 | SupervisedTrainer(const po::variables_map& conf, Parser* parser); 29 | 30 | /* Code for supervised pretraining. */ 31 | void train(const po::variables_map& conf, 32 | Corpus& corpus, 33 | const std::string& name, 34 | const std::string& output); 35 | 36 | float train_on_one_full_tree(const InputUnits& input_units, 37 | const ActionUnits& action_units, 38 | dynet::Trainer* trainer, 39 | unsigned iter); 40 | }; 41 | 42 | #endif // end for TRAIN_SUPERVISED_H -------------------------------------------------------------------------------- /amr_parser/src/logging.cc: -------------------------------------------------------------------------------- 1 | #include "logging.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | void init_boost_log(bool verbose) { 11 | namespace logging = boost::log; 12 | namespace src = boost::log::sources; 13 | namespace expr = boost::log::expressions; 14 | namespace keywords = boost::log::keywords; 15 | 16 | logging::add_console_log( 17 | std::clog, 18 | keywords::format = ( 19 | expr::stream 20 | << expr::format_date_time< boost::posix_time::ptime >( 21 | "TimeStamp", 22 | "%Y-%m-%d %H:%M:%S") 23 | << " [" << logging::trivial::severity << "] " 24 | << expr::smessage 25 | ) 26 | ); 27 | 28 | if (verbose) { 29 | logging::core::get()->set_filter(logging::trivial::severity >= logging::trivial::trace); 30 | } else { 31 | logging::core::get()->set_filter(logging::trivial::severity > logging::trivial::trace); 32 | } 33 | 34 | logging::add_common_attributes(); 35 | } 36 | -------------------------------------------------------------------------------- /amr_parser/src/logging.h: -------------------------------------------------------------------------------- 1 | #ifndef LOGGING_UTILS_H 2 | #define LOGGING_UTILS_H 3 | 4 | #include 5 | #define _TRACE BOOST_LOG_TRIVIAL(trace) 6 | #define _DEBUG BOOST_LOG_TRIVIAL(debug) 7 | #define _INFO BOOST_LOG_TRIVIAL(info) 8 | #define _WARN BOOST_LOG_TRIVIAL(warning) 9 | #define _ERROR BOOST_LOG_TRIVIAL(error) 10 | 11 | 12 | void init_boost_log(bool verbose); 13 | 14 | 15 | #endif // end for LOGGING_UTILS_H 16 | -------------------------------------------------------------------------------- /amr_parser/src/lstm.cc: -------------------------------------------------------------------------------- 1 | #include "lstm.h" 2 | 3 | enum { X2I, H2I, C2I, BI, X2O, H2O, C2O, BO, X2C, H2C, BC }; 4 | 5 | LSTMBuilder::LSTMBuilder(unsigned layers, 6 | unsigned input_dim, 7 | unsigned hidden_dim, 8 | dynet::ParameterCollection& model, 9 | bool trainable) : 10 | dynet::CoupledLSTMBuilder(layers, input_dim, hidden_dim, model), 11 | trainable(trainable) { 12 | } 13 | 14 | void LSTMBuilder::new_graph(dynet::ComputationGraph& cg) { 15 | if (trainable) { 16 | dynet::CoupledLSTMBuilder::new_graph(cg); 17 | } else { 18 | // cannot call sm.transition directly. this will waste some nodes 19 | // in computation graph. 
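// Workaround: let the base class build the graph as usual, then overwrite
// param_vars with dynet::const_parameter copies so no gradient flows into the
// LSTM weights; the X2I..BC order below must match CoupledLSTMBuilder's layout.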
20 | dynet::CoupledLSTMBuilder::new_graph(cg); 21 | param_vars.clear(); 22 | for (unsigned i = 0; i < layers; ++i) { 23 | auto& p = params[i]; 24 | 25 | //i 26 | dynet::Expression i_x2i = dynet::const_parameter(cg, p[X2I]); 27 | dynet::Expression i_h2i = dynet::const_parameter(cg, p[H2I]); 28 | dynet::Expression i_c2i = dynet::const_parameter(cg, p[C2I]); 29 | dynet::Expression i_bi = dynet::const_parameter(cg, p[BI]); 30 | //o 31 | dynet::Expression i_x2o = dynet::const_parameter(cg, p[X2O]); 32 | dynet::Expression i_h2o = dynet::const_parameter(cg, p[H2O]); 33 | dynet::Expression i_c2o = dynet::const_parameter(cg, p[C2O]); 34 | dynet::Expression i_bo = dynet::const_parameter(cg, p[BO]); 35 | //c 36 | dynet::Expression i_x2c = dynet::const_parameter(cg, p[X2C]); 37 | dynet::Expression i_h2c = dynet::const_parameter(cg, p[H2C]); 38 | dynet::Expression i_bc = dynet::const_parameter(cg, p[BC]); 39 | 40 | std::vector vars = { 41 | i_x2i, i_h2i, i_c2i, i_bi, 42 | i_x2o, i_h2o, i_c2o, i_bo, 43 | i_x2c, i_h2c, i_bc 44 | }; 45 | param_vars.push_back(vars); 46 | } // layers 47 | } 48 | } 49 | 50 | 51 | BiLSTMBuilder::BiLSTMBuilder(unsigned layers, 52 | unsigned input_dim, 53 | unsigned hidden_dim, 54 | dynet::ParameterCollection& model, 55 | bool trainable): 56 | trainable(trainable), 57 | fw_lstm(layers, input_dim, hidden_dim, model, trainable), 58 | bw_lstm(layers, input_dim, hidden_dim, model, trainable), 59 | p_fw_guard(model.add_parameters({ input_dim, 1 })), 60 | p_bw_guard(model.add_parameters({ input_dim, 1 })) { 61 | } 62 | 63 | void BiLSTMBuilder::new_graph(dynet::ComputationGraph &cg) { 64 | fw_lstm.new_graph(cg); 65 | bw_lstm.new_graph(cg); 66 | if (trainable) { 67 | fw_guard = dynet::parameter(cg, p_fw_guard); 68 | bw_guard = dynet::parameter(cg, p_bw_guard); 69 | } 70 | else { 71 | fw_guard = dynet::const_parameter(cg, p_fw_guard); 72 | bw_guard = dynet::const_parameter(cg, p_bw_guard); 73 | } 74 | } 75 | 76 | dynet::Expression BiLSTMBuilder::get_h(SymbolEmbedding &char_emb, const std::vector & c_id) { 77 | fw_lstm.start_new_sequence(); 78 | bw_lstm.start_new_sequence(); 79 | fw_lstm.add_input(fw_guard); 80 | bw_lstm.add_input(bw_guard); 81 | 82 | std::vector inputs(c_id.size()); 83 | for (int i = 0; i < c_id.size(); i++) { 84 | inputs[i] = char_emb.embed(c_id[i]); 85 | } 86 | for (int i = 0; i < inputs.size(); i++) { 87 | fw_lstm.add_input(inputs[i]); 88 | bw_lstm.add_input(inputs[inputs.size() - i - 1]); 89 | } 90 | return dynet::concatenate({ fw_lstm.get_h(inputs.size()).back(), bw_lstm.get_h(inputs.size()).back() }); 91 | } 92 | 93 | -------------------------------------------------------------------------------- /amr_parser/src/lstm.h: -------------------------------------------------------------------------------- 1 | #ifndef LSTM_CONST_NEW_GRAPH_H 2 | #define LSTM_CONST_NEW_GRAPH_H 3 | 4 | #include "dynet/lstm.h" 5 | #include "dynet/expr.h" 6 | #include "dynet/model.h" 7 | #include "dynet_layer/layer.h" 8 | #include "ds.h" 9 | 10 | struct LSTMBuilder : public dynet::CoupledLSTMBuilder { 11 | bool trainable; 12 | explicit LSTMBuilder(unsigned layers, 13 | unsigned input_dim, 14 | unsigned hidden_dim, 15 | dynet::ParameterCollection& model, 16 | bool trainable=true); 17 | void active_training() { trainable = true; } 18 | void inactive_training() { trainable = false; } 19 | void new_graph(dynet::ComputationGraph& cg); 20 | }; 21 | 22 | struct BiLSTMBuilder { 23 | bool trainable; 24 | LSTMBuilder fw_lstm; 25 | LSTMBuilder bw_lstm; 26 | dynet::Parameter p_fw_guard; 27 | 
dynet::Parameter p_bw_guard; 28 | 29 | dynet::Expression fw_guard; 30 | dynet::Expression bw_guard; 31 | BiLSTMBuilder(unsigned layers, 32 | unsigned input_dim, 33 | unsigned hidden_dim, 34 | dynet::ParameterCollection& model, 35 | bool trainable = true); 36 | 37 | void active_training() { fw_lstm.active_training(); bw_lstm.active_training(); } 38 | void inactive_training() { fw_lstm.inactive_training(); bw_lstm.inactive_training(); } 39 | void new_graph(dynet::ComputationGraph &cg); 40 | dynet::Expression get_h(SymbolEmbedding &char_emb, const std::vector<unsigned>& c_id); 41 | 42 | }; 43 | 44 | 45 | 46 | #endif // end for LSTM_CONST_NEW_GRAPH_H -------------------------------------------------------------------------------- /amr_parser/src/math_utils.cc: -------------------------------------------------------------------------------- 1 | #include "math_utils.h" 2 | #include <cmath> 3 | #include <boost/assert.hpp> 4 | void MeanStdevStreamer::clear() { n = 0; } 5 | 6 | void MeanStdevStreamer::push(double x) { 7 | n++; 8 | if (n == 1) { 9 | old_m = new_m = x; 10 | old_s = 0.; 11 | } else { 12 | new_m = old_m + (x - old_m) / n; 13 | new_s = old_s + (x - old_m) * (x - new_m); 14 | 15 | old_m = new_m; 16 | old_s = new_s; 17 | } 18 | } 19 | 20 | int MeanStdevStreamer::num_data_values() const { return n; } 21 | double MeanStdevStreamer::mean() const { return ((n > 0) ? new_m : 0.0); } 22 | double MeanStdevStreamer::variance() const { return ((n > 1) ? new_s / (n - 1) : 0.0); } 23 | double MeanStdevStreamer::stdev() const { return sqrt(variance()); } 24 | 25 | void mean_and_stddev(const std::deque<float>& data, 26 | float& mean, float& stddev) { 27 | float n = 0.; 28 | float sum1 = 0., sum2 = 0.; 29 | for (auto x : data) { sum1 += x; n += 1.; } 30 | mean = sum1 / n; 31 | for (auto x : data) { sum2 += (x - mean) * (x - mean); } 32 | stddev = sqrt(sum2 / (n - 1)); 33 | } 34 | 35 | void softmax_copy(const std::vector<float>& input, std::vector<float>& output) { 36 | BOOST_ASSERT_MSG(input.size() > 0, "input should have one or more element."); 37 | float m = input[0]; 38 | output.resize(input.size()); 39 | for (unsigned i = 1; i < input.size(); ++i) { m = (input[i] > m ? input[i] : m); } 40 | float s = 0.; 41 | for (unsigned i = 0; i < input.size(); ++i) { 42 | output[i] = exp(input[i] - m); 43 | s += output[i]; 44 | } 45 | for (unsigned i = 0; i < output.size(); ++i) { output[i] /= s; } 46 | } 47 | 48 | void softmax_inplace(std::vector<float>& x) { 49 | BOOST_ASSERT_MSG(x.size() > 0, "input should have one or more element."); 50 | float m = x[0]; 51 | for (const float& _x : x) { m = (_x > m ? _x : m); } 52 | float s = 0.; 53 | for (unsigned i = 0; i < x.size(); ++i) { 54 | x[i] = exp(x[i] - m); 55 | s += x[i]; 56 | } 57 | for (unsigned i = 0; i < x.size(); ++i) { x[i] /= s; } 58 | } 59 | 60 | void softmax_inplace_on_valid_indicies(std::vector<float>& x, 61 | const std::vector<unsigned>& valid_indices) { 62 | BOOST_ASSERT_MSG(x.size() > 0, "input should have one or more element."); 63 | BOOST_ASSERT_MSG(valid_indices.size() > 0, "input should have one or more indices."); 64 | float m = x[valid_indices[0]]; 65 | for (unsigned id : valid_indices) { m = (x[id] > m ? x[id] : m); } 66 | float s = 0.; 67 | for (unsigned id : valid_indices) { 68 | x[id] = exp(x[id] - m); 69 | s += x[id]; 70 | } 71 | for (unsigned id : valid_indices) { x[id] /= s; } 72 | } 73 | 74 | void unnormalized_softmax_inplace(std::vector<float>& x) { 75 | BOOST_ASSERT_MSG(x.size() > 0, "input should have one or more element."); 76 | float m = x[0]; 77 | for (const float& _x : x) { m = (_x > m ?
_x : m); } 78 | for (unsigned i = 0; i < x.size(); ++i) { x[i] = exp(x[i] - m); } 79 | } 80 | 81 | std::vector<unsigned> fisher_yates_shuffle(unsigned size, 82 | unsigned max_size, 83 | std::mt19937& gen) { 84 | BOOST_ASSERT(size < max_size); 85 | std::vector<unsigned> b(size); 86 | 87 | for (unsigned i = 0; i < max_size; ++i) { 88 | std::uniform_int_distribution<> dis(0, i); 89 | unsigned j = dis(gen); 90 | if (j < b.size()) { 91 | if (i < b.size()) { // was `i < j`, which never holds since j <= i 92 | b[i] = b[j]; 93 | } 94 | b[j] = i; 95 | } 96 | } 97 | return b; 98 | } 99 | 100 | unsigned distribution_sample(const std::vector<float>& prob, 101 | std::mt19937& gen) { 102 | // http://en.cppreference.com/w/cpp/numeric/random/discrete_distribution 103 | // std::discrete_distribution produces random integers on the interval [0, n) 104 | // std::discrete_distribution<> d({40, 10, 10, 40}); 105 | std::discrete_distribution<unsigned> distrib(prob.begin(), prob.end()); 106 | return distrib(gen); 107 | } 108 | 109 | void reservoir_sample_n(const std::vector<unsigned>& S, unsigned N, 110 | std::vector<unsigned>& R, unsigned K, 111 | std::mt19937& gen) { 112 | for (unsigned i = 0; i < K; ++i) { R[i] = S[i]; } 113 | for (unsigned i = K; i < N; ++i) { 114 | std::uniform_int_distribution<> dis(0, i - 1); 115 | unsigned j = dis(gen); 116 | if (j < K) { R[j] = S[i]; } 117 | } 118 | } 119 | 120 | void fast_reservoir_sample_n(const std::vector<unsigned>& S, unsigned N, 121 | std::vector<unsigned>& R, unsigned K, 122 | std::mt19937& gen) { 123 | for (unsigned i = 0; i < K; ++i) { R[i] = S[i]; } 124 | unsigned t = 4 * K; 125 | unsigned j = 1 + K; 126 | while (j < N && j <= t) { 127 | std::uniform_int_distribution<> dis(0, j - 1); 128 | unsigned k = dis(gen); 129 | if (k < K) { R[k] = S[j]; } 130 | j++; 131 | } 132 | while (j < N) { 133 | float p = static_cast<float>(K) / j; 134 | std::uniform_real_distribution<> dis(0, 1); 135 | float u = dis(gen); 136 | unsigned g = static_cast<unsigned>(floor(log(u) / log(1 - p))); 137 | j = j + g; 138 | if (j < N) { 139 | std::uniform_int_distribution<> dis(0, K - 1); 140 | unsigned k = dis(gen); 141 | if (k < K) { R[k] = S[j]; } 142 | } 143 | j++; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /amr_parser/src/math_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef MATH_UTILS_H 2 | #define MATH_UTILS_H 3 | 4 | #include <vector> 5 | #include <deque> 6 | #include <random> 7 | 8 | struct MeanStdevStreamer { 9 | int n; 10 | double old_m, new_m, old_s, new_s; 11 | 12 | void clear(); 13 | void push(double x); 14 | int num_data_values() const; 15 | double mean() const; 16 | double variance() const; 17 | double stdev() const; 18 | }; 19 | 20 | void mean_and_stddev(const std::deque<float>& data, 21 | float& mean, 22 | float& stddev); 23 | 24 | void softmax_copy(const std::vector<float>& input, 25 | std::vector<float>& out); 26 | 27 | void softmax_inplace(std::vector<float>& x); 28 | 29 | void softmax_inplace_on_valid_indicies(std::vector<float>& x, 30 | const std::vector<unsigned>& valid_indices); 31 | 32 | void unnormalized_softmax_inplace(std::vector<float>& x); 33 | 34 | // Shuffle 35 | std::vector<unsigned> fisher_yates_shuffle(unsigned size, 36 | unsigned max_size, 37 | std::mt19937& gen); 38 | 39 | // Sample one 40 | unsigned distribution_sample(const std::vector<float>& prob, std::mt19937& gen); 41 | 42 | // Sample n 43 | void reservoir_sample_n(const std::vector<unsigned>& S, unsigned N, 44 | std::vector<unsigned>& R, unsigned K, 45 | std::mt19937& gen); 46 | 47 | void fast_reservoir_sample_n(const std::vector<unsigned>& S, unsigned N, 48 | std::vector<unsigned>& R, unsigned K, 49 | std::mt19937& gen); 50 | 51 | #endif // end for MATH_UTILS_H 52 |
-------------------------------------------------------------------------------- /amr_parser/src/sys_utils.cc: -------------------------------------------------------------------------------- 1 | #include "sys_utils.h" 2 | #include "logging.h" 3 | #include <cstdio> 4 | #include <sstream> 5 | #include <boost/algorithm/string.hpp> 6 | #include <boost/lexical_cast.hpp> 7 | #if _MSC_VER 8 | #include <process.h> 9 | #else 10 | #include <unistd.h> 11 | #endif 12 | int portable_getpid() { 13 | #ifdef _MSC_VER 14 | return _getpid(); 15 | #else 16 | return getpid(); 17 | #endif 18 | } 19 | 20 | float execute_and_get_result(const std::string& cmd) { 21 | _TRACE << "Running: " << cmd; 22 | // note: popen below already executes the command; an extra system() call here ran it twice. 23 | 24 | #ifndef _MSC_VER 25 | FILE* pipe = popen(cmd.c_str(), "r"); 26 | #else 27 | FILE* pipe = _popen(cmd.c_str(), "r"); 28 | #endif 29 | if (!pipe) { 30 | return 0.f; 31 | } 32 | char buffer[128]; 33 | std::string result = ""; 34 | while (!feof(pipe)) { 35 | if (fgets(buffer, 128, pipe) != NULL) { result += buffer; } 36 | } 37 | #ifndef _MSC_VER 38 | pclose(pipe); 39 | #else 40 | _pclose(pipe); 41 | #endif 42 | 43 | std::stringstream S(result); 44 | std::string token; 45 | while (S >> token) { 46 | boost::algorithm::trim(token); 47 | return boost::lexical_cast<float>(token); 48 | } 49 | return 0.f; 50 | } 51 | -------------------------------------------------------------------------------- /amr_parser/src/sys_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef SYS_UTILS_H 2 | #define SYS_UTILS_H 3 | 4 | #include <string> 5 | 6 | int portable_getpid(); 7 | 8 | float execute_and_get_result(const std::string& cmd); 9 | 10 | #endif // end for SYS_UTILS_H -------------------------------------------------------------------------------- /amr_parser/src/trainer_utils.cc: -------------------------------------------------------------------------------- 1 | #include "trainer_utils.h" 2 | #include "sys_utils.h" 3 | #include "logging.h" 4 | #include <sstream> 5 | #include "dynet/globals.h" 6 | 7 | void random_replace_singletons(const unsigned & unk_strategy, 8 | const float & unk_prob, 9 | const std::set<unsigned>& singletons, 10 | const unsigned& kUNK, 11 | InputUnits & input_units) { 12 | if (unk_strategy != 1) { return; } 13 | for (auto& u : input_units) { 14 | if (singletons.count(u.wid) && dynet::rand01() < unk_prob) { u.wid = kUNK; } 15 | } 16 | } 17 | 18 | void restore_singletons(const unsigned & unk_strategy, 19 | InputUnits & input_units) { 20 | if (unk_strategy != 1) { return; } 21 | for (auto& u : input_units) { u.wid = u.aux_wid; } 22 | } 23 | 24 | void get_orders(Corpus& corpus, 25 | std::vector<unsigned>& order) { 26 | order.clear(); 27 | for (unsigned i = 0; i < corpus.training_inputs.size(); ++i) { 28 | order.push_back(i); 29 | } 30 | } 31 | 32 | std::string get_model_name(const po::variables_map& conf, 33 | const std::string& prefix) { 34 | std::ostringstream os; 35 | os << prefix << "."
<< portable_getpid(); 36 | return os.str(); 37 | } 38 | 39 | po::options_description get_optimizer_options() { 40 | po::options_description cmd("Optimizer options"); 41 | cmd.add_options() 42 | ("optimizer", po::value<std::string>()->default_value("simple_sgd"), "The choice of optimizer [simple_sgd, momentum_sgd, adagrad, adadelta, rmsprop, adam].") 43 | ("optimizer_eta", po::value<float>(), "The initial value of the learning rate (eta).") 44 | ("optimizer_final_eta", po::value<float>()->default_value(0.f), "The final value of eta.") 45 | ("optimizer_enable_eta_decay", po::value<bool>()->required(), "Specify to update eta at the end of each epoch.") 46 | ("optimizer_eta_decay", po::value<float>(), "The decay rate of eta.") 47 | ("optimizer_enable_clipping", po::value<bool>()->required(), "Enable gradient clipping.") 48 | ("optimizer_adam_beta1", po::value<float>()->default_value(0.9f), "The beta1 hyper-parameter of adam.") 49 | ("optimizer_adam_beta2", po::value<float>()->default_value(0.999f), "The beta2 hyper-parameter of adam.") 50 | ("optimizer_rmsprop_rho", po::value<float>()->default_value(0.99f), "The rho hyper-parameter of rmsprop.") 51 | ; 52 | 53 | return cmd; 54 | } 55 | 56 | dynet::Trainer* get_trainer(const po::variables_map& conf, dynet::ParameterCollection& model) { 57 | dynet::Trainer* trainer = nullptr; 58 | if (!conf.count("optimizer") || conf["optimizer"].as<std::string>() == "simple_sgd") { 59 | float eta0 = (conf.count("optimizer_eta") ? conf["optimizer_eta"].as<float>() : 0.1f); 60 | trainer = new dynet::SimpleSGDTrainer(model, eta0); 61 | // trainer->eta_decay = 0.08f; 62 | } else if (conf["optimizer"].as<std::string>() == "momentum_sgd") { 63 | trainer = new dynet::MomentumSGDTrainer(model); 64 | // trainer->eta_decay = 0.08f; 65 | } else if (conf["optimizer"].as<std::string>() == "adagrad") { 66 | trainer = new dynet::AdagradTrainer(model); 67 | } else if (conf["optimizer"].as<std::string>() == "adadelta") { 68 | trainer = new dynet::AdadeltaTrainer(model); 69 | } else if (conf["optimizer"].as<std::string>() == "rmsprop") { 70 | float eta0 = (conf.count("optimizer_eta") ? conf["optimizer_eta"].as<float>() : 0.001f); 71 | float rho = (conf.count("optimizer_rmsprop_rho") ? conf["optimizer_rmsprop_rho"].as<float>() : 0.99f); 72 | trainer = new dynet::RMSPropTrainer(model, eta0, 1e-8, rho); 73 | } else if (conf["optimizer"].as<std::string>() == "adam") { 74 | // the default setting is the same as in Kingma and Ba (2015). 75 | float eta0 = (conf.count("optimizer_eta") ?
conf["optimizer_eta"].as<float>() : 0.001f); 76 | float beta1 = conf["optimizer_adam_beta1"].as<float>(); 77 | float beta2 = conf["optimizer_adam_beta2"].as<float>(); 78 | trainer = new dynet::AdamTrainer(model, eta0, beta1, beta2); 79 | } else { 80 | _ERROR << "Trainer:: unknown optimizer: " << conf["optimizer"].as<std::string>(); 81 | exit(1); 82 | } 83 | _INFO << "Trainer:: using " << conf["optimizer"].as<std::string>() << " optimizer"; 84 | _INFO << "Trainer:: eta = " << trainer->learning_rate; 85 | 86 | if (conf["optimizer_enable_clipping"].as<bool>()) { 87 | trainer->clipping_enabled = true; 88 | _INFO << "Trainer:: gradient clipping = enabled"; 89 | } else { 90 | trainer->clipping_enabled = false; 91 | _INFO << "Trainer:: gradient clipping = disabled"; 92 | } 93 | 94 | if (conf["optimizer_enable_eta_decay"].as<bool>()) { 95 | _INFO << "Trainer:: eta decay = enabled"; 96 | if (conf.count("optimizer_eta_decay")) { 97 | // trainer->eta_decay = conf["optimizer_eta_decay"].as<float>(); 98 | _INFO << "Trainer:: eta decay rate = " << conf["optimizer_eta_decay"].as<float>(); 99 | } else { 100 | _INFO << "Trainer:: eta decay rate not set, use default = " << 0.08f; 101 | } 102 | } else { 103 | _INFO << "Trainer:: eta decay = disabled"; 104 | } 105 | return trainer; 106 | } 107 | 108 | void update_trainer(const po::variables_map& conf, const float & eta0, const float & iter, dynet::Trainer* trainer) { 109 | if (conf.count("optimizer_enable_eta_decay")) { 110 | float final_eta = conf["optimizer_final_eta"].as<float>(); 111 | float eta_decay = (conf.count("optimizer_eta_decay") ? conf["optimizer_eta_decay"].as<float>() : 0.08f); 112 | if (trainer->learning_rate > final_eta) { 113 | // trainer->update_epoch(); 114 | // trainer->status(); 115 | trainer->learning_rate = eta0 / (1.f + eta_decay * iter); 116 | _INFO << "Trainer:: trainer updated."; 117 | } else { 118 | trainer->learning_rate = final_eta; 119 | _INFO << "Trainer:: eta reached the final value " << final_eta; 120 | } 121 | } 122 | } 123 | 124 | 125 | -------------------------------------------------------------------------------- /amr_parser/src/trainer_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAIN_UTILS_H 2 | #define TRAIN_UTILS_H 3 | 4 | #include <set> 5 | #include <vector> 6 | #include <boost/program_options.hpp> 7 | #include "corpus.h" 8 | #include "dynet/model.h" 9 | #include "dynet/training.h" 10 | 11 | namespace po = boost::program_options; 12 | 13 | void random_replace_singletons(const unsigned& unk_strategy, 14 | const float& unk_prob, 15 | const std::set<unsigned>& singletons, 16 | const unsigned& kUNK, 17 | InputUnits& units); 18 | 19 | void restore_singletons(const unsigned& unk_strategy, 20 | InputUnits& units); 21 | 22 | void get_orders(Corpus& corpus, 23 | std::vector<unsigned>& order); 24 | 25 | po::options_description get_optimizer_options(); 26 | 27 | dynet::Trainer* get_trainer(const po::variables_map& conf, 28 | dynet::ParameterCollection& model); 29 | 30 | void update_trainer(const po::variables_map& conf, 31 | const float & eta0, 32 | const float & iter, 33 | dynet::Trainer* trainer); 34 | 35 | std::string get_model_name(const po::variables_map& conf, 36 | const std::string& prefix); 37 | 38 | #endif // end for TRAIN_UTILS_H -------------------------------------------------------------------------------- /awesome.md: -------------------------------------------------------------------------------- 1 | Awesome AMR Parsers 2 | =================== 3 | 4 | As you may know, there are several open-source AMR parsers and our 5 | aligner improves these parsers.
I would like to share 6 | some experiences with how to plug our alignments into the existing 7 | AMR parsers, although running most of these parsers requires 8 | a certain amount of hacking. 9 | 10 | ## [JAMR](https://github.com/jflanigan/jamr) 11 | 12 | "A Discriminative Graph-Based Parser for the Abstract Meaning Representation", 13 | Jeffrey Flanigan, Sam Thomson, Jaime Carbonell, Chris Dyer, and Noah A. Smith. 14 | 15 | ### Alignment Hacking 16 | The JAMR experiments are carried out with a pipeline of shell scripts. 17 | This made plugging in our alignments very easy and saved a lot of time. 18 | The hook for replacing the alignment is in the preprocessing script: 19 | 20 | ``` 21 | jamr/scripts/preprocessing/cmd.aligned 22 | ``` 23 | 24 | It takes an input AMR file with a `# ::tok` header for each graph and adds 25 | an additional `# ::alignments` header to each graph. 26 | 27 | To replace the alignment, you can use the `replace_comments.py` script. 28 | 29 | ### Results on LDC2014T12 30 | 31 | | JAMR parser | Smatch | 32 | |-----------------|--------| 33 | | +JAMR alignment | 65.9 | 34 | | +Our alignment | 67.6 | 35 | 36 | ### Note 37 | - JAMR uses the `cdec` tokenizer, and our released alignments 38 | include a version preprocessed with `cdec`. 39 | 40 | ## [CAMR](https://github.com/c-amr/camr) 41 | "A Transition-based Algorithm for AMR Parsing", Chuan Wang, Nianwen Xue, and Sameer Pradhan 42 | 43 | ### Alignment Hacking 44 | CAMR uses a single program entry point, `amr_parsing.py`, in their project. 45 | You can replace the JAMR-aligner-generated training file with ours, 46 | using the same `replace_comments.py` script. 47 | 48 | ### Results on LDC2014T12 49 | 50 | | CAMR parser | Smatch | 51 | |-----------------|--------| 52 | | +JAMR alignment | 64.6 | 53 | | +Our alignment | 65.1 | 54 | 55 | ### Note 56 | - CAMR uses Stanford CoreNLP as its tokenizer. In our release, 57 | we include the alignment results using this tokenization (marked as `sd`). 58 | 59 | ## [CCG-AMR](https://github.com/clic-lab/amr) 60 | "Broad-coverage CCG Semantic Parsing with AMR", Yoav Artzi, Kenton Lee, and Luke Zettlemoyer. 61 | 62 | [TBD] 63 | 64 | ## [amr-eager](https://github.com/mdtux89/amr-eager) 65 | 66 | [TBD] 67 | 68 | ## [CacheTransition-Seq2Seq](https://github.com/xiaochang13/CacheTransition-Seq2seq) 69 | 70 | [TBD] 71 | 72 | 73 | -------------------------------------------------------------------------------- /pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ueo pipefail 3 | #=================================== 4 | # Config directory 5 | #=================================== 6 | # Please specify the JAMR home here. 7 | JAMR_HOME=/Users/yijialiu/work/projects/jamr/ 8 | # Please specify the TAMR home here. 9 | TAMR_HOME=/Users/yijialiu/work/projects/tamr/ 10 | 11 | if [ -z "$TAMR_HOME" ]; then 12 | echo 'Error: please specify $TAMR_HOME' 13 | exit 1 14 | fi 15 | 16 | if [ -z "$JAMR_HOME" ]; then 17 | echo 'Error: please specify $JAMR_HOME' 18 | exit 1 19 | fi 20 | 21 | TAMR_DATA=${TAMR_HOME}/data 22 | TAMR_LP_DATA=${TAMR_DATA}/little_prince 23 | TAMR_ALIGNER=${TAMR_HOME}/amr_aligner 24 | TAMR_PARSER=${TAMR_HOME}/amr_parser 25 | 26 | #=================================== 27 | # Download data 28 | #=================================== 29 | echo 'Downloading dataset (little prince) ...'
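# The three splits of the Little Prince AMR bank (v1.6) are fetched from amr.isi.edu below.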
30 | mkdir -p ${TAMR_LP_DATA} 31 | wget -O ${TAMR_LP_DATA}/training.txt https://amr.isi.edu/download/amr-bank-struct-v1.6-training.txt 32 | wget -O ${TAMR_LP_DATA}/dev.txt https://amr.isi.edu/download/amr-bank-struct-v1.6-dev.txt 33 | wget -O ${TAMR_LP_DATA}/test.txt https://amr.isi.edu/download/amr-bank-struct-v1.6-test.txt 34 | 35 | pushd "$JAMR_HOME" > /dev/null 36 | set -x 37 | 38 | #================================== 39 | # Run JAMR baseline aligner 40 | #================================== 41 | . scripts/config.sh 42 | for split in training dev test; 43 | do 44 | echo 'Running JAMR aligner on '${split}; 45 | scripts/ALIGN.sh < ${TAMR_LP_DATA}/${split}.txt > ${TAMR_LP_DATA}/${split}.txt.aligned 46 | done 47 | 48 | pushd "$TAMR_ALIGNER" > /dev/null 49 | #================================== 50 | # Run TAMR aligner 51 | #================================== 52 | for split in training dev test; 53 | do 54 | echo 'Running TAMR aligner on '${split}; 55 | python rule_base_align.py \ 56 | -verbose \ 57 | -data \ 58 | ${TAMR_LP_DATA}/${split}.txt.aligned \ 59 | -output \ 60 | ${TAMR_LP_DATA}/${split}.txt.alignment \ 61 | -wordvec \ 62 | ${TAMR_ALIGNER}/resources/word2vec/glove.840B.300d.w2v.ldc2014t12_filtered \ 63 | -trials \ 64 | 10000 \ 65 | -improve_perfect \ 66 | -morpho_match \ 67 | -semantic_match 68 | done 69 | 70 | #================================== 71 | # Replace the alignments 72 | #================================== 73 | for split in training dev test; 74 | do 75 | echo 'Replacing the alignments on '${split}; 76 | python replace_comments.py \ 77 | -key \ 78 | alignments \ 79 | -lexicon \ 80 | ${TAMR_LP_DATA}/${split}.txt.alignment \ 81 | -data \ 82 | ${TAMR_LP_DATA}/${split}.txt.aligned \ 83 | > ${TAMR_LP_DATA}/${split}.txt.new_aligned 84 | done 85 | 86 | #================================= 87 | # Generate actions 88 | #================================= 89 | for split in training dev test; 90 | do 91 | echo 'Generating actions on '${split}; 92 | python eager_oracle.py \ 93 | -mod \ 94 | dump \ 95 | -aligned \ 96 | ${TAMR_LP_DATA}/${split}.txt.new_aligned \ 97 | > ${TAMR_LP_DATA}/${split}.txt.actions 98 | done 99 | 100 | #================================ 101 | # Training and testing the parser 102 | #================================ 103 | ./amr_parser/bin/parser_l2r \ 104 | --dynet-seed \ 105 | 1 \ 106 | --train \ 107 | --training_data \ 108 | ./data/little_prince/training.txt.actions \ 109 | --devel_data \ 110 | ./data/little_prince/dev.txt.actions \ 111 | --test_data \ 112 | ./data/little_prince/test.txt.actions \ 113 | --pretrained \ 114 | ./amr_aligner/resources/word2vec/glove.840B.300d.w2v.ldc2014t12_filtered \ 115 | --model \ 116 | data/little_prince/model \ 117 | --optimizer_enable_eta_decay \ 118 | true \ 119 | --optimizer_enable_clipping \ 120 | true \ 121 | --external_eval \ 122 | ./amr_parser/scripts/eval_eager.sh \ 123 | --devel_gold \ 124 | ./data/little_prince/dev.txt.new_aligned \ 125 | --test_gold \ 126 | ./data/little_prince/test.txt.new_aligned 127 | -------------------------------------------------------------------------------- /release/ldc2014t12/README.md: -------------------------------------------------------------------------------- 1 | TAMR alignment for LDC2014T12 2 | ============================= 3 | 4 | We release the alignment file (the output of `rule_base_align.py`). 5 | It is compressed with bzip2; decompress it before use (see the example command below).
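For example, to decompress the cdec-tokenized release in this folder (`-d` decompresses, `-k` keeps the original archive):

```
bzip2 -dk amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment.bz2
# writes amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment
```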
6 | 7 | You can replace the JAMR alignment with ours using the following 8 | commands: 9 | ``` 10 | python replace_comments.py \ 11 | -key \ 12 | alignments \ 13 | -lexicon \ 14 | /path/to/your/alignment/data \ 15 | -data \ 16 | /path/to/your/baseline/data \ 17 | > /path/to/your/new/alignment/data 18 | ``` 19 | 20 | Since JAMR and CAMR use different tokenizers, we provide 21 | alignments for the cdec tokenizer (used by JAMR) and the Stanford tokenizer 22 | (used by CAMR). 23 | 24 | - for the cdec tokenizer: see `amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment.bz2` 25 | - for the Stanford tokenizer: see `amr-release-1.0-training_fix.txt.sd_tok.tamr_alignment.bz2` 26 | 27 | To reproduce the alignment, you need to patch the original LDC2014T12 release, 28 | because the original data contains illegal AMR graphs (e.g., two concepts 29 | using the same variable). You can get the patched LDC2014T12 with the following 30 | steps: 31 | 32 | ### Merge the Training Data 33 | 34 | Go into the `amr_anno_1.0/data/split/training` folder of the original release of `ldc2014t12`, 35 | and get a concatenated training file with the following commands: 36 | ``` 37 | cat amr-release-1.0-training-proxy.txt \ 38 | amr-release-1.0-training-bolt.txt \ 39 | amr-release-1.0-training-dfa.txt \ 40 | amr-release-1.0-training-mt09sdl.txt \ 41 | amr-release-1.0-training-xinhua.txt > amr-release-1.0-training.txt 42 | ``` 43 | 44 | ### Patching 45 | Do the patching with the following commands: 46 | ``` 47 | patch amr-release-1.0-training.txt \ 48 | -i amr-release-1.0-training_fix.patch \ 49 | -o amr-release-1.0-training_fix.txt 50 | ``` 51 | 52 | It's done! -------------------------------------------------------------------------------- /release/ldc2014t12/amr-release-1.0-training_fix.patch: -------------------------------------------------------------------------------- 1 | 9945c9945 2 | < :name (n / name :op1 "Pakistan")))) 3 | --- 4 | > :name (n4/ name :op1 "Pakistan")))) 5 | 41213c41213 6 | < :time (a / after 7 | --- 8 | > :time (a2 / after 9 | 58814c58814 10 | < :time (b / before 11 | --- 12 | > :time (b3 / before 13 | 81657c81657 14 | < # ::snt A paper prepared for the talks said the expansion of the narcotics industry represents the single greatest threat to afghanistan’s stability and is increasingly linked to insecurity and terrorist activities. 15 | --- 16 | > # ::snt A paper prepared for the talks said the expansion of the narcotics industry represents the single greatest threat to afghanistanâ's stability and is increasingly linked to insecurity and terrorist activities. 17 | 140733c140733 18 | < # ::snt Bjørn Lomborg - Wikipedia, the free encyclopedia 19 | --- 20 | > # ::snt Bjorn Lomborg - Wikipedia, the free encyclopedia 21 | 140735c140735 22 | < (p / person :name (n / name :op1 "Bjørn" :op2 "Lomborg") 23 | --- 24 | > (p / person :name (n / name :op1 "Bjorn" :op2 "Lomborg") 25 | 164536c164536 26 | < # ::snt Rockström said that when rich countries increase their consumption , they also accelerate the exploitation of the world 's national resources , with the result that they emit more greenhouse gases . 27 | --- 28 | > # ::snt Rockstrom said that when rich countries increase their consumption , they also accelerate the exploitation of the world 's national resources , with the result that they emit more greenhouse gases .
29 | 164539c164539 30 | < :ARG0 (p / person :name (n / name :op1 "Rockström")) 31 | --- 32 | > :ARG0 (p / person :name (n / name :op1 "Rockstrom")) 33 | -------------------------------------------------------------------------------- /release/ldc2014t12/amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2014t12/amr-release-1.0-training_fix.txt.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2014t12/amr-release-1.0-training_fix.txt.sd_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2014t12/amr-release-1.0-training_fix.txt.sd_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/README.md: -------------------------------------------------------------------------------- 1 | TAMR alignment for LDC2017T10 2 | ============================= 3 | 4 | You can replace the JAMR alignment with ours using the following commands: 5 | 6 | ``` 7 | python replace_comments.py \ 8 | -key \ 9 | alignments \ 10 | -lexicon \ 11 | /path/to/your/alignment/data \ 12 | -data \ 13 | /path/to/your/baseline/data \ 14 | > /path/to/your/new/alignment/data 15 | ``` 16 | 17 | Similar to LDC2014T12, you need to do a little patching on the original data 18 | to use this alignment. The patch files are under this folder with the `.patch` suffix (an example of applying one is shown below). 19 | 20 | In addition to the patching, you will also need to remove the entity linking (`:wiki`).
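For example, to apply the `dfa` patch (a sketch only — the name of the unpatched input file is an assumption based on the original LDC2017T10 split naming):

```
# hypothetical name of the unpatched LDC2017T10 training split
patch amr-release-2.0-amrs-training-dfa.txt \
    -i amr-release-2.0-amrs-training-dfa_fix.patch \
    -o amr-release-2.0-amrs-training-dfa_fix.txt
```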
21 | We provide a Python script, `remove_wiki.py`, which you can use as follows: 22 | ``` 23 | python remove_wiki.py /path/to/your/input > /path/to/your/output 24 | ``` -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-bolt.txt.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-bolt.txt.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-cctv.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-cctv.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-dfa_fix.patch: -------------------------------------------------------------------------------- 1 | 6324c6324 2 | < # ::snt Bjorn Lomborg - Wikipedia, the free encyclopedia 3 | --- 4 | > # ::snt Bjørn Lomborg - Wikipedia, the free encyclopedia 5 | 6326c6326 6 | < (p / person :wiki "Bjorn_Lomborg" 7 | --- 8 | > (p / person :wiki "Bjørn_Lomborg" 9 | 6331c6331 10 | < :name (n / name :op1 "Bjorn" :op2 "Lomborg")) 11 | --- 12 | > :name (n / name :op1 "Bjørn" :op2 "Lomborg")) 13 | -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-dfa_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-dfa_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-dfb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-dfb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-guidelines_fix.patch: -------------------------------------------------------------------------------- 1 | 5201c5201 2 | < # ::snt Albert Einstein: Zur Elektrodynamik bewegter Korper (1905) Annalen der Physik, 322 (10): 891-921 (in German) 3 | --- 4 | > # ::snt Albert Einstein: Zur Elektrodynamik bewegter Körper (1905) Annalen der Physik, 322 (10): 891-921 (in German) 5 | 5207c5207 6 | < :name (n2 / name :op1 "Zur" :op2 "Elektrodynamik" :op3 "bewegter" :op4 "Korper")) 7 | --- 8 | > :name (n2 / name :op1 "Zur" :op2 "Elektrodynamik" :op3 "bewegter" :op4 "Körper")) 9 | -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-guidelines_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-guidelines_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-mt09sdl_fix.patch: -------------------------------------------------------------------------------- 1 | 4c4 2 | < # ::snt Rockstrom said that when rich countries increase their consumption , they also accelerate the exploitation of the world 's national resources , with the result that they emit more greenhouse gases . 3 | --- 4 | > # ::snt Rockström said that when rich countries increase their consumption , they also accelerate the exploitation of the world 's national resources , with the result that they emit more greenhouse gases . 5 | 7,8c7,8 6 | < :ARG0 (p / person :wiki "Johan_Rockstrom" 7 | < :name (n / name :op1 "Rockstrom")) 8 | --- 9 | > :ARG0 (p / person :wiki "Johan_Rockström" 10 | > :name (n / name :op1 "Rockström")) 11 | -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-mt09sdl_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-mt09sdl_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-proxy_fix.patch: -------------------------------------------------------------------------------- 1 | 81140c81140 2 | < # ::snt A paper prepared for the talks said the expansion of the narcotics industry represents the single greatest threat to afghanistan's stability and is increasingly linked to insecurity and terrorist activities. 3 | --- 4 | > # ::snt A paper prepared for the talks said the expansion of the narcotics industry represents the single greatest threat to afghanistan’s stability and is increasingly linked to insecurity and terrorist activities. 
5 | -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-proxy_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-proxy_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-wb_fix.patch: -------------------------------------------------------------------------------- 1 | 3438a3439,3445 2 | > # ::id wb.eng_0002.163 ::date 2012-11-25T17:27:11 ::annotator SDL-AMR-09 ::preferred 3 | > # ::snt posted by <$BlogBacklinkAuthor$> @ <$BlogBacklinkDateTime$> 4 | > # ::save-date Wed Jul 29, 2015 ::file wb_eng_0002_163.txt 5 | > (p / post-01 6 | > :ARG0 (p2 / person) 7 | > :time (d / date-entity)) 8 | > 9 | -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-wb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-wb_fix.txt.no_wiki.cdec_tok.tamr_alignment.bz2 -------------------------------------------------------------------------------- /release/ldc2017t10/amr-release-2.0-amrs-training-xinhua.txt.no_wiki.cdec_tok.tamr_alignment.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneplus/tamr/c7a480a019d1d765f0ce3d04a37e31709af47f4a/release/ldc2017t10/amr-release-2.0-amrs-training-xinhua.txt.no_wiki.cdec_tok.tamr_alignment.bz2 --------------------------------------------------------------------------------