├── .Rhistory ├── .idea ├── AMR_AS_GRAPH.iml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── AMR_FEATURE ├── .classpath ├── .gitignore ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── bin │ ├── convertingAMR.class │ └── json-20170516.jar ├── joints.txt └── src │ ├── convertingAMR.java │ └── json-20170516.jar ├── README.md ├── data ├── aux_dict ├── category_dict ├── graph_to_node_dict.txt ├── graph_to_node_dict_extended_without_jamr ├── graph_to_node_dict_extended_without_jamr.txt ├── high_dict ├── joints.txt ├── lemma_dict ├── ner_dict ├── non_rule_set ├── pos_dict ├── rel_dict ├── rule_f_without_jamr ├── sensed_dict └── word_dict ├── np_sents.txt ├── np_sents.txt_parsed ├── parser ├── AMRProcessors.py ├── DataIterator.py ├── Dict.py ├── Optim.py ├── __init__.py ├── __pycache__ │ ├── AMRProcessors.cpython-36.pyc │ ├── DataIterator.cpython-36.pyc │ ├── Dict.cpython-36.pyc │ ├── Optim.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── models │ ├── ConceptModel.py │ ├── MultiPassRelModel.py │ ├── __init__.py │ └── __pycache__ │ │ ├── ConceptModel.cpython-36.pyc │ │ ├── MultiPassRelModel.cpython-36.pyc │ │ └── __init__.cpython-36.pyc └── modules │ ├── GumbelSoftMax.py │ ├── __initial__.py │ ├── __pycache__ │ ├── GumbelSoftMax.cpython-36.pyc │ └── helper_module.cpython-36.pyc │ └── helper_module.py ├── src ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── train.cpython-36.pyc ├── data_build.py ├── generate.py ├── parse.py ├── preprocessing.py ├── rule_system_build.py └── train.py └── utility ├── AMRGraph.py ├── Naive_Scores.py ├── PropbankReader.py ├── ReCategorization.py ├── StringCopyRules.py ├── __init__.py ├── __init__.pyc ├── __pycache__ ├── AMRGraph.cpython-36.pyc ├── Naive_Scores.cpython-36.pyc ├── PropbankReader.cpython-36.pyc ├── ReCategorization.cpython-36.pyc ├── StringCopyRules.cpython-36.pyc ├── __init__.cpython-36.pyc ├── amr.cpython-36.pyc ├── constants.cpython-36.pyc └── data_helper.cpython-36.pyc ├── amr.peg ├── amr.py ├── amr.pyc ├── constants.py ├── constants.pyc └── data_helper.py /.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/.Rhistory -------------------------------------------------------------------------------- /.idea/AMR_AS_GRAPH.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /AMR_FEATURE/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /AMR_FEATURE/.gitignore: -------------------------------------------------------------------------------- 1 | /.metadata/ 2 
| -------------------------------------------------------------------------------- /AMR_FEATURE/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | AMR_FEATURE 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /AMR_FEATURE/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.8 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.8 12 | -------------------------------------------------------------------------------- /AMR_FEATURE/bin/convertingAMR.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/AMR_FEATURE/bin/convertingAMR.class -------------------------------------------------------------------------------- /AMR_FEATURE/bin/json-20170516.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/AMR_FEATURE/bin/json-20170516.jar -------------------------------------------------------------------------------- /AMR_FEATURE/joints.txt: -------------------------------------------------------------------------------- 1 | have to 2 | at all 3 | so far 4 | more than 5 | less than 6 | no one 7 | as well 8 | at least 9 | right wing 10 | left wing 11 | as long as 12 | all over 13 | of course 14 | kind of 15 | after all 16 | by oneself 17 | by the way 18 | in fact 19 | be all 20 | head up 21 | come out 22 | coop up 23 | seize up 24 | bust up 25 | hang out 26 | limber up 27 | quieten down 28 | crack up 29 | fuck up 30 | get out 31 | clear out 32 | rip up 33 | rock on 34 | shout down 35 | bundle up 36 | pump up 37 | smooth out 38 | set down 39 | drop off 40 | think over 41 | core out 42 | tidy up 43 | make off 44 | fight on 45 | set out 46 | think up 47 | try out 48 | sign in 49 | take out 50 | top off 51 | nail down 52 | block up 53 | cash in 54 | fork out 55 | mark down 56 | rattle off 57 | bandage up 58 | sleep over 59 | patch up 60 | freeze over 61 | seal off 62 | free up 63 | clown around 64 | tear down 65 | dust off 66 | live up 67 | cut loose 68 | louse up 69 | sit down 70 | stand by 71 | take up 72 | steal away 73 | lay off 74 | turn in 75 | meet up 76 | check up 77 | taper off 78 | dole out 79 | catch up 80 | shape up 81 | tax away 82 | pass off 83 | give in 84 | speak up 85 | call upon 86 | stall out 87 | butt in 88 | carve out 89 | step up 90 | trigger off 91 | prop up 92 | scoop up 93 | summon forth 94 | boss around 95 | cool down 96 | give back 97 | cut down 98 | jot down 99 | doze off 100 | drum up 101 | bog down 102 | throw out 103 | shy away 104 | frost over 105 
| rack up 106 | even out 107 | light up 108 | shack up 109 | bone up 110 | cut out 111 | sum up 112 | shut up 113 | send out 114 | pine away 115 | take over 116 | gobble up 117 | shoot back 118 | lay on 119 | swear off 120 | spread out 121 | pin down 122 | find out 123 | drag on 124 | thaw out 125 | bump off 126 | fatten up 127 | get back 128 | arm up 129 | load up 130 | give vent 131 | top up 132 | bounce back 133 | bad off 134 | come by 135 | single out 136 | call out 137 | slow down 138 | ask out 139 | slice up 140 | roll up 141 | divide up 142 | hold over 143 | touch off 144 | pass out 145 | have mod 146 | screw up 147 | iron out 148 | tell on 149 | dry out 150 | zero out 151 | rev up 152 | request confirmation 153 | scrawl out 154 | tie in 155 | pass up 156 | scratch out 157 | miss out 158 | root out 159 | frighten off 160 | have subevent 161 | go on 162 | follow through 163 | lighten up 164 | trade off 165 | carry over 166 | pay out 167 | mellow out 168 | fool around 169 | get down 170 | stretch out 171 | run down 172 | scrub up 173 | splash out 174 | stop by 175 | touch upon 176 | dig out 177 | stick around 178 | act out 179 | pass by 180 | watch out 181 | share out 182 | shut out 183 | get along 184 | go through 185 | tease out 186 | kill off 187 | slug out 188 | bottom out 189 | tie down 190 | neaten up 191 | dress down 192 | turn off 193 | bandy around 194 | yammer away 195 | gulp down 196 | cut back 197 | chatter away 198 | glaze over 199 | drop by 200 | slack off 201 | fess up 202 | seek out 203 | creep out 204 | hold up 205 | knock up 206 | shine through 207 | fence off 208 | zero in 209 | flip out 210 | rein in 211 | screen out 212 | cheer up 213 | saw up 214 | sign off 215 | flatten out 216 | heat up 217 | add on 218 | clip off 219 | doll up 220 | touch on 221 | fall off 222 | suit up 223 | palm off 224 | mist over 225 | flesh out 226 | burn up 227 | sweat out 228 | work up 229 | brazen out 230 | peel off 231 | pay up 232 | get even 233 | fill out 234 | whip up 235 | shout out 236 | kick in 237 | draw up 238 | thrash out 239 | head off 240 | come in 241 | break up 242 | speed up 243 | spout off 244 | type up 245 | polish off 246 | trot out 247 | puke up 248 | bank up 249 | rip off 250 | dry up 251 | settle down 252 | cry out 253 | go out 254 | face off 255 | ride up 256 | buckle up 257 | pair up 258 | come off 259 | auction off 260 | roll back 261 | throw in 262 | eat up 263 | suck up 264 | shut down 265 | wipe out 266 | nod off 267 | choke off 268 | sleep off 269 | stand up 270 | frost up 271 | join in 272 | mix up 273 | crisp up 274 | knock out 275 | talk out 276 | set off 277 | sit in 278 | bang on 279 | flake out 280 | take off 281 | queue up 282 | square off 283 | make over 284 | ramp up 285 | let down 286 | toss out 287 | finish up 288 | blow over 289 | sound off 290 | cut up 291 | rough in 292 | blot out 293 | stave off 294 | stop off 295 | act up 296 | scout out 297 | pay off 298 | beat out 299 | copy out 300 | wolf down 301 | have manner 302 | get through 303 | break off 304 | drug up 305 | pump out 306 | take hold 307 | polish up 308 | pucker up 309 | write off 310 | shell out 311 | come over 312 | color in 313 | tamp down 314 | shut off 315 | have mode 316 | strike up 317 | beat up 318 | sweep up 319 | come up 320 | blast off 321 | lie in 322 | warm over 323 | ratchet up 324 | bump up 325 | play out 326 | look out 327 | tip over 328 | fudge over 329 | warm up 330 | throw away 331 | crank up 332 | tip off 333 | have quant 334 | go back 335 | roll out 336 | trim 
down 337 | set up 338 | rake in 339 | piss off 340 | give over 341 | buoy up 342 | pen up 343 | touch up 344 | parcel out 345 | boom out 346 | give off 347 | jump up 348 | leave over 349 | tone down 350 | dream on 351 | lock in 352 | win over 353 | stop over 354 | turn over 355 | play on 356 | edge out 357 | get up 358 | leave off 359 | finish off 360 | slim down 361 | wall off 362 | puff up 363 | plug up 364 | write out 365 | let out 366 | stop up 367 | calm down 368 | bring about 369 | phase out 370 | belly up 371 | break down 372 | stick up 373 | lock up 374 | pull out 375 | set upon 376 | jet off 377 | pay down 378 | fart around 379 | zone out 380 | bear out 381 | take away 382 | bleed off 383 | write up 384 | lash out 385 | lam out 386 | tie up 387 | siphon off 388 | dress up 389 | stamp out 390 | black out 391 | snuff out 392 | whip out 393 | go off 394 | ease up 395 | tune out 396 | gun down 397 | freak out 398 | chop down 399 | strip away 400 | step down 401 | hit up 402 | read up 403 | chew up 404 | start out 405 | own up 406 | close down 407 | come upon 408 | cone down 409 | yield up 410 | get away 411 | gear up 412 | bring on 413 | figure out 414 | turn up 415 | check out 416 | bead up 417 | ship out 418 | crank out 419 | flush out 420 | let on 421 | put on 422 | usher in 423 | spin off 424 | knock off 425 | skim off 426 | pass on 427 | finish out 428 | instead of 429 | leave out 430 | frighten away 431 | buy up 432 | knock over 433 | straighten out 434 | wear off 435 | whiz away 436 | call on 437 | put out 438 | totter around 439 | salt away 440 | spell out 441 | creep up 442 | hold out 443 | sign up 444 | branch out 445 | mark up 446 | hail down 447 | pick out 448 | shoot off 449 | din out 450 | beef up 451 | get off 452 | break through 453 | smarten up 454 | help out 455 | buy out 456 | stake out 457 | take in 458 | do in 459 | come to 460 | sell out 461 | shore up 462 | hem in 463 | hang up 464 | boil over 465 | sort out 466 | wipe up 467 | curl up 468 | whack off 469 | track down 470 | dig up 471 | run out 472 | haul out 473 | plot out 474 | loan out 475 | coil up 476 | die off 477 | pipe down 478 | kick off 479 | come through 480 | print out 481 | pick away 482 | gloss over 483 | ring up 484 | go down 485 | read off 486 | pitch in 487 | choke up 488 | break in 489 | crack down 490 | boot up 491 | blurt out 492 | sluice down 493 | fill up 494 | spring up 495 | lock out 496 | pack up 497 | look over 498 | whittle down 499 | chicken out 500 | bandy about 501 | cart off 502 | plug in 503 | buy off 504 | pick on 505 | crash out 506 | total up 507 | pile on 508 | pan out 509 | prick up 510 | dish up 511 | stash away 512 | round up 513 | shoot up 514 | balance out 515 | bring along 516 | quiet down 517 | cut off 518 | vamp up 519 | run off 520 | pull down 521 | team up 522 | hold back 523 | hammer out 524 | stack up 525 | think through 526 | match up 527 | rise up 528 | have concession 529 | wipe off 530 | hash out 531 | come down 532 | sock away 533 | jump in 534 | hang on 535 | ferret out 536 | wake up 537 | brick over 538 | burst out 539 | tack down 540 | spike out 541 | use up 542 | carry on 543 | bottle up 544 | tighten up 545 | start up 546 | carry off 547 | speak out 548 | set about 549 | tag along 550 | hook up 551 | oil up 552 | fend off 553 | start over 554 | sit up 555 | sign on 556 | take down 557 | study up 558 | while away 559 | fold up 560 | cheer on 561 | bust out 562 | rate entity 563 | play down 564 | book up 565 | bind up 566 | stay on 567 | come about 568 | 
put up 569 | dine out 570 | have frequency 571 | store up 572 | give up 573 | vote down 574 | bring up 575 | tape up 576 | leave behind 577 | turn on 578 | save up 579 | break out 580 | wash up 581 | fork over 582 | hollow out 583 | freshen up 584 | screw over 585 | dash off 586 | have part 587 | mess up 588 | buy into 589 | burn out 590 | cave in 591 | lead up 592 | clear up 593 | cry down 594 | stand out 595 | turn away 596 | drown out 597 | run in 598 | cover up 599 | spill over 600 | die out 601 | farm out 602 | hand over 603 | poke around 604 | ride out 605 | come across 606 | give away 607 | tack on 608 | bow out 609 | squeeze out 610 | write in 611 | show up 612 | come on 613 | fix up 614 | sew up 615 | fort up 616 | do away 617 | liven up 618 | scrunch up 619 | log on 620 | ham up 621 | look down 622 | firm up 623 | tally up 624 | tool up 625 | weigh in 626 | flare up 627 | strike down 628 | thin out 629 | blast away 630 | reel off 631 | feed up 632 | camp out 633 | well off 634 | crop up 635 | be like 636 | open up 637 | link up 638 | lick up 639 | look up 640 | statistical test 641 | charge off 642 | drop out 643 | keep up 644 | tick off 645 | tune in 646 | write down 647 | bat in 648 | stay over 649 | gas up 650 | pick up 651 | cook up 652 | boil down 653 | pull through 654 | call off 655 | pop off 656 | hand out 657 | push up 658 | fritter away 659 | trail off 660 | chop up 661 | rear end 662 | fuck around 663 | rattle on 664 | tire out 665 | street address 666 | keep on 667 | pack away 668 | keg stand 669 | close off 670 | lose out 671 | wring out 672 | make believe 673 | soak up 674 | tee off 675 | shake up 676 | scent out 677 | steer clear 678 | have instrument 679 | tear up 680 | feel up 681 | live down 682 | bowl over 683 | step in 684 | hobnob around 685 | bow down 686 | buzz off 687 | tangle up 688 | catch on 689 | price out 690 | snap up 691 | live out 692 | touch base 693 | be done 694 | have li 695 | vomit up 696 | clean out 697 | laid back 698 | buckle down 699 | slip in 700 | swear in 701 | stall off 702 | shoot down 703 | be from 704 | serve up 705 | join up 706 | back up 707 | well up 708 | pull up 709 | put down 710 | wash down 711 | dish out 712 | age out 713 | fight back 714 | bring down 715 | run up 716 | zip up 717 | switch over 718 | spend down 719 | call up 720 | be polite 721 | pop up 722 | fall apart 723 | net out 724 | jut out 725 | wind up 726 | rent out 727 | cross out 728 | rough up 729 | broke ass 730 | dredge up 731 | wait out 732 | shuffle off 733 | build up 734 | box in 735 | shake off 736 | cool off 737 | get on 738 | hit on 739 | straighten up 740 | start off 741 | belch out 742 | lie down 743 | play up 744 | give out 745 | haul in 746 | hard put 747 | make up 748 | snap off 749 | follow suit 750 | pass away 751 | smooth over 752 | hole up 753 | turn out 754 | clog up 755 | sober up 756 | smash up 757 | contract out 758 | go over 759 | dope up 760 | bed down 761 | sit out 762 | hype up 763 | drop in 764 | put off 765 | ward off 766 | get together 767 | turn down 768 | back off 769 | swoop up 770 | out trade 771 | size up 772 | pull off 773 | conjure up 774 | stock up 775 | sleep away 776 | monkey around 777 | break away 778 | pile up 779 | put in 780 | dream up 781 | wrap up 782 | gum up 783 | bound up 784 | tuck away 785 | board up 786 | have purpose 787 | stick out 788 | fall out 789 | take aback 790 | chart out 791 | latch on 792 | belt out 793 | wear on 794 | muck up 795 | step aside 796 | lead off 797 | point out 798 | line up 799 | check 
in 800 | start in 801 | bunch up 802 | watch over 803 | fill in 804 | work out 805 | joke around 806 | hum along 807 | lock down 808 | wear out 809 | rip out 810 | bleed out 811 | come along 812 | play off 813 | show off 814 | have extent 815 | concrete over 816 | narrow down 817 | jack up 818 | stare down 819 | pipe up 820 | loosen up 821 | wear down 822 | bear up 823 | cover over 824 | have polarity 825 | mic up 826 | make do 827 | close over 828 | deck out 829 | blow out 830 | play to 831 | hammer away 832 | ration out 833 | sell off 834 | have name 835 | strike out 836 | shuttle off 837 | call in 838 | shrug off 839 | chalk up 840 | perk up 841 | knock down 842 | follow up 843 | pass over 844 | brush off 845 | drink up 846 | fly out 847 | close in 848 | grow up 849 | eat away 850 | have condition 851 | snatch away 852 | pick off 853 | stress out 854 | take on 855 | muddle up 856 | tuck in 857 | live on 858 | skip off 859 | look forward 860 | stir up 861 | bail out 862 | stand down 863 | close up 864 | run over 865 | throw up 866 | fuck off 867 | swallow up 868 | spill out 869 | fall back 870 | fight off 871 | rig up 872 | sweat off 873 | hide out 874 | divvy up 875 | flash back 876 | end up 877 | make it 878 | toss in 879 | round out 880 | sniff out 881 | grind up 882 | chip in 883 | cough up 884 | phase in 885 | let up 886 | water down 887 | hold on 888 | level off 889 | have value 890 | fit in 891 | yammer on 892 | key in 893 | hold off 894 | silt up 895 | get by 896 | split up 897 | make out 898 | look after 899 | rubber stamp 900 | sketch out 901 | pull over 902 | spruce up 903 | glass over 904 | add up 905 | mist up 906 | brush up 907 | wind down 908 | clutch on 909 | knock back 910 | pare down 911 | rule out 912 | fall through 913 | hack away 914 | asphalt over 915 | clean up 916 | pound out 917 | die down 918 | carry out 919 | fall over 920 | blow up 921 | weasel out 922 | break even 923 | -------------------------------------------------------------------------------- /AMR_FEATURE/src/json-20170516.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/AMR_FEATURE/src/json-20170516.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AMR AS GRAPH PREDICTION 2 | 3 | This repository contains code for training and using the Abstract Meaning Representation model described in: 4 | [AMR Parsing as Graph Prediction with Latent Alignment](https://arxiv.org/pdf/1805.05286.pdf) 5 | 6 | If you use our code, please cite our paper as follows: 7 | > @inproceedings{Lyu2018AMRPA, 8 | >     title={AMR Parsing as Graph Prediction with Latent Alignment}, 9 | >     author={Chunchuan Lyu and Ivan Titov}, 10 | >     booktitle={Proceedings of the Annual Meeting of the Association for Computational Linguistics}, 11 | >     year={2018} 12 | > } 13 | 14 | ## Prerequisites 15 | 16 | * Python 3.6 17 | * Stanford Corenlp 3.9.1 (the python wrapper is not compatible with the new one) 18 | * pytorch 0.20 19 | * [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings 20 | * [AMR dataset and resources files](https://amr.isi.edu/download.html) 21 | 22 | ## Configuration 23 | 24 | * Set up [Stanford Corenlp server](https://stanfordnlp.github.io/CoreNLP/corenlp-server.html), which feature extraction relies on. 
25 | * Change file paths in utility/constants.py accordingly (a short CoreNLP sanity check is sketched at the end of this README).
26 | 
27 | 
28 | ## Preprocessing
29 | 
30 | Either a) combine all `*.txt` files into a single file, and use Stanford CoreNLP to extract NER, POS and lemma features.
31 | The processed file is saved in the same folder.
32 | 
33 |     python src/preprocessing.py
34 | 
35 | or b) process the output of the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf) using the Java program in AMR_FEATURE (I used Eclipse to run it).
36 | 
37 | Build the copying dictionary and the recategorization system (this step can be skipped, as the results are already provided in data/).
38 | 
39 |     python src/rule_system_build.py
40 | 
41 | Build the data into tensors.
42 | 
43 |     python src/data_build.py
44 | 
45 | ## Training
46 | 
47 | By default, the model is saved to [save_to]/gpus_0valid_best.pt (save_to is defined in utility/constants.py).
48 | 
49 |     python src/train.py
50 | 
51 | ## Testing
52 | 
53 | Load a model and parse from the pre-built data.
54 | 
55 |     python src/generate.py -train_from [gpus_0valid_best.pt]
56 | 
57 | ## Evaluation
58 | 
59 | Please use [amr-evaluation-tool-enhanced](https://github.com/ChunchuanLv/amr-evaluation-tool-enhanced).
60 | It is based on Marco Damonte's [amr-evaluation-tool](https://github.com/mdtux89/amr-evaluation),
61 | but with a correction concerning the unlabeled edge score.
62 | 
63 | ## Parsing
64 | 
65 | Either a) parse a file in which each line contains a single sentence; the output is saved to `[file]_parsed`
66 | 
67 |     python src/parse.py -train_from [gpus_0valid_best.pt] -input [file]
68 | 
69 | or b) parse a single sentence given directly on the command line
70 | 
71 |     python src/parse.py -train_from [gpus_0valid_best.pt] -text [type sentence here]
72 | 
73 | ## Pretrained models
74 | 
75 | Keeping the files under the data/ folder unchanged, download the [model](https://drive.google.com/open?id=1jNTG3tuIfS-WoUpqGQydRgYWst51kjHx).
76 | This should allow you to run parsing.
77 | 
78 | ## Notes
79 | 
80 | "python src/preprocessing.py" starts from the original, untokenized AMR files, while the model in the paper was trained on the tokenized version provided by the [AMR-to-English aligner](https://www.isi.edu/natural-language/mt/amr_eng_align.pdf),
81 | so the results could be slightly different. Also, to build a parser for out-of-domain data, please start preprocessing with "python src/preprocessing.py" to keep everything consistent.
82 | 
83 | ## Contact
84 | 
85 | Contact us if you have any questions!
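
## CoreNLP server check (illustrative)

The Configuration step above assumes a Stanford CoreNLP 3.9.1 server is reachable before feature extraction. The sketch below is only a sanity check over the server's HTTP API, not the wrapper used by src/preprocessing.py; the port, the `requests` dependency, and the example sentence are assumptions and should be adjusted to match your setup and utility/constants.py.

    import json
    import requests  # assumption: requests is installed; the repo uses its own CoreNLP wrapper

    props = {"annotators": "tokenize,ssplit,pos,lemma,ner", "outputFormat": "json"}
    resp = requests.post("http://localhost:9000",              # placeholder port; match your server
                         params={"properties": json.dumps(props)},
                         data="The boy wants to go.".encode("utf-8"))
    # each token dict carries the word, lemma, pos and ner fields the preprocessing relies on
    print(resp.json()["sentences"][0]["tokens"][0])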
86 | -------------------------------------------------------------------------------- /data/aux_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/aux_dict -------------------------------------------------------------------------------- /data/category_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/category_dict -------------------------------------------------------------------------------- /data/graph_to_node_dict_extended_without_jamr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/graph_to_node_dict_extended_without_jamr -------------------------------------------------------------------------------- /data/high_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/high_dict -------------------------------------------------------------------------------- /data/joints.txt: -------------------------------------------------------------------------------- 1 | have to 2 | at all 3 | so far 4 | more than 5 | less than 6 | no one 7 | as well 8 | at least 9 | right wing 10 | left wing 11 | as long as 12 | all over 13 | of course 14 | kind of 15 | after all 16 | by oneself 17 | by the way 18 | in fact 19 | be all 20 | head up 21 | come out 22 | coop up 23 | seize up 24 | bust up 25 | hang out 26 | limber up 27 | quieten down 28 | crack up 29 | fuck up 30 | get out 31 | clear out 32 | rip up 33 | rock on 34 | shout down 35 | bundle up 36 | pump up 37 | smooth out 38 | set down 39 | drop off 40 | think over 41 | core out 42 | tidy up 43 | make off 44 | fight on 45 | set out 46 | think up 47 | try out 48 | sign in 49 | take out 50 | top off 51 | nail down 52 | block up 53 | cash in 54 | fork out 55 | mark down 56 | rattle off 57 | bandage up 58 | sleep over 59 | patch up 60 | freeze over 61 | seal off 62 | free up 63 | clown around 64 | tear down 65 | dust off 66 | live up 67 | cut loose 68 | louse up 69 | sit down 70 | stand by 71 | take up 72 | steal away 73 | lay off 74 | turn in 75 | meet up 76 | check up 77 | taper off 78 | dole out 79 | catch up 80 | shape up 81 | tax away 82 | pass off 83 | give in 84 | speak up 85 | call upon 86 | stall out 87 | butt in 88 | carve out 89 | step up 90 | trigger off 91 | prop up 92 | scoop up 93 | summon forth 94 | boss around 95 | cool down 96 | give back 97 | cut down 98 | jot down 99 | doze off 100 | drum up 101 | bog down 102 | throw out 103 | shy away 104 | frost over 105 | rack up 106 | even out 107 | light up 108 | shack up 109 | bone up 110 | cut out 111 | sum up 112 | shut up 113 | send out 114 | pine away 115 | take over 116 | gobble up 117 | shoot back 118 | lay on 119 | swear off 120 | spread out 121 | pin down 122 | find out 123 | drag on 124 | thaw out 125 | bump off 126 | fatten up 127 | get back 128 | arm up 129 | load up 130 | give vent 131 | top up 132 | bounce back 133 | bad off 134 | come by 135 | single out 136 | call out 137 | slow down 138 | ask out 139 | slice up 140 | roll up 141 | divide up 142 | hold over 143 | touch off 144 | pass out 
145 | have mod 146 | screw up 147 | iron out 148 | tell on 149 | dry out 150 | zero out 151 | rev up 152 | request confirmation 153 | scrawl out 154 | tie in 155 | pass up 156 | scratch out 157 | miss out 158 | root out 159 | frighten off 160 | have subevent 161 | go on 162 | follow through 163 | lighten up 164 | trade off 165 | carry over 166 | pay out 167 | mellow out 168 | fool around 169 | get down 170 | stretch out 171 | run down 172 | scrub up 173 | splash out 174 | stop by 175 | touch upon 176 | dig out 177 | stick around 178 | act out 179 | pass by 180 | watch out 181 | share out 182 | shut out 183 | get along 184 | go through 185 | tease out 186 | kill off 187 | slug out 188 | bottom out 189 | tie down 190 | neaten up 191 | dress down 192 | turn off 193 | bandy around 194 | yammer away 195 | gulp down 196 | cut back 197 | chatter away 198 | glaze over 199 | drop by 200 | slack off 201 | fess up 202 | seek out 203 | creep out 204 | hold up 205 | knock up 206 | shine through 207 | fence off 208 | zero in 209 | flip out 210 | rein in 211 | screen out 212 | cheer up 213 | saw up 214 | sign off 215 | flatten out 216 | heat up 217 | add on 218 | clip off 219 | doll up 220 | touch on 221 | fall off 222 | suit up 223 | palm off 224 | mist over 225 | flesh out 226 | burn up 227 | sweat out 228 | work up 229 | brazen out 230 | peel off 231 | pay up 232 | get even 233 | fill out 234 | whip up 235 | shout out 236 | kick in 237 | draw up 238 | thrash out 239 | head off 240 | come in 241 | break up 242 | speed up 243 | spout off 244 | type up 245 | polish off 246 | trot out 247 | puke up 248 | bank up 249 | rip off 250 | dry up 251 | settle down 252 | cry out 253 | go out 254 | face off 255 | ride up 256 | buckle up 257 | pair up 258 | come off 259 | auction off 260 | roll back 261 | throw in 262 | eat up 263 | suck up 264 | shut down 265 | wipe out 266 | nod off 267 | choke off 268 | sleep off 269 | stand up 270 | frost up 271 | join in 272 | mix up 273 | crisp up 274 | knock out 275 | talk out 276 | set off 277 | sit in 278 | bang on 279 | flake out 280 | take off 281 | queue up 282 | square off 283 | make over 284 | ramp up 285 | let down 286 | toss out 287 | finish up 288 | blow over 289 | sound off 290 | cut up 291 | rough in 292 | blot out 293 | stave off 294 | stop off 295 | act up 296 | scout out 297 | pay off 298 | beat out 299 | copy out 300 | wolf down 301 | have manner 302 | get through 303 | break off 304 | drug up 305 | pump out 306 | take hold 307 | polish up 308 | pucker up 309 | write off 310 | shell out 311 | come over 312 | color in 313 | tamp down 314 | shut off 315 | have mode 316 | strike up 317 | beat up 318 | sweep up 319 | come up 320 | blast off 321 | lie in 322 | warm over 323 | ratchet up 324 | bump up 325 | play out 326 | look out 327 | tip over 328 | fudge over 329 | warm up 330 | throw away 331 | crank up 332 | tip off 333 | have quant 334 | go back 335 | roll out 336 | trim down 337 | set up 338 | rake in 339 | piss off 340 | give over 341 | buoy up 342 | pen up 343 | touch up 344 | parcel out 345 | boom out 346 | give off 347 | jump up 348 | leave over 349 | tone down 350 | dream on 351 | lock in 352 | win over 353 | stop over 354 | turn over 355 | play on 356 | edge out 357 | get up 358 | leave off 359 | finish off 360 | slim down 361 | wall off 362 | puff up 363 | plug up 364 | write out 365 | let out 366 | stop up 367 | calm down 368 | bring about 369 | phase out 370 | belly up 371 | break down 372 | stick up 373 | lock up 374 | pull out 375 | set upon 376 | 
jet off 377 | pay down 378 | fart around 379 | zone out 380 | bear out 381 | take away 382 | bleed off 383 | write up 384 | lash out 385 | lam out 386 | tie up 387 | siphon off 388 | dress up 389 | stamp out 390 | black out 391 | snuff out 392 | whip out 393 | go off 394 | ease up 395 | tune out 396 | gun down 397 | freak out 398 | chop down 399 | strip away 400 | step down 401 | hit up 402 | read up 403 | chew up 404 | start out 405 | own up 406 | close down 407 | come upon 408 | cone down 409 | yield up 410 | get away 411 | gear up 412 | bring on 413 | figure out 414 | turn up 415 | check out 416 | bead up 417 | ship out 418 | crank out 419 | flush out 420 | let on 421 | put on 422 | usher in 423 | spin off 424 | knock off 425 | skim off 426 | pass on 427 | finish out 428 | instead of 429 | leave out 430 | frighten away 431 | buy up 432 | knock over 433 | straighten out 434 | wear off 435 | whiz away 436 | call on 437 | put out 438 | totter around 439 | salt away 440 | spell out 441 | creep up 442 | hold out 443 | sign up 444 | branch out 445 | mark up 446 | hail down 447 | pick out 448 | shoot off 449 | din out 450 | beef up 451 | get off 452 | break through 453 | smarten up 454 | help out 455 | buy out 456 | stake out 457 | take in 458 | do in 459 | come to 460 | sell out 461 | shore up 462 | hem in 463 | hang up 464 | boil over 465 | sort out 466 | wipe up 467 | curl up 468 | whack off 469 | track down 470 | dig up 471 | run out 472 | haul out 473 | plot out 474 | loan out 475 | coil up 476 | die off 477 | pipe down 478 | kick off 479 | come through 480 | print out 481 | pick away 482 | gloss over 483 | ring up 484 | go down 485 | read off 486 | pitch in 487 | choke up 488 | break in 489 | crack down 490 | boot up 491 | blurt out 492 | sluice down 493 | fill up 494 | spring up 495 | lock out 496 | pack up 497 | look over 498 | whittle down 499 | chicken out 500 | bandy about 501 | cart off 502 | plug in 503 | buy off 504 | pick on 505 | crash out 506 | total up 507 | pile on 508 | pan out 509 | prick up 510 | dish up 511 | stash away 512 | round up 513 | shoot up 514 | balance out 515 | bring along 516 | quiet down 517 | cut off 518 | vamp up 519 | run off 520 | pull down 521 | team up 522 | hold back 523 | hammer out 524 | stack up 525 | think through 526 | match up 527 | rise up 528 | have concession 529 | wipe off 530 | hash out 531 | come down 532 | sock away 533 | jump in 534 | hang on 535 | ferret out 536 | wake up 537 | brick over 538 | burst out 539 | tack down 540 | spike out 541 | use up 542 | carry on 543 | bottle up 544 | tighten up 545 | start up 546 | carry off 547 | speak out 548 | set about 549 | tag along 550 | hook up 551 | oil up 552 | fend off 553 | start over 554 | sit up 555 | sign on 556 | take down 557 | study up 558 | while away 559 | fold up 560 | cheer on 561 | bust out 562 | rate entity 563 | play down 564 | book up 565 | bind up 566 | stay on 567 | come about 568 | put up 569 | dine out 570 | have frequency 571 | store up 572 | give up 573 | vote down 574 | bring up 575 | tape up 576 | leave behind 577 | turn on 578 | save up 579 | break out 580 | wash up 581 | fork over 582 | hollow out 583 | freshen up 584 | screw over 585 | dash off 586 | have part 587 | mess up 588 | buy into 589 | burn out 590 | cave in 591 | lead up 592 | clear up 593 | cry down 594 | stand out 595 | turn away 596 | drown out 597 | run in 598 | cover up 599 | spill over 600 | die out 601 | farm out 602 | hand over 603 | poke around 604 | ride out 605 | come across 606 | give away 607 
| tack on 608 | bow out 609 | squeeze out 610 | write in 611 | show up 612 | come on 613 | fix up 614 | sew up 615 | fort up 616 | do away 617 | liven up 618 | scrunch up 619 | log on 620 | ham up 621 | look down 622 | firm up 623 | tally up 624 | tool up 625 | weigh in 626 | flare up 627 | strike down 628 | thin out 629 | blast away 630 | reel off 631 | feed up 632 | camp out 633 | well off 634 | crop up 635 | be like 636 | open up 637 | link up 638 | lick up 639 | look up 640 | statistical test 641 | charge off 642 | drop out 643 | keep up 644 | tick off 645 | tune in 646 | write down 647 | bat in 648 | stay over 649 | gas up 650 | pick up 651 | cook up 652 | boil down 653 | pull through 654 | call off 655 | pop off 656 | hand out 657 | push up 658 | fritter away 659 | trail off 660 | chop up 661 | rear end 662 | fuck around 663 | rattle on 664 | tire out 665 | street address 666 | keep on 667 | pack away 668 | keg stand 669 | close off 670 | lose out 671 | wring out 672 | make believe 673 | soak up 674 | tee off 675 | shake up 676 | scent out 677 | steer clear 678 | have instrument 679 | tear up 680 | feel up 681 | live down 682 | bowl over 683 | step in 684 | hobnob around 685 | bow down 686 | buzz off 687 | tangle up 688 | catch on 689 | price out 690 | snap up 691 | live out 692 | touch base 693 | be done 694 | have li 695 | vomit up 696 | clean out 697 | laid back 698 | buckle down 699 | slip in 700 | swear in 701 | stall off 702 | shoot down 703 | be from 704 | serve up 705 | join up 706 | back up 707 | well up 708 | pull up 709 | put down 710 | wash down 711 | dish out 712 | age out 713 | fight back 714 | bring down 715 | run up 716 | zip up 717 | switch over 718 | spend down 719 | call up 720 | be polite 721 | pop up 722 | fall apart 723 | net out 724 | jut out 725 | wind up 726 | rent out 727 | cross out 728 | rough up 729 | broke ass 730 | dredge up 731 | wait out 732 | shuffle off 733 | build up 734 | box in 735 | shake off 736 | cool off 737 | get on 738 | hit on 739 | straighten up 740 | start off 741 | belch out 742 | lie down 743 | play up 744 | give out 745 | haul in 746 | hard put 747 | make up 748 | snap off 749 | follow suit 750 | pass away 751 | smooth over 752 | hole up 753 | turn out 754 | clog up 755 | sober up 756 | smash up 757 | contract out 758 | go over 759 | dope up 760 | bed down 761 | sit out 762 | hype up 763 | drop in 764 | put off 765 | ward off 766 | get together 767 | turn down 768 | back off 769 | swoop up 770 | out trade 771 | size up 772 | pull off 773 | conjure up 774 | stock up 775 | sleep away 776 | monkey around 777 | break away 778 | pile up 779 | put in 780 | dream up 781 | wrap up 782 | gum up 783 | bound up 784 | tuck away 785 | board up 786 | have purpose 787 | stick out 788 | fall out 789 | take aback 790 | chart out 791 | latch on 792 | belt out 793 | wear on 794 | muck up 795 | step aside 796 | lead off 797 | point out 798 | line up 799 | check in 800 | start in 801 | bunch up 802 | watch over 803 | fill in 804 | work out 805 | joke around 806 | hum along 807 | lock down 808 | wear out 809 | rip out 810 | bleed out 811 | come along 812 | play off 813 | show off 814 | have extent 815 | concrete over 816 | narrow down 817 | jack up 818 | stare down 819 | pipe up 820 | loosen up 821 | wear down 822 | bear up 823 | cover over 824 | have polarity 825 | mic up 826 | make do 827 | close over 828 | deck out 829 | blow out 830 | play to 831 | hammer away 832 | ration out 833 | sell off 834 | have name 835 | strike out 836 | shuttle off 837 | call 
in 838 | shrug off 839 | chalk up 840 | perk up 841 | knock down 842 | follow up 843 | pass over 844 | brush off 845 | drink up 846 | fly out 847 | close in 848 | grow up 849 | eat away 850 | have condition 851 | snatch away 852 | pick off 853 | stress out 854 | take on 855 | muddle up 856 | tuck in 857 | live on 858 | skip off 859 | look forward 860 | stir up 861 | bail out 862 | stand down 863 | close up 864 | run over 865 | throw up 866 | fuck off 867 | swallow up 868 | spill out 869 | fall back 870 | fight off 871 | rig up 872 | sweat off 873 | hide out 874 | divvy up 875 | flash back 876 | end up 877 | make it 878 | toss in 879 | round out 880 | sniff out 881 | grind up 882 | chip in 883 | cough up 884 | phase in 885 | let up 886 | water down 887 | hold on 888 | level off 889 | have value 890 | fit in 891 | yammer on 892 | key in 893 | hold off 894 | silt up 895 | get by 896 | split up 897 | make out 898 | look after 899 | rubber stamp 900 | sketch out 901 | pull over 902 | spruce up 903 | glass over 904 | add up 905 | mist up 906 | brush up 907 | wind down 908 | clutch on 909 | knock back 910 | pare down 911 | rule out 912 | fall through 913 | hack away 914 | asphalt over 915 | clean up 916 | pound out 917 | die down 918 | carry out 919 | fall over 920 | blow up 921 | weasel out 922 | break even 923 | -------------------------------------------------------------------------------- /data/lemma_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/lemma_dict -------------------------------------------------------------------------------- /data/ner_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/ner_dict -------------------------------------------------------------------------------- /data/non_rule_set: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/non_rule_set -------------------------------------------------------------------------------- /data/pos_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/pos_dict -------------------------------------------------------------------------------- /data/rel_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/rel_dict -------------------------------------------------------------------------------- /data/rule_f_without_jamr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/rule_f_without_jamr -------------------------------------------------------------------------------- /data/sensed_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/sensed_dict 
-------------------------------------------------------------------------------- /data/word_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/data/word_dict -------------------------------------------------------------------------------- /parser/DataIterator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Iterating over data set 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-30 9 | ''' 10 | from utility.constants import * 11 | from utility.data_helper import * 12 | import torch 13 | from torch.autograd import Variable 14 | import math 15 | from torch.nn.utils.rnn import PackedSequence 16 | from parser.modules.helper_module import MyPackedSequence 17 | from torch.nn.utils.rnn import pack_padded_sequence as pack 18 | import re 19 | end= re.compile(".txt\_[a-z]*") 20 | def rel_to_batch(rel_batch_p,rel_index_batch_p,data_iterator,dicts): 21 | lemma_dict,category_dict = dicts["lemma_dict"], dicts["category_dict"] 22 | 23 | data = [torch.LongTensor([[category_dict[uni.cat],lemma_dict[uni.le],0] for uni in uni_seq]) for uni_seq in rel_batch_p ] 24 | rel_index = [torch.LongTensor(index) for index in rel_index_batch_p] 25 | 26 | rel_batch,rel_index_batch,rel_lengths = data_iterator._batchify_rel_concept(data,rel_index) 27 | return MyPackedSequence(rel_batch,rel_lengths),rel_index_batch 28 | 29 | class DataIterator(object): 30 | 31 | def __init__(self, filePathes,opt,rel_dict,volatile = False ,all_data = None): 32 | self.cuda = opt.gpus[0] != -1 33 | self.volatile = volatile 34 | self.rel_dict = rel_dict 35 | self.all = [] 36 | self.opt = opt 37 | # break 38 | 39 | # self.all = sorted(self.all, key=lambda x: x[0]) 40 | self.src = [] 41 | self.tgt = [] 42 | self.align_index = [] 43 | self.rel_seq = [] 44 | self.rel_index = [] 45 | self.rel_mat = [] 46 | self.root = [] 47 | self.src_source = [] 48 | self.tgt_source = [] 49 | self.rel_tgt = [] 50 | if all_data: 51 | for data in all_data: 52 | self.read_sentence(data) 53 | self.batchSize = len(all_data) 54 | self.numBatches = 1 55 | else: 56 | 57 | for filepath in filePathes: 58 | n = self.readFile(filepath) 59 | self.batchSize = opt.batch_size 60 | self.numBatches = math.ceil(len(self.src)/self.batchSize) 61 | 62 | self.source_only = len(self.root) == 0 63 | 64 | def read_sentence(self,data): 65 | def role_mat_to_sparse(role_mat,rel_dict): 66 | index =[] 67 | value = [] 68 | for i,role_list in enumerate(role_mat): 69 | for role_index in role_list: 70 | if role_index[0] in rel_dict: 71 | index.append([i,role_index[1]]) 72 | value.append(rel_dict[role_index[0]]) 73 | size = torch.Size([len(role_mat),len(role_mat)]) 74 | v = torch.LongTensor(value) 75 | if len(v) == 0: 76 | i = torch.LongTensor([[0,0]]).t() 77 | v = torch.LongTensor([0]) 78 | return torch.sparse.LongTensor(i,v,size) 79 | 80 | i = torch.LongTensor(index).t() 81 | return torch.sparse.LongTensor(i,v,size) 82 | 83 | #src: length x n_feature 84 | 85 | self.src.append(torch.LongTensor([data["snt_id"],data["lemma_id"],data["pos_id"],data["ner_id"]]).t().contiguous()) 86 | 87 | #source 88 | 89 | self.src_source.append([data["tok"],data["lem"],data["pos"],data["ner"]]) 90 | 91 | #tgt: length x n_feature 92 | # print (data["amr_id"]) 93 | if "amr_id" in data: 94 | self.tgt.append(torch.LongTensor(data["amr_id"])) # lemma,cat, 
lemma_sense,ner,is_high 95 | self.align_index.append(data["index"]) 96 | 97 | amrl = len(data["amr_id"]) 98 | for i in data["amr_rel_index"]: 99 | assert i 0, (data,rel_index) 151 | second = max([x.size(1) for x in data]) 152 | total = sum(lengths) 153 | out = data[0].new(total, second) 154 | out_index = [] 155 | current = 0 156 | for i in range(len(data)): 157 | data_t = data[i].clone() 158 | out.narrow(0, current, lengths[i]).copy_(data_t) 159 | index_t = rel_index[i].clone() 160 | if self.cuda: 161 | index_t = index_t.cuda() 162 | out_index.append(Variable(index_t,volatile=self.volatile,requires_grad = False)) 163 | # out_index.append(index_t) 164 | current += lengths[i] 165 | out = Variable(out,volatile=self.volatile,requires_grad = False) 166 | 167 | if self.cuda: 168 | out = out.cuda() 169 | return out,out_index,lengths 170 | 171 | 172 | #rel_mat: batch_size x var(len) x var(len) 173 | #rel_index: batch_size x var(len) 174 | 175 | #out : (batch_size x var(len) x var(len)) 176 | def _batchify_rel_roles(self, all_data ): 177 | length_squares = [x.size(0)**2 for x in all_data] 178 | total = sum(length_squares) 179 | out = torch.LongTensor(total) 180 | current = 0 181 | for i in range(len(all_data)): 182 | data_t = all_data[i].to_dense().clone().view(-1) 183 | out.narrow(0, current, length_squares[i]).copy_(data_t) 184 | current += length_squares[i] 185 | 186 | out = Variable(out,volatile=self.volatile,requires_grad = False) 187 | if self.cuda: 188 | out = out.cuda() 189 | 190 | return out,length_squares 191 | 192 | 193 | #data: batch_size x var(len) x n_feature 194 | #out : batch_size x tgt_len x n_feature 195 | def _batchify_tgt(self, data,max_src ): 196 | lengths = [x.size(0) for x in data] 197 | max_length = max(max(x.size(0) for x in data),max_src) #if y, we need max_x 198 | out = data[0].new(len(data), max_length,data[0].size(1)).fill_(PAD) 199 | for i in range(len(data)): 200 | data_t = data[i].clone() 201 | data_length = data[i].size(0) 202 | out[i].narrow(0, 0, data_length).copy_(data_t) 203 | return out 204 | 205 | #data: batch_size x var(len) x n_feature 206 | #out : batch_size x src_len x n_feature 207 | def _batchify_src(self, data,max_length ): 208 | out = data[0].new(len(data), max_length,data[0].size(1)).fill_(PAD) 209 | 210 | for i in range(len(data)): 211 | data_t = data[i].clone() 212 | data_length = data[i].size(0) 213 | out[i].narrow(0, 0, data_length).copy_(data_t) 214 | 215 | return out 216 | 217 | def getLengths(self,index): 218 | src_data = self.src[index*self.batchSize:(index+1)*self.batchSize] 219 | src_lengths = [x.size(0) for x in src_data] 220 | if self.source_only: 221 | return src_lengths,max(src_lengths) 222 | 223 | tgt_data = self.tgt[index*self.batchSize:(index+1)*self.batchSize] 224 | tgt_lengths = [x.size(0) for x in tgt_data] 225 | lengths = [] 226 | for i,l in enumerate(src_lengths): 227 | lengths.append(max(l,tgt_lengths[i])) 228 | return lengths,max(lengths) 229 | 230 | def __getitem__(self, index): 231 | assert index < self.numBatches, "%d > %d" % (index, self.numBatches) 232 | lengths,max_len = self.getLengths(index ) 233 | def wrap(b,l ): 234 | #batch, len, feature 235 | if b is None: 236 | return b 237 | b = torch.stack(b, 0).transpose(0,1).contiguous() 238 | if self.cuda: 239 | b = b.cuda() 240 | packed = pack(b,list(l)) 241 | return PackedSequence(Variable(packed[0], volatile=self.volatile,requires_grad = False),packed[1]) 242 | 243 | def wrap_align(b,l ): 244 | #batch, len_tgt, len_src 245 | if b is None: 246 | return b 247 | b = 
torch.stack(b, 0).transpose(0,1).contiguous().float() 248 | if self.cuda: 249 | b = b.cuda() 250 | packed = pack(b,list(l)) 251 | return PackedSequence(Variable(packed[0], volatile=self.volatile,requires_grad = False),packed[1]) 252 | 253 | srcBatch = self._batchify_src( 254 | self.src[index*self.batchSize:(index+1)*self.batchSize],max_len) 255 | 256 | if self.source_only: 257 | src_sourceBatch = self.src_source[index*self.batchSize:(index+1)*self.batchSize] 258 | 259 | batch = zip( srcBatch,src_sourceBatch) 260 | lengths,max_len = self.getLengths(index ) 261 | order_data = sorted(list(enumerate(list(zip(batch, lengths)))),key = lambda x:-x[1][1]) 262 | order,data = zip(*order_data) 263 | batch, lengths = zip(*data) 264 | srcBatch,src_sourceBatch = zip(*batch) 265 | return order,wrap(srcBatch,lengths),src_sourceBatch 266 | 267 | else: 268 | tgtBatch = self._batchify_tgt( 269 | self.tgt[index*self.batchSize:(index+1)*self.batchSize],max_len) 270 | alignBatch = self._batchify_align( 271 | self.align_index[index*self.batchSize:(index+1)*self.batchSize],max_len) 272 | 273 | rel_seq_pre = self.rel_seq[index*self.batchSize:(index+1)*self.batchSize] 274 | rel_index_pre = self.rel_index[index*self.batchSize:(index+1)*self.batchSize] 275 | rel_role_pre = self.rel_mat[index*self.batchSize:(index+1)*self.batchSize] 276 | 277 | # roots = Variable(torch.IntTensor(self.root[index*self.batchSize:(index+1)*self.batchSize]),volatile = True) 278 | roots =self.root[index*self.batchSize:(index+1)*self.batchSize] 279 | 280 | src_sourceBatch = self.src_source[index*self.batchSize:(index+1)*self.batchSize] 281 | tgt_sourceBatch = self.tgt_source[index*self.batchSize:(index+1)*self.batchSize] 282 | sourceBatch = [ src_s +tgt_s for src_s,tgt_s in zip(src_sourceBatch,tgt_sourceBatch)] 283 | # within batch sorting by decreasing length for variable length rnns 284 | indices = range(len(srcBatch)) 285 | 286 | batch = zip(indices, srcBatch ,tgtBatch,alignBatch,rel_seq_pre,rel_index_pre,rel_role_pre,sourceBatch,roots) 287 | order_data = sorted(list(enumerate(list(zip(batch, lengths)))),key = lambda x:-x[1][1]) 288 | order,data = zip(*order_data) 289 | batch, lengths = zip(*data) 290 | indices, srcBatch,tgtBatch,alignBatch ,rel_seq_pre,rel_index_pre,rel_role_pre,sourceBatch,roots= zip(*batch) 291 | 292 | rel_batch,rel_index_batch,rel_lengths = self._batchify_rel_concept(rel_seq_pre,rel_index_pre) 293 | rel_roles,length_squares = self._batchify_rel_roles(rel_role_pre) 294 | 295 | 296 | #,wrap(charBatch)) 297 | return order,wrap(srcBatch,lengths), wrap(tgtBatch,lengths), wrap_align(alignBatch,lengths),\ 298 | MyPackedSequence(rel_batch,rel_lengths),rel_index_batch,MyPackedSequence(rel_roles,length_squares),roots,sourceBatch 299 | 300 | def __len__(self): 301 | return self.numBatches 302 | 303 | 304 | def shuffle(self): 305 | # if True: return 306 | if self.source_only: #if data set if for testing 307 | data = list(zip(self.src,self.src_source)) 308 | self.src,self.src_source = zip(*[data[i] for i in torch.randperm(len(data))]) 309 | else: 310 | data = list(zip(self.src, self.tgt,self.align_index,self.rel_seq,self.rel_index,self.rel_mat,self.root,self.src_source,self.tgt_source)) 311 | self.src, self.tgt,self.align_index,self.rel_seq,self.rel_index,self.rel_mat,self.root,self.src_source,self.tgt_source = zip(*[data[i] for i in torch.randperm(len(data))]) 312 | 313 | -------------------------------------------------------------------------------- /parser/Dict.py: 
-------------------------------------------------------------------------------- 1 | from utility.amr import * 2 | from utility.data_helper import * 3 | import torch 4 | 5 | def seq_to_id(dictionary,seq): 6 | id_seq = [] 7 | freq_seq = [] 8 | for i in seq: 9 | id_seq.append(dictionary[i]) 10 | freq_seq.append(dictionary.frequencies[dictionary[i]]) 11 | return id_seq,freq_seq 12 | 13 | 14 | 15 | def read_dicts(): 16 | 17 | word_dict = Dict("data/word_dict") 18 | lemma_dict = Dict("data/lemma_dict") 19 | aux_dict = Dict("data/aux_dict") 20 | high_dict = Dict("data/high_dict") 21 | pos_dict = Dict("data/pos_dict") 22 | ner_dict = Dict("data/ner_dict") 23 | rel_dict = Dict("data/rel_dict") 24 | category_dict = Dict("data/category_dict") 25 | 26 | word_dict.load() 27 | lemma_dict.load() 28 | pos_dict.load() 29 | ner_dict.load() 30 | rel_dict.load() 31 | category_dict.load() 32 | high_dict.load() 33 | aux_dict.load() 34 | dicts = dict() 35 | 36 | dicts["rel_dict"] = rel_dict 37 | dicts["word_dict"] = word_dict 38 | dicts["pos_dict"] = pos_dict 39 | dicts["ner_dict"] = ner_dict 40 | dicts["lemma_dict"] = lemma_dict 41 | dicts["category_dict"] = category_dict 42 | dicts["aux_dict"] = aux_dict 43 | dicts["high_dict"] = high_dict 44 | return dicts 45 | 46 | class Dict(object): 47 | def __init__(self, fileName,dictionary=None): 48 | self.idxToLabel = {} 49 | self.labelToIdx = {} 50 | self.frequencies = {} 51 | 52 | # Special entries will not be pruned. 53 | self.special = [] 54 | 55 | if dictionary : 56 | for label in dictionary: 57 | self.labelToIdx[label] = dictionary[label][0] 58 | self.idxToLabel[dictionary[label][0]] = label 59 | self.frequencies[dictionary[label][0]] = dictionary[label][1] 60 | self.fileName = fileName 61 | 62 | 63 | 64 | def size(self): 65 | return len(self.idxToLabel) 66 | 67 | def __len__(self): 68 | return len(self.idxToLabel) 69 | 70 | # Load entries from a file. 71 | def load(self, filename =None): 72 | if filename: 73 | self.fileName = filename 74 | else: 75 | filename = self.fileName 76 | f = Pickle_Helper(filename) 77 | data = f.load() 78 | self.idxToLabel=data["idxToLabel"] 79 | self.labelToIdx=data["labelToIdx"] 80 | self.frequencies=data["frequencies"] 81 | 82 | # Write entries to a file. 83 | def save(self, filename =None): 84 | if filename: 85 | self.fileName = filename 86 | else: 87 | filename = self.fileName 88 | f = Pickle_Helper(filename) 89 | f.dump( self.idxToLabel,"idxToLabel") 90 | f.dump( self.labelToIdx,"labelToIdx") 91 | f.dump( self.frequencies,"frequencies") 92 | f.save() 93 | 94 | def lookup(self, key, default=None): 95 | try: 96 | return self.labelToIdx[key] 97 | except KeyError: 98 | if default: return default 99 | 100 | return self.labelToIdx[UNK_WORD] 101 | def __str__(self): 102 | out_str = [] 103 | for k in self.frequencies: 104 | if k not in self.special: 105 | out_str.append(self.idxToLabel[k]+": "+str(self.frequencies[k])) 106 | return " \n".join(out_str) 107 | def __getitem__(self, label,default=None): 108 | try: 109 | return self.labelToIdx[label] 110 | except KeyError: 111 | if default: return default 112 | 113 | return self.labelToIdx[UNK_WORD] 114 | 115 | def getLabel(self, idx, default=UNK_WORD): 116 | try: 117 | return self.idxToLabel[idx] 118 | except KeyError: 119 | return default 120 | 121 | def __iter__(self): return self.labelToIdx.__iter__() 122 | def __next__(self): return self.labelToIdx.__next__() 123 | # Mark this `label` and `idx` as special (i.e. will not be pruned). 
124 | def addSpecial(self, label, idx=None): 125 | idx = self.add(label, idx) 126 | self.special += [idx] 127 | 128 | # Mark all labels in `labels` as specials (i.e. will not be pruned). 129 | def addSpecials(self, labels): 130 | for label in labels: 131 | self.addSpecial(label) 132 | 133 | # Add `label` in the dictionary. Use `idx` as its index if given. 134 | def add(self, label, idx=None): 135 | if idx is not None: 136 | self.idxToLabel[idx] = label 137 | self.labelToIdx[label] = idx 138 | else: 139 | if label in self.labelToIdx: 140 | idx = self.labelToIdx[label] 141 | else: 142 | idx = len(self.idxToLabel) 143 | self.idxToLabel[idx] = label 144 | self.labelToIdx[label] = idx 145 | 146 | if idx not in self.frequencies: 147 | self.frequencies[idx] = 1 148 | else: 149 | self.frequencies[idx] += 1 150 | 151 | return idx 152 | 153 | def __setitem__(self, label, idx): 154 | self.add(label,idx) 155 | 156 | 157 | # Return a new dictionary with the `size` most frequent entries. 158 | def prune(self, size): 159 | if size >= self.size(): 160 | return self 161 | 162 | # Only keep the `size` most frequent entries. 163 | freq = torch.Tensor( 164 | [self.frequencies[i] for i in range(len(self.frequencies))]) 165 | _, idx = torch.sort(freq, 0, False) 166 | 167 | newDict = Dict(self.fileName) 168 | 169 | # Add special entries in all cases. 170 | for i in self.special: 171 | newDict.addSpecial(self.idxToLabel[i]) 172 | 173 | for i in idx[:size]: 174 | newDict.add(self.idxToLabel[i]) 175 | 176 | return newDict 177 | # Return a new dictionary with the `size` most frequent entries. 178 | def pruneByThreshold(self, threshold): 179 | # Only keep the `size` most frequent entries. 180 | high_freq = [ (self.frequencies[i],i) for i in range(len(self.frequencies)) if self.frequencies[i]>threshold] 181 | 182 | newDict = Dict(self.fileName) 183 | 184 | # Add special entries in all cases. 185 | for i in self.special: 186 | newDict.addSpecial(self.idxToLabel[i]) 187 | 188 | for freq,i in high_freq: 189 | newDict.add(self.idxToLabel[i]) 190 | newDict.frequencies[newDict.labelToIdx[self.idxToLabel[i]]] = freq 191 | 192 | return newDict 193 | # Convert `labels` to indices. Use `unkWord` if not found. 194 | # Optionally insert `bosWord` at the beginning and `eosWord` at the . 195 | def convertToIdx(self, labels, unkWord = UNK_WORD, bosWord=BOS_WORD, eosWord=EOS_WORD): 196 | vec = [] 197 | 198 | if bosWord is not None: 199 | vec += [self.lookup(bosWord)] 200 | 201 | unk = self.lookup(unkWord) 202 | vec += [self.lookup(label, default=unk) for label in labels] 203 | 204 | if eosWord is not None: 205 | vec += [self.lookup(eosWord)] 206 | 207 | return torch.LongTensor(vec) 208 | 209 | # Convert `idx` to labels. If index `stop` is reached, convert it and return. 
210 | def convertToLabels(self, idx, stop=[]): 211 | labels = [] 212 | 213 | for i in idx: 214 | if i in stop: 215 | break 216 | labels += [self.getLabel(i)] 217 | 218 | return labels -------------------------------------------------------------------------------- /parser/Optim.py: -------------------------------------------------------------------------------- 1 | import math,torch 2 | import torch.optim as optim 3 | import numpy as np 4 | class Optim(object): 5 | 6 | def _makeOptimizer(self): 7 | if self.method == 'sgd': 8 | self.optimizer = optim.SGD(self.params, lr=self.lr,weight_decay = 0) 9 | elif self.method == 'adagrad': 10 | self.optimizer = optim.Adagrad(self.params, lr=self.lr,weight_decay = 0) 11 | elif self.method == 'adadelta': 12 | self.optimizer = optim.Adadelta(self.params, lr=self.lr,weight_decay = 0) 13 | elif self.method == 'adam': 14 | self.optimizer = optim.Adam(self.params, betas=[0.9,0.9],lr=self.lr,weight_decay = 0) 15 | elif self.method == "RMSprop": 16 | self.optimizer = optim.RMSprop(self.params, lr=self.lr, weight_decay=0) 17 | 18 | else: 19 | raise RuntimeError("Invalid optim method: " + self.method) 20 | 21 | def __init__(self, params, method, lr, max_grad_norm, lr_decay=1, start_decay_at=None, weight_decay=0,perturb = 0): 22 | self.params = list(params) # careful: params may be a generator 23 | self.last_ppl = None 24 | self.lr = lr 25 | self.max_grad_norm = max_grad_norm 26 | self.method = method 27 | self.lr_decay = lr_decay 28 | self.start_decay_at = start_decay_at 29 | self.start_decay = False 30 | self.weight_decay = weight_decay 31 | self.weight_shirnk = 1.0 -weight_decay 32 | self._makeOptimizer() 33 | 34 | def step(self): 35 | # Compute gradients norm. 36 | grad_norm = 0 37 | for param in self.params: 38 | grad_norm += math.pow(param.grad.data.norm(), 2) 39 | 40 | grad_norm = math.sqrt(grad_norm) 41 | shrinkage = self.max_grad_norm / grad_norm 42 | nan_size = [] 43 | fine = [] 44 | for param in self.params: 45 | if shrinkage < 1: 46 | param.grad.data.mul_(shrinkage) 47 | # assert not np.isnan(np.sum(param.data.cpu().numpy())),("befotr optim\n",param) 48 | # if np.isnan(np.sum(param.grad.data.cpu().numpy())): 49 | # nan_size.append(param.grad.size()) 50 | # else: fine.append(param.grad.size()) 51 | if len(nan_size) > 0: 52 | print ("befotr optim grad explodes, abandon update, still weight_decay\n",fine) 53 | self.optimizer.step() 54 | for param in self.params: 55 | assert not np.isnan(np.sum(param.data.cpu().numpy())),("befotr shrink\n",param) 56 | param.data.mul_(self.weight_shirnk) #+ torch.normal(0,1e-3*torch.ones(param.data.size()).cuda()) 57 | assert not np.isnan(np.sum(param.data.cpu().numpy())),("after shrink\n",param) 58 | return grad_norm 59 | 60 | # decay learning rate if val perf does not improve or we hit the start_decay_at limit 61 | def updateLearningRate(self, ppl, epoch): 62 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 63 | self.start_decay = True 64 | if self.last_ppl is not None and ppl > self.last_ppl: 65 | self.start_decay = True 66 | 67 | if self.start_decay: 68 | self.lr = self.lr * self.lr_decay 69 | print("Decaying learning rate to %g" % self.lr) 70 | 71 | self.last_ppl = ppl 72 | 73 | self._makeOptimizer() 74 | 75 | -------------------------------------------------------------------------------- /parser/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import parser.models 3 | import parser.Optim 4 | import parser.AMRProcessors 
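The Dict class in parser/Dict.py above is the project's vocabulary container: add() counts frequencies, addSpecial() protects entries from pruning, prune()/pruneByThreshold() shrink the vocabulary, and convertToIdx()/convertToLabels() map between tokens and index tensors. A minimal usage sketch, assuming the special tokens UNK_WORD, BOS_WORD and EOS_WORD come from utility/constants.py as elsewhere in the repo; the toy sentence and the file path are illustrative only, not taken from the repo's own scripts.

# Hedged sketch of parser.Dict usage; toy data, not the project's build pipeline.
from parser.Dict import Dict
from utility.constants import UNK_WORD, BOS_WORD, EOS_WORD

d = Dict("data/word_dict")                      # path later used by save()/load()
d.addSpecials([UNK_WORD, BOS_WORD, EOS_WORD])   # special entries are never pruned

for tok in "the cat sat on the mat".split():
    d.add(tok)                                  # add() also accumulates frequencies

small = d.pruneByThreshold(1)                   # keep tokens seen more than once, plus specials
ids = d.convertToIdx(["the", "dog"])            # unknown "dog" falls back to the UNK_WORD index
print(small.size(), ids)                        # LongTensor with BOS/EOS wrapped around the tokens
d.save()                                        # pickles idxToLabel / labelToIdx / frequencies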
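Optim.step() above re-implements global gradient-norm clipping by hand: it sums the squared norms of all parameter gradients, rescales every gradient by max_grad_norm / grad_norm when that ratio is below one, runs the wrapped optimizer, and finally multiplies each weight by 1 - weight_decay (a decoupled weight shrink). A short equivalent sketch using the stock clip_grad_norm_ helper; the function name clipped_step and its arguments are mine, not part of the repo.

# Sketch only: mirrors the gradient handling of parser.Optim.Optim.step()
# with torch.nn.utils.clip_grad_norm_ instead of the manual loop.
import torch

def clipped_step(params, optimizer, max_grad_norm, weight_decay):
    params = [p for p in params if p.grad is not None]
    grad_norm = torch.nn.utils.clip_grad_norm_(params, max_grad_norm)  # global L2 clipping
    optimizer.step()
    with torch.no_grad():                        # decoupled weight shrink after the update
        for p in params:
            p.mul_(1.0 - weight_decay)
    return grad_norm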
-------------------------------------------------------------------------------- /parser/__pycache__/AMRProcessors.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/__pycache__/AMRProcessors.cpython-36.pyc -------------------------------------------------------------------------------- /parser/__pycache__/DataIterator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/__pycache__/DataIterator.cpython-36.pyc -------------------------------------------------------------------------------- /parser/__pycache__/Dict.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/__pycache__/Dict.cpython-36.pyc -------------------------------------------------------------------------------- /parser/__pycache__/Optim.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/__pycache__/Optim.cpython-36.pyc -------------------------------------------------------------------------------- /parser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /parser/models/ConceptModel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Deep Learning Models for concept identification 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-30 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | from parser.modules.helper_module import data_dropout 14 | from torch.nn.utils.rnn import PackedSequence 15 | from utility.constants import * 16 | 17 | 18 | class SentenceEncoder(nn.Module): 19 | def __init__(self, opt, embs): 20 | self.layers = opt.txt_enlayers 21 | self.num_directions = 2 if opt.brnn else 1 22 | assert opt.txt_rnn_size % self.num_directions == 0 23 | self.hidden_size = opt.txt_rnn_size // self.num_directions 24 | # inputSize = opt.word_dim*2 + opt.lemma_dim + opt.pos_dim +opt.ner_dim 25 | inputSize = embs["word_fix_lut"].embedding_dim + embs["lemma_lut"].embedding_dim\ 26 | +embs["pos_lut"].embedding_dim + embs["ner_lut"].embedding_dim 27 | 28 | super(SentenceEncoder, self).__init__() 29 | self.rnn = nn.LSTM(inputSize, self.hidden_size, 30 | num_layers=self.layers, 31 | dropout=opt.dropout, 32 | bidirectional=opt.brnn) 33 | 34 | 35 | self.lemma_lut = embs["lemma_lut"] 36 | 37 | self.word_fix_lut = embs["word_fix_lut"] 38 | 39 | 40 | self.pos_lut = embs["pos_lut"] 41 | 42 | self.ner_lut = embs["ner_lut"] 43 | 44 | self.drop_emb = nn.Dropout(opt.dropout) 45 | self.alpha = opt.alpha 46 | 47 | if opt.cuda: 48 | self.rnn.cuda() 49 | 50 | def forward(self, packed_input: PackedSequence,hidden=None): 51 | #input: pack(data x n_feature ,batch_size) 52 | input = 
packed_input.data 53 | if self.alpha and self.training: 54 | input = data_dropout(input,self.alpha) 55 | 56 | word_fix_embed = self.word_fix_lut(input[:,TXT_WORD]) 57 | lemma_emb = self.lemma_lut(input[:,TXT_LEMMA]) 58 | pos_emb = self.pos_lut(input[:,TXT_POS]) 59 | ner_emb = self.ner_lut(input[:,TXT_NER]) 60 | 61 | 62 | emb = self.drop_emb(torch.cat([lemma_emb,pos_emb,ner_emb],1))# data,embed 63 | emb = torch.cat([word_fix_embed,emb],1)# data,embed 64 | emb = PackedSequence(emb, packed_input.batch_sizes) 65 | outputs, hidden_t = self.rnn(emb, hidden) 66 | return outputs 67 | 68 | class Concept_Classifier(nn.Module): 69 | 70 | def __init__(self, opt, embs): 71 | super(Concept_Classifier, self).__init__() 72 | self.txt_rnn_size = opt.txt_rnn_size 73 | 74 | self.n_cat = embs["cat_lut"].num_embeddings 75 | self.n_high = embs["high_lut"].num_embeddings 76 | self.n_aux = embs["aux_lut"].num_embeddings 77 | 78 | self.cat_score =nn.Sequential( 79 | nn.Dropout(opt.dropout), 80 | nn.Linear(self.txt_rnn_size,self.n_cat,bias = opt.cat_bias)) 81 | 82 | self.le_score =nn.Sequential( 83 | nn.Dropout(opt.dropout), 84 | nn.Linear(self.txt_rnn_size,self.n_high+1,bias = opt.lemma_bias)) 85 | 86 | self.ner_score =nn.Sequential( 87 | nn.Dropout(opt.dropout), 88 | nn.Linear(self.txt_rnn_size,self.n_aux,bias = opt.cat_bias)) 89 | 90 | self.t = 1 91 | self.sm = nn.Softmax() 92 | if opt.cuda: 93 | self.cuda() 94 | 95 | 96 | 97 | def forward(self, src_enc ): 98 | ''' 99 | src_enc: pack(data x txt_rnn_size ,batch_size) 100 | src_le: pack(data x 1 ,batch_size) 101 | 102 | out: (datax n_cat, batch_size), (data x n_high+1,batch_size) 103 | ''' 104 | 105 | assert isinstance(src_enc,PackedSequence) 106 | 107 | 108 | # high_embs = self.high_lut.weight.expand(le_score.size(0),self.n_high,self.dim) 109 | # le_self_embs = self.lemma_lut(src_le.data).unsqueeze(1) 110 | # le_emb = torch.cat([high_embs,le_self_embs],dim=1) #data x high+1 x dim 111 | 112 | pre_enc =src_enc.data 113 | 114 | cat_score = self.cat_score(pre_enc) # n_data x n_cat 115 | ner_score = self.ner_score(pre_enc)# n_data x n_cat 116 | le_score = self.le_score (src_enc.data) 117 | le_prob = self.sm(le_score) 118 | cat_prob = self.sm(cat_score) 119 | ner_prob = self.sm(ner_score) 120 | batch_sizes = src_enc.batch_sizes 121 | return PackedSequence(cat_prob,batch_sizes),PackedSequence(le_prob,batch_sizes),PackedSequence(ner_prob,batch_sizes) 122 | 123 | class ConceptIdentifier(nn.Module): 124 | #could share encoder with other model 125 | def __init__(self, opt,embs,encoder = None): 126 | super(ConceptIdentifier, self).__init__() 127 | if encoder: 128 | self.encoder = encoder 129 | else: 130 | self.encoder = SentenceEncoder( opt, embs) 131 | self.generator = Concept_Classifier( opt, embs) 132 | 133 | 134 | def forward(self, srcBatch): 135 | src_enc = self.encoder(srcBatch) 136 | probBatch = self.generator(src_enc) 137 | return probBatch,src_enc 138 | -------------------------------------------------------------------------------- /parser/models/MultiPassRelModel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Deep Learning Models for relation identification 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-30 9 | ''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.autograd import Variable 14 | from parser.modules.helper_module import mypack 
,myunpack,MyPackedSequence,MyDoublePackedSequence,mydoubleunpack,mydoublepack,DoublePackedSequence,doubleunpack,data_dropout 15 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 16 | from torch.nn.utils.rnn import pack_padded_sequence as pack 17 | from torch.nn.utils.rnn import PackedSequence 18 | import torch.nn.functional as F 19 | from utility.constants import * 20 | 21 | 22 | 23 | #sentence encoder for root identification 24 | class RootSentenceEncoder(nn.Module): 25 | 26 | def __init__(self, opt, embs): 27 | self.layers = opt.root_enlayers 28 | self.num_directions = 2 if opt.brnn else 1 29 | assert opt.txt_rnn_size % self.num_directions == 0 30 | self.hidden_size = opt.rel_rnn_size // self.num_directions 31 | inputSize = embs["word_fix_lut"].embedding_dim + embs["lemma_lut"].embedding_dim\ 32 | +embs["pos_lut"].embedding_dim+embs["ner_lut"].embedding_dim 33 | 34 | super(RootSentenceEncoder, self).__init__() 35 | 36 | 37 | self.rnn =nn.LSTM(inputSize, self.hidden_size, 38 | num_layers=self.layers, 39 | dropout=opt.dropout, 40 | bidirectional=opt.brnn, 41 | batch_first=True) 42 | 43 | self.lemma_lut = embs["lemma_lut"] 44 | 45 | self.word_fix_lut = embs["word_fix_lut"] 46 | 47 | self.pos_lut = embs["pos_lut"] 48 | 49 | 50 | self.ner_lut = embs["ner_lut"] 51 | 52 | self.alpha = opt.alpha 53 | if opt.cuda: 54 | self.rnn.cuda() 55 | 56 | 57 | 58 | def forward(self, packed_input,hidden=None): 59 | #input: pack(data x n_feature ,batch_size) 60 | #posterior: pack(data x src_len ,batch_size) 61 | assert isinstance(packed_input,PackedSequence) 62 | input = packed_input.data 63 | 64 | if self.alpha and self.training: 65 | input = data_dropout(input,self.alpha) 66 | 67 | word_fix_embed = self.word_fix_lut(input[:,TXT_WORD]) 68 | lemma_emb = self.lemma_lut(input[:,TXT_LEMMA]) 69 | pos_emb = self.pos_lut(input[:,TXT_POS]) 70 | ner_emb = self.ner_lut(input[:,TXT_NER]) 71 | 72 | emb = torch.cat([word_fix_embed,lemma_emb,pos_emb,ner_emb],1)# data,embed 73 | 74 | emb = PackedSequence(emb, packed_input.batch_sizes) 75 | 76 | outputs = self.rnn(emb, hidden)[0] 77 | 78 | return outputs 79 | 80 | #combine amr node embedding and aligned sentence token embedding 81 | class RootEncoder(nn.Module): 82 | 83 | def __init__(self, opt, embs): 84 | self.layers = opt.amr_enlayers 85 | #share hyper parameter with relation model 86 | self.size = opt.rel_dim 87 | inputSize = embs["cat_lut"].embedding_dim + embs["lemma_lut"].embedding_dim+opt.rel_rnn_size 88 | super(RootEncoder, self).__init__() 89 | 90 | self.cat_lut = embs["cat_lut"] 91 | 92 | self.lemma_lut = embs["lemma_lut"] 93 | 94 | self.root = nn.Sequential( 95 | nn.Dropout(opt.dropout), 96 | nn.Linear(inputSize,self.size ), 97 | nn.ReLU() 98 | ) 99 | 100 | 101 | self.alpha = opt.alpha 102 | if opt.cuda: 103 | self.cuda() 104 | 105 | def getEmb(self,indexes,src_enc): 106 | head_emb,lengths = [],[] 107 | src_enc = myunpack(*src_enc) # pre_amr_l/src_l x batch x dim 108 | for i, index in enumerate(indexes): 109 | enc = src_enc[i] #src_l x dim 110 | head_emb.append(enc[index]) #var(amr_l x dim) 111 | lengths.append(len(index)) 112 | return mypack(head_emb,lengths) 113 | 114 | #input: all_data x n_feature, lengths 115 | #index: batch_size x var(amr_len) 116 | #src_enc (batch x amr_len) x src_len x txt_rnn_size 117 | 118 | #head: batch x var( amr_len x txt_rnn_size ) 119 | 120 | #dep : batch x var( amr_len x amr_len x txt_rnn_size ) 121 | 122 | #heads: [var(len),rel_dim] 123 | #deps: [var(len)**2,rel_dim] 124 | def forward(self, input, index,src_enc): 125 
| assert isinstance(input, MyPackedSequence),input 126 | input,lengths = input 127 | if self.alpha and self.training: 128 | input = data_dropout(input,self.alpha) 129 | cat_embed = self.cat_lut(input[:,AMR_CAT]) 130 | lemma_embed = self.lemma_lut(input[:,AMR_LE]) 131 | 132 | amr_emb = torch.cat([cat_embed,lemma_embed],1) 133 | # print (input,lengths) 134 | 135 | head_emb = self.getEmb(index,src_enc) #packed, mydoublepacked 136 | 137 | 138 | root_emb = torch.cat([amr_emb,head_emb.data],1) 139 | root_emb = self.root(root_emb) 140 | 141 | return MyPackedSequence(root_emb,lengths) 142 | 143 | #multi pass sentence encoder for relation identification 144 | class RelSentenceEncoder(nn.Module): 145 | 146 | def __init__(self, opt, embs): 147 | self.layers = opt.rel_enlayers 148 | self.num_directions = 2 if opt.brnn else 1 149 | assert opt.txt_rnn_size % self.num_directions == 0 150 | self.hidden_size = opt.rel_rnn_size // self.num_directions 151 | inputSize = embs["word_fix_lut"].embedding_dim + embs["lemma_lut"].embedding_dim\ 152 | +embs["pos_lut"].embedding_dim+embs["ner_lut"].embedding_dim+1 153 | super(RelSentenceEncoder, self).__init__() 154 | 155 | 156 | self.rnn =nn.LSTM(inputSize, self.hidden_size, 157 | num_layers=self.layers, 158 | dropout=opt.dropout, 159 | bidirectional=opt.brnn, 160 | batch_first=True) #first is for root 161 | 162 | self.lemma_lut = embs["lemma_lut"] 163 | 164 | self.word_fix_lut = embs["word_fix_lut"] 165 | 166 | self.pos_lut = embs["pos_lut"] 167 | 168 | 169 | self.ner_lut = embs["ner_lut"] 170 | 171 | self.alpha = opt.alpha 172 | if opt.cuda: 173 | self.rnn.cuda() 174 | 175 | def posteriorIndictedEmb(self,embs,posterior): 176 | #real alignment is sent in as list of index 177 | #variational relaxed posterior is sent in as MyPackedSequence 178 | 179 | #out (batch x amr_len) x src_len x (dim+1) 180 | embs,src_len = unpack(embs) 181 | 182 | if isinstance(posterior,MyPackedSequence): 183 | # print ("posterior is packed") 184 | posterior = myunpack(*posterior) 185 | embs = embs.transpose(0,1) 186 | out = [] 187 | lengths = [] 188 | amr_len = [len(p) for p in posterior] 189 | for i,emb in enumerate(embs): 190 | expanded_emb = emb.unsqueeze(0).expand([amr_len[i]]+[i for i in emb.size()]) # amr_len x src_len x dim 191 | indicator = posterior[i].unsqueeze(2) # amr_len x src_len x 1 192 | out.append(torch.cat([expanded_emb,indicator],2)) # amr_len x src_len x (dim+1) 193 | lengths = lengths + [src_len[i]]*amr_len[i] 194 | data = torch.cat(out,dim=0) 195 | 196 | return pack(data,lengths,batch_first=True),amr_len 197 | elif isinstance(posterior,list): 198 | embs = embs.transpose(0,1) 199 | src_l = embs.size(1) 200 | amr_len = [len(i) for i in posterior] 201 | out = [] 202 | lengths = [] 203 | for i,emb in enumerate(embs): 204 | amr_l = len(posterior[i]) 205 | expanded_emb = emb.unsqueeze(0).expand([amr_l]+[i for i in emb.size()]) # amr_len x src_len x dim 206 | indicator = emb.data.new(amr_l,src_l).zero_() 207 | indicator.scatter_(1, posterior[i].data.unsqueeze(1), 1.0) # amr_len x src_len x 1 208 | indicator = Variable(indicator.unsqueeze(2)) 209 | out.append(torch.cat([expanded_emb,indicator],2)) # amr_len x src_len x (dim+1) 210 | lengths = lengths + [src_len[i]]*amr_l 211 | data = torch.cat(out,dim=0) 212 | 213 | return pack(data,lengths,batch_first=True),amr_len 214 | 215 | 216 | def forward(self, packed_input, packed_posterior,hidden=None): 217 | #input: pack(data x n_feature ,batch_size) 218 | #posterior: pack(data x src_len ,batch_size) 219 | assert 
isinstance(packed_input,PackedSequence) 220 | input = packed_input.data 221 | 222 | if self.alpha and self.training: 223 | input = data_dropout(input,self.alpha) 224 | word_fix_embed = self.word_fix_lut(input[:,TXT_WORD]) 225 | lemma_emb = self.lemma_lut(input[:,TXT_LEMMA]) 226 | pos_emb = self.pos_lut(input[:,TXT_POS]) 227 | ner_emb = self.ner_lut(input[:,TXT_NER]) 228 | 229 | emb = torch.cat([word_fix_embed,lemma_emb,pos_emb,ner_emb],1)# data,embed 230 | 231 | emb = PackedSequence(emb, packed_input.batch_sizes) 232 | poster_emb,amr_len = self.posteriorIndictedEmb(emb,packed_posterior) 233 | 234 | Outputs = self.rnn(poster_emb, hidden)[0] 235 | 236 | return DoublePackedSequence(Outputs,amr_len,Outputs.data) 237 | 238 | 239 | #combine amr node embedding and aligned sentence token embedding 240 | class RelEncoder(nn.Module): 241 | 242 | def __init__(self, opt, embs): 243 | super(RelEncoder, self).__init__() 244 | 245 | self.layers = opt.amr_enlayers 246 | 247 | self.size = opt.rel_dim 248 | inputSize = embs["cat_lut"].embedding_dim + embs["lemma_lut"].embedding_dim+opt.rel_rnn_size 249 | 250 | self.head = nn.Sequential( 251 | nn.Dropout(opt.dropout), 252 | nn.Linear(inputSize,self.size ) 253 | ) 254 | 255 | self.dep = nn.Sequential( 256 | nn.Dropout(opt.dropout), 257 | nn.Linear(inputSize,self.size ) 258 | ) 259 | 260 | self.cat_lut = embs["cat_lut"] 261 | 262 | self.lemma_lut = embs["lemma_lut"] 263 | self.alpha = opt.alpha 264 | 265 | if opt.cuda: 266 | self.cuda() 267 | 268 | def getEmb(self,indexes,src_enc): 269 | head_emb,dep_emb = [],[] 270 | src_enc,src_l = doubleunpack(src_enc) # batch x var(amr_l x src_l x dim) 271 | length_pairs = [] 272 | for i, index in enumerate(indexes): 273 | enc = src_enc[i] #amr_l src_l dim 274 | dep_emb.append(enc.index_select(1,index)) #var(amr_l x amr_l x dim) 275 | head_index = index.unsqueeze(1).unsqueeze(2).expand(enc.size(0),1,enc.size(-1)) 276 | # print ("getEmb",enc.size(),dep_index.size(),head_index.size()) 277 | head_emb.append(enc.gather(1,head_index).squeeze(1)) #var(amr_l x dim) 278 | length_pairs.append([len(index),len(index)]) 279 | return mypack(head_emb,[ls[0] for ls in length_pairs]),mydoublepack(dep_emb,length_pairs),length_pairs 280 | 281 | #input: all_data x n_feature, lengths 282 | #index: batch_size x var(amr_len) 283 | #src_enc (batch x amr_len) x src_len x txt_rnn_size 284 | 285 | #head: batch x var( amr_len x txt_rnn_size ) 286 | 287 | #dep : batch x var( amr_len x amr_len x txt_rnn_size ) 288 | 289 | #heads: [var(len),rel_dim] 290 | #deps: [var(len)**2,rel_dim] 291 | def forward(self, input, index,src_enc): 292 | assert isinstance(input, MyPackedSequence),input 293 | input,lengths = input 294 | if self.alpha and self.training: 295 | input = data_dropout(input,self.alpha) 296 | cat_embed = self.cat_lut(input[:,AMR_CAT]) 297 | lemma_embed = self.lemma_lut(input[:,AMR_LE]) 298 | 299 | amr_emb = torch.cat([cat_embed,lemma_embed],1) 300 | # print (input,lengths) 301 | 302 | head_emb_t,dep_emb_t,length_pairs = self.getEmb(index,src_enc) #packed, mydoublepacked 303 | 304 | 305 | head_emb = torch.cat([amr_emb,head_emb_t.data],1) 306 | 307 | dep_amr_emb_t = myunpack(*MyPackedSequence(amr_emb,lengths)) 308 | dep_amr_emb = [ emb.unsqueeze(0).expand(emb.size(0),emb.size(0),emb.size(-1)) for emb in dep_amr_emb_t] 309 | 310 | mydouble_amr_emb = mydoublepack(dep_amr_emb,length_pairs) 311 | 312 | # print ("rel_encoder",mydouble_amr_emb.data.size(),dep_emb_t.data.size()) 313 | dep_emb = torch.cat([mydouble_amr_emb.data,dep_emb_t.data],-1) 314 | 
315 | # emb_unpacked = myunpack(emb,lengths) 316 | 317 | head_packed = MyPackedSequence(self.head(head_emb),lengths) # total,rel_dim 318 | head_amr_packed = MyPackedSequence(amr_emb,lengths) # total,rel_dim 319 | 320 | # print ("dep_emb",dep_emb.size()) 321 | size = dep_emb.size() 322 | dep = self.dep(dep_emb.view(-1,size[-1])).view(size[0],size[1],-1) 323 | 324 | dep_packed = MyDoublePackedSequence(MyPackedSequence(dep,mydouble_amr_emb[0][1]),mydouble_amr_emb[1],dep) 325 | 326 | return head_amr_packed,head_packed,dep_packed #,MyPackedSequence(emb,lengths) 327 | 328 | 329 | class RelModel(nn.Module): 330 | def __init__(self, opt,embs): 331 | super(RelModel, self).__init__() 332 | self.root_encoder = RootEncoder(opt,embs) 333 | self.encoder = RelEncoder( opt, embs) 334 | self.generator = RelCalssifierBiLinear( opt, embs,embs["rel_lut"].num_embeddings) 335 | 336 | self.root = nn.Linear(opt.rel_dim,1) 337 | self.LogSoftmax = nn.LogSoftmax() 338 | 339 | 340 | def root_score(self,mypackedhead): 341 | heads = myunpack(*mypackedhead) 342 | output = [] 343 | for head in heads: 344 | score = self.root(head).squeeze(1) 345 | output.append(self.LogSoftmax(score)) 346 | return output 347 | 348 | def forward(self, srlBatch, index,src_enc,root_enc): 349 | mypacked_root_enc = self.root_encoder(srlBatch, index,root_enc) #with information from le cat enc 350 | roots = self.root_score(mypacked_root_enc) 351 | 352 | encoded= self.encoder(srlBatch, index,src_enc) 353 | score_packed = self.generator(*encoded) 354 | 355 | return score_packed,roots #,arg_logit_packed 356 | 357 | 358 | class RelCalssifierBiLinear(nn.Module): 359 | 360 | def __init__(self, opt, embs,n_rel): 361 | super(RelCalssifierBiLinear, self).__init__() 362 | self.n_rel = n_rel 363 | self.cat_lut = embs["cat_lut"] 364 | self.inputSize = opt.rel_dim 365 | 366 | 367 | self.bilinear = nn.Sequential(nn.Dropout(opt.dropout), 368 | nn.Linear(self.inputSize,self.inputSize* self.n_rel)) 369 | self.head_bias = nn.Sequential(nn.Dropout(opt.dropout), 370 | nn.Linear(self.inputSize,self.n_rel)) 371 | self.dep_bias = nn.Sequential(nn.Dropout(opt.dropout), 372 | nn.Linear(self.inputSize,self.n_rel)) 373 | self.bias = nn.Parameter(torch.normal(torch.zeros(self.n_rel)).cuda()) 374 | 375 | 376 | # self.lsm = nn.LogSoftmax() 377 | self.cat_lut = embs["cat_lut"] 378 | self.lemma_lut = embs["lemma_lut"] 379 | if opt.cuda: 380 | self.cuda() 381 | 382 | def bilinearForParallel(self,inputs,length_pairs): 383 | output = [] 384 | ls = [] 385 | for i,input in enumerate(inputs): 386 | 387 | #head_t : amr_l x ( rel_dim x n_rel) 388 | #dep_t : amr_l x amr_l x rel_dim 389 | #head_bias : amr_l x n_rel 390 | #dep_bias : amr_l x amr_l x n_rel 391 | head_t,dep_t,head_bias,dep_bias = input 392 | l = len(head_t) 393 | ls.append(l) 394 | head_t = head_t.view(l,-1,self.n_rel) 395 | score =dep_t[:,:length_pairs[i][1]].bmm( head_t.view(l,-1,self.n_rel)).view(l,l,self.n_rel).transpose(0,1) 396 | 397 | dep_bias = dep_bias[:,:length_pairs[i][1]] 398 | score = score + dep_bias 399 | 400 | score = score + head_bias.unsqueeze(1).expand_as(score) 401 | score = score+self.bias.unsqueeze(0).unsqueeze(1).expand_as(score) 402 | score = F.log_softmax(score.view(ls[-1]*ls[-1],self.n_rel)) # - score.exp().sum(2,keepdim=True).log().expand_as(score) 403 | 404 | output.append(score.view(ls[-1]*ls[-1],self.n_rel)) 405 | return output,[l**2 for l in ls] 406 | 407 | 408 | def forward(self, _,heads,deps): 409 | '''heads.data: mypacked amr_l x rel_dim 410 | deps.data: mydoublepacked amr_l x amr_l x 
rel_dim 411 | ''' 412 | heads_data = heads.data 413 | deps_data = deps.data 414 | 415 | head_bilinear_transformed = self.bilinear (heads_data) #all_data x ( n_rel x inputsize) 416 | 417 | head_bias_unpacked = myunpack(self.head_bias(heads_data),heads.lengths) #[len x n_rel] 418 | 419 | size = deps_data.size() 420 | dep_bias = self.dep_bias(deps_data.view(-1,size[-1])).view(size[0],size[1],-1) 421 | 422 | dep_bias_unpacked,length_pairs = mydoubleunpack(MyDoublePackedSequence(MyPackedSequence( dep_bias,deps[0][1]),deps[1],dep_bias) ) #[len x n_rel] 423 | 424 | bilinear_unpacked = myunpack(head_bilinear_transformed,heads.lengths) 425 | 426 | deps_unpacked,length_pairs = mydoubleunpack(deps) 427 | output,l = self.bilinearForParallel( zip(bilinear_unpacked,deps_unpacked,head_bias_unpacked,dep_bias_unpacked),length_pairs) 428 | myscore_packed = mypack(output,l) 429 | 430 | # prob_packed = MyPackedSequence(myscore_packed.data,l) 431 | return myscore_packed -------------------------------------------------------------------------------- /parser/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Deep Learning Models for variational inference of alignment. 6 | Posterior , LikeliHood helps computing posterior weighted likelihood regarding relaxation. 7 | 8 | Also the whole AMR model is combined here. 9 | 10 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 11 | @since: 2018-05-30 12 | ''' 13 | 14 | import numpy as np 15 | from parser.models.ConceptModel import * 16 | from parser.models.MultiPassRelModel import * 17 | 18 | from parser.modules.GumbelSoftMax import renormalize,sink_horn,gumbel_noise_sample 19 | from parser.modules.helper_module import doublepack 20 | 21 | from copy import deepcopy 22 | 23 | #Encoding linearized AMR concepts for vartiaonal alignment model 24 | class AmrEncoder(nn.Module): 25 | 26 | def __init__(self, opt, embs): 27 | self.layers = opt.amr_enlayers 28 | self.num_directions = 2 if opt.brnn else 1 29 | assert opt.amr_rnn_size % self.num_directions == 0 30 | self.hidden_size = opt.amr_rnn_size // self.num_directions 31 | inputSize = embs["cat_lut"].embedding_dim + embs["lemma_lut"].embedding_dim 32 | super(AmrEncoder, self).__init__() 33 | 34 | self.rnn = nn.LSTM(inputSize, self.hidden_size, 35 | num_layers=opt.amr_enlayers, 36 | dropout=opt.dropout, 37 | bidirectional=opt.brnn) 38 | self.cat_lut = embs["cat_lut"] 39 | 40 | self.lemma_lut = embs["lemma_lut"] 41 | 42 | 43 | 44 | self.alpha = opt.alpha #unk with alpha 45 | if opt.cuda: 46 | self.cuda() 47 | 48 | #input:len, batch, n_feature 49 | #output: len, batch, hidden_size * num_directions 50 | def forward(self, packed_input, hidden=None): 51 | assert isinstance(packed_input,PackedSequence) 52 | input = packed_input.data 53 | 54 | if self.alpha and self.training: 55 | input = data_dropout(input,self.alpha) 56 | 57 | cat_embed = self.cat_lut(input[:,AMR_CAT]) 58 | lemma_embed = self.lemma_lut(input[:,AMR_LE]) 59 | 60 | emb = torch.cat([cat_embed,lemma_embed],1) # len,batch,embed 61 | emb = PackedSequence(emb, packed_input.batch_sizes) 62 | outputs, hidden_t = self.rnn(emb, hidden) 63 | return outputs, hidden_t 64 | 65 | #Model to compute relaxed posteior 66 | # we constraint alignment if copying mechanism can be used 67 | class Posterior(nn.Module): 68 | def __init__(self,opt): 69 | super(Posterior, self).__init__() 70 | self.txt_rnn_size = opt.txt_rnn_size 71 | self.amr_rnn_size = opt.amr_rnn_size 72 | 
self.jamr = opt.jamr 73 | if self.jamr : #if use fixed alignment, then no need for variational model 74 | return 75 | self.transform = nn.Sequential( 76 | nn.Dropout(opt.dropout), 77 | nn.Linear(self.txt_rnn_size,self.amr_rnn_size,bias = opt.lemma_bias)) 78 | self.sm = nn.Softmax() 79 | self.sink = opt.sink 80 | self.sink_t = opt.sink_t 81 | if opt.cuda: 82 | self.cuda() 83 | 84 | def forward(self,src_enc,amr_enc,aligns): 85 | 86 | '''src_enc: src_len x batch x txt_rnn_size, src_l 87 | amr_enc: amr_len x batch x amr_rnn_size, amr_l 88 | aligns: amr_len x batch x src_len , amr_l 89 | 90 | 91 | posterior: amr_len x batch x src_len , amr_l 92 | ''' 93 | if self.jamr : 94 | return aligns,aligns,0 95 | src_enc,amr_enc,aligns =unpack(src_enc),unpack(amr_enc),unpack(aligns) 96 | 97 | src_enc = src_enc[0] 98 | amr_enc = amr_enc[0] 99 | lengths = aligns[1] 100 | aligns = aligns[0] 101 | assert not np.isnan(np.sum(src_enc.data.cpu().numpy())),("src_enc \n",src_enc) 102 | assert not np.isnan(np.sum(amr_enc.data.cpu().numpy())),("amr_enc \n",amr_enc) 103 | src_len , batch , src_rnn_size = src_enc.size() 104 | src_transformed = self.transform(src_enc.view(-1,src_rnn_size)).view(src_len,batch,-1).transpose(0,1) #batch x src_len x amr_rnn_size 105 | amr_enc = amr_enc.transpose(0,1).transpose(1,2) #batch x amr_rnn_size x amr_len 106 | score = src_transformed.bmm(amr_enc).transpose(1,2).transpose(0,1) #/ self.amr_rnn_size #amr_len x batch x src_len 107 | assert not np.isnan(np.sum(score.data.cpu().numpy())),("score \n",score) 108 | final_score = gumbel_noise_sample(score) if self.training else score 109 | assert not np.isnan(np.sum(final_score.data.cpu().numpy())),("final_score \n",final_score) 110 | if self.sink: 111 | posterior = sink_horn((final_score- (1-aligns)*1e6 ,lengths),k=self.sink,t=self.sink_t ) 112 | else: 113 | final_score = final_score- (1-aligns)*1e6 114 | dim = final_score.size() 115 | final_score = final_score.view(-1, final_score.size(-1)) 116 | posterior =self.sm(final_score).view(dim) 117 | return pack(posterior, lengths),pack(score,lengths) #amr_len x batch x src_len 118 | 119 | #directly compute likelihood of concept being generated at words (a matrix for each training example) 120 | def LikeliHood(tgtBatch,probBatch): 121 | '''tgtBatch: data x [n_feature + 1 (AMR_CAN_COPY)], batch_sizes 122 | probaBatch: (data x n_out, lengths ) * 123 | aligns: amr_len x batch x src_len , amr_l 124 | 125 | likelihood: data (amr) x src_len , batch_sizes 126 | ''' 127 | 128 | batch_sizes = tgtBatch.batch_sizes 129 | likelihoods = [] 130 | for i,prob in enumerate(probBatch): 131 | assert isinstance(prob, PackedSequence),"only support packed" 132 | if i == AMR_LE: 133 | prob_batch,lengths = unpack(prob) 134 | prob_batch = prob_batch.transpose(0,1) # batch x src_len x n_out 135 | n_out = prob_batch.size(-1) 136 | src_len = prob_batch.size(1) 137 | packed_index_data = tgtBatch.data[:,i].clamp(max=n_out-1) #so lemma not in high maps to last index ,data x 1 138 | 139 | copy_data = (packed_index_data re-categorized_id 281 | # posterior: re-categorized_id -> alignment_soft_posterior 282 | rel_batch,rel_index,srcBatch,posterior = input 283 | assert not np.isnan(np.sum(posterior.data.data.cpu().numpy())),("posterior.data \n",posterior.data) 284 | posterior_data = renormalize(posterior.data+epsilon) 285 | assert not np.isnan(np.sum(posterior_data.data.cpu().numpy())),("posterior_data \n",posterior_data) 286 | posterior = PackedSequence(posterior_data,posterior.batch_sizes) 287 | indexed_posterior = 
self.index_posterior(posterior,rel_index) 288 | 289 | src_enc = self.rel_encoder(srcBatch,indexed_posterior) 290 | root_enc = self.root_encoder(srcBatch) 291 | 292 | weighted_root_enc = self.root_posterior_enc(posterior,root_enc) 293 | weighted_enc= self.weight_posterior_enc(posterior,src_enc) #src_enc MyDoublePackedSequence, amr_len 294 | 295 | # self_rel_index = [ Variable(index.data.new(list(range(len(index))))) for index in rel_index] 296 | rel_prob = self.relModel(rel_batch,rel_index,weighted_enc,weighted_root_enc) 297 | # assert not np.isnan(np.sum(rel_prob[0].data.data.cpu().numpy())),("inside srl\n",rel_prob[0].data.data) 298 | return rel_prob 299 | if len(input)==3 and rel: 300 | # relation identification evaluation 301 | rel_batch,srcBatch,alginBatch = input # 302 | src_enc = self.rel_encoder(srcBatch,alginBatch) 303 | root_enc = self.root_encoder(srcBatch) 304 | root_data,lengths = unpack(root_enc) 305 | mypacked_root_enc = mypack(root_data.transpose(0,1).contiguous(),lengths) 306 | rel_prob = self.relModel(rel_batch,alginBatch,src_enc,mypacked_root_enc) 307 | return rel_prob 308 | else: 309 | # concept identification evaluation 310 | srcBatch = input 311 | probBatch,src_enc= self.concept_decoder(srcBatch) 312 | return probBatch 313 | 314 | 315 | #encoding relaxation for root identification 316 | def root_posterior_enc(self,posterior,src_enc): 317 | '''src_enc: # batch x var( src_l x dim) 318 | posterior = pre_amr_len x batch x src_len , amr_l 319 | 320 | out: batch x amr_len x txt_rnn_size 321 | ''' 322 | posterior,lengths = unpack(posterior) 323 | enc,length_src = unpack(src_enc) 324 | # print ("length_pairs",length_pairs) 325 | # print ("lengths",lengths) 326 | weighted = [] 327 | for i, src_l in enumerate(length_src): #src_len x dim 328 | p = posterior[:,i,:src_l] #pre_full_amr_len x src_len 329 | enc_t = enc[:src_l,i,:] 330 | weighted_enc = p.mm(enc_t) #pre_amr_len x dim 331 | weighted.append(weighted_enc) #pre_amr_len x dim 332 | # print ("length_pairs",length_pairs) 333 | return mypack(weighted,lengths) 334 | 335 | #encoding relaxation for relation identification 336 | def weight_posterior_enc(self,posterior,src_enc): 337 | '''src_enc: # batch x var(pre_amr_len x src_l x dim) 338 | posterior = pre_amr_len x batch x src_len , amr_l 339 | 340 | out: batch x amr_len x txt_rnn_size 341 | ''' 342 | posterior,lengths = unpack(posterior) 343 | def handle_enc(enc): 344 | enc,length_pairs = doubleunpack(enc) 345 | # print ("length_pairs",length_pairs) 346 | # print ("lengths",lengths) 347 | dim = enc[0].size(-1) 348 | weighted = [] 349 | new_length_pairs = [] 350 | for i, src_enc_t in enumerate(enc): 351 | p = posterior[:lengths[i],i,:] #pre_amr_len x src_len 352 | enc_trans = src_enc_t.transpose(0,1).contiguous().view(p.size(-1),-1) #src_len x (pre_amr_len x dim) 353 | weighted_enc = p.mm(enc_trans) #pre_amr_len x (pre_amr_len x dim) 354 | weighted.append(weighted_enc.view(lengths[i],length_pairs[i][0],dim).transpose(0,1).contiguous()) #pre_amr_len x pre_amr_len x dim 355 | new_length_pairs.append([length_pairs[i][0],lengths[i]]) 356 | # print ("length_pairs",length_pairs) 357 | return doublepack(weighted,length_pairs) 358 | 359 | return handle_enc(src_enc) 360 | 361 | -------------------------------------------------------------------------------- /parser/models/__pycache__/ConceptModel.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/models/__pycache__/ConceptModel.cpython-36.pyc -------------------------------------------------------------------------------- /parser/models/__pycache__/MultiPassRelModel.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/models/__pycache__/MultiPassRelModel.cpython-36.pyc -------------------------------------------------------------------------------- /parser/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /parser/modules/GumbelSoftMax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Helper functions regarding gumbel noise 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-30 9 | ''' 10 | 11 | import torch 12 | from torch.autograd import Variable 13 | import torch.nn.functional as F 14 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 15 | from torch.nn.utils.rnn import pack_padded_sequence as pack 16 | from torch.nn.utils.rnn import PackedSequence 17 | 18 | eps = 1e-8 19 | def sample_gumbel(input): 20 | noise = torch.rand(input.size()).type_as(input.data) 21 | noise.add_(eps).log_().neg_() 22 | noise.add_(eps).log_().neg_() 23 | return Variable(noise,requires_grad=False) 24 | 25 | 26 | def gumbel_noise_sample(input,temperature = 1): 27 | noise = sample_gumbel(input) 28 | x = (input + noise) / temperature 29 | return x.view_as(input) 30 | 31 | 32 | import numpy as np 33 | 34 | def sink_horn(input,k = 5,t = 1,batch_first = False): 35 | def sink_horn_data(x,lengths): 36 | assert not np.isnan(np.sum(x.data.cpu().numpy())),("start x\n",x.data) 37 | over_flow = x-80*t 38 | x = x.clamp(max=80*t)+F.tanh(over_flow)*(over_flow>0).float() 39 | x = torch.exp(x/t) 40 | assert not np.isnan(np.sum(x.data.cpu().numpy())),("exp x\n",x.data) 41 | musks = torch.zeros(x.size()) 42 | for i,l in enumerate(lengths): 43 | musks[:l,i,:l] = 1 44 | musks = Variable(musks,requires_grad=False).type_as(x) 45 | x = x*musks+eps 46 | for i in range(0,k): 47 | x = x / x.sum(0,keepdim=True).expand_as(x) 48 | x = x*musks+eps 49 | x = x / x.sum(2,keepdim=True).expand_as(x) 50 | x = x*musks+eps 51 | 52 | assert not np.isnan(np.sum(x.data.cpu().numpy())),("end x\n",x.data) 53 | return x 54 | if isinstance(input,PackedSequence): 55 | data,l = unpack(input,batch_first=batch_first) 56 | data = sink_horn_data(data,l) 57 | return pack(data,l,batch_first) 58 | else: 59 | return sink_horn_data(*input) 60 | 61 | 62 | def renormalize(input,t=1): 63 | 64 | x = ((input+eps).log() ) / t 65 | x = F.softmax(x) 66 | return x.view_as(input) 67 | 68 | -------------------------------------------------------------------------------- /parser/modules/__initial__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/modules/__initial__.py 
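GumbelSoftMax.py just above supplies the relaxation machinery used by the variational alignment model: gumbel_noise_sample() perturbs alignment scores with Gumbel(0,1) noise during training, and sink_horn() runs a masked Sinkhorn normalization so that each concept's posterior over source tokens is approximately doubly normalized. In Posterior.forward the scores are additionally offset by -(1 - aligns) * 1e6 before this step so that copy-forbidden alignments receive negligible mass. A small sketch with random tensors standing in for encoder scores; the shapes follow the comments in Posterior.forward (amr_len x batch x src_len), and the length list is made up for the example.

# Illustrative call pattern for parser/modules/GumbelSoftMax.py; random data only.
import torch
from torch.autograd import Variable
from parser.modules.GumbelSoftMax import gumbel_noise_sample, sink_horn

amr_len, batch, src_len = 5, 2, 5
score = Variable(torch.randn(amr_len, batch, src_len))   # stand-in alignment scores
lengths = [5, 3]                                         # valid positions per example

noisy = gumbel_noise_sample(score)                       # score + Gumbel noise, temperature 1
posterior = sink_horn((noisy, lengths), k=5, t=1)        # k masked Sinkhorn iterations
print(posterior.sum(2)[0])                               # each concept's mass over source tokens is ~1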
-------------------------------------------------------------------------------- /parser/modules/__pycache__/GumbelSoftMax.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/modules/__pycache__/GumbelSoftMax.cpython-36.pyc -------------------------------------------------------------------------------- /parser/modules/__pycache__/helper_module.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/parser/modules/__pycache__/helper_module.cpython-36.pyc -------------------------------------------------------------------------------- /parser/modules/helper_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Some data structure to save memory for packing variable lengthed data into batch, 6 | Not actually sure whether it's better (time or space) than zero padding, 7 | 8 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 9 | @since: 2018-05-30 10 | ''' 11 | import torch 12 | from torch.autograd import Variable 13 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 14 | from torch.nn.utils.rnn import pack_padded_sequence as pack 15 | from collections import namedtuple 16 | MyPackedSequence = namedtuple('MyPackedSequence', ['data', 'lengths']) 17 | MyDoublePackedSequence = namedtuple('MyDoublePackedSequence', ['PackedSequence', 'length_pairs','data']) #packed sequence must be batch_first, inner length 18 | DoublePackedSequence = namedtuple('DoublePackedSequence', ['PackedSequence', 'outer_lengths','data']) #packed sequence must be batch_first, inner length 19 | 20 | def sort_index(seq): 21 | sorted([(v, i) for (i, v) in enumerate(seq)],reverse = True) 22 | 23 | def mypack(data,lengths): 24 | if isinstance(data,list): 25 | return MyPackedSequence(torch.cat(data,0),lengths) 26 | else: 27 | data_list = [] 28 | for i, l in enumerate(lengths): 29 | data_list.append(data[i][:l]) 30 | return mypack(data_list,lengths) 31 | 32 | 33 | def myunpack(*mypacked): 34 | data,lengths = mypacked 35 | data_list = [] 36 | current = 0 37 | for i, l in enumerate(lengths): 38 | data_list.append(data[current:l+current]) 39 | current += l 40 | return data_list 41 | 42 | def mydoubleunpack(mydoublepacked): 43 | packeddata,length_pairs,data = mydoublepacked 44 | data = myunpack(*packeddata) 45 | data_list = [] 46 | for i, ls in enumerate(length_pairs): 47 | out_l,in_l = ls 48 | data_list.append(data[i][:,:in_l]) #outl x max_l x dim 49 | return data_list,length_pairs 50 | 51 | 52 | def mydoublepack(data_list,length_pairs): #batch x var(amr_l x src_l x dim) 53 | data = [] 54 | max_in_l = max([ls[1] for ls in length_pairs]) 55 | outer_l = [] 56 | for d, ls in list(zip(data_list,length_pairs)): 57 | outl,inl = ls 58 | size = [i for i in d.size()] 59 | if size[1] == max_in_l: 60 | tdata = d 61 | else: 62 | size[1] = max_in_l 63 | tdata = Variable(d.data.new(*size).fill_(0)) 64 | # print (tdata) 65 | tdata[:,:inl] = d 66 | data.append( tdata) #amr_l x src_l x dim 67 | outer_l.append(outl) 68 | 69 | packed = mypack(data,outer_l) 70 | 71 | return MyDoublePackedSequence(packed,length_pairs,packed.data) 72 | 73 | def doubleunpack(doublepacked): 74 | assert isinstance(doublepacked,DoublePackedSequence) 75 | 
packeddata,outer_lengths,data = doublepacked 76 | data,in_l = unpack(packeddata,batch_first=True) 77 | data_list = [] 78 | length_pairs = [] 79 | current = 0 80 | for i, l in enumerate(outer_lengths): 81 | data_list.append(data[current:l+current]) #outl x max_l x dim 82 | length_pairs.append((l,in_l[current])) 83 | current += l 84 | return data_list,length_pairs 85 | 86 | 87 | def doublepack(data_list,length_pairs): #batch x var(amr_l x src_l x dim) 88 | data = [] 89 | lengths = [] 90 | max_in_l = max([ls[1] for ls in length_pairs]) 91 | outer_l = [] 92 | for d, ls in list(zip(data_list,length_pairs)): 93 | outl,inl = ls 94 | size = [i for i in d.size()] 95 | if size[1] == max_in_l: 96 | tdata = d 97 | else: 98 | size[1] = max_in_l 99 | tdata = Variable(d.data.new(*size).fill_(0)) 100 | # print (tdata) 101 | tdata[:,:inl] = d 102 | data.append( tdata) #amr_l x src_l x dim 103 | lengths = lengths + [inl]*outl 104 | outer_l.append(outl) 105 | 106 | packed = pack(torch.cat(data,0),lengths,batch_first=True) 107 | 108 | return DoublePackedSequence(packed,outer_l,packed.data) 109 | 110 | 111 | 112 | def data_dropout(data:Variable,frequency,UNK = 1)->Variable: 113 | if frequency == 0: return data 114 | if isinstance(frequency,Variable): 115 | f = frequency 116 | unk_mask = Variable(torch.bernoulli(f.data),requires_grad = False).cuda() 117 | data = data*(1-unk_mask).long()+(unk_mask*Variable(torch.ones(data.size()).cuda()*UNK,requires_grad = False)).long() 118 | else: 119 | f = torch.ones(data.size()).cuda()*frequency 120 | unk_mask = Variable(torch.bernoulli(f),requires_grad = False) 121 | data = data*(1-unk_mask).long()+(unk_mask*Variable(torch.ones(data.size()).cuda()*UNK,requires_grad = False)).long() 122 | return data 123 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | import torch.nn as nn 5 | def freeze(m,t=0): 6 | if isinstance(m,nn.Dropout): 7 | m.p = t 8 | m.dropout =t 9 | 10 | 11 | from copy import deepcopy 12 | def load_old_model(dicts,opt,generate=False): 13 | model_from = opt.train_from 14 | print('Loading from checkpoint at %s' % model_from) 15 | if opt.gpus[0] != -1: 16 | print ('from model in gpus:'+str(opt.from_gpus[0]),' to gpu:'+str(opt.gpus[0])) 17 | checkpoint = torch.load(model_from, map_location={'cuda:'+str(opt.from_gpus[0]): 'cuda:'+str(opt.gpus[0])}) 18 | else: 19 | print ('from model in gpus:'+str(opt.from_gpus[0]),'to cpu ') 20 | checkpoint = torch.load(model_from, map_location={'cuda:'+str(opt.from_gpus[0]): 'cpu'}) 21 | print("Model loaded") 22 | optt = checkpoint["opt"] 23 | rel = optt.rel 24 | AmrModel = checkpoint['model'] 25 | if optt.rel == 1: 26 | if not opt.train_all: 27 | AmrModel.concept_decoder = deepcopy(AmrModel.concept_decoder) 28 | for name, param in AmrModel.concept_decoder.named_parameters(): 29 | param.requires_grad = False 30 | AmrModel.concept_decoder.apply(freeze) 31 | 32 | parameters_to_train = [] 33 | for name, param in AmrModel.named_parameters(): 34 | if name == "word_fix_lut" or param.size(0) == len(dicts["word_dict"]): 35 | param.requires_grad = False 36 | if param.requires_grad: 37 | parameters_to_train.append(param) 38 | print (AmrModel) 39 | print ("training parameters: "+str(len(parameters_to_train))) 40 | return AmrModel,parameters_to_train,optt 41 | 42 | optt.rel = opt.rel 43 | if opt.rel and not rel : 44 | if opt.jamr == 0: 45 | AmrModel.poserior_m.align_weight = 1 46 | 
AmrModel.concept_decoder.apply(freeze) 47 | opt.independent = True 48 | AmrModel.start_rel(opt) 49 | embs = AmrModel.embs 50 | embs["lemma_lut"].requires_grad = False ##need load 51 | embs["pos_lut"].requires_grad = False 52 | embs["ner_lut"].requires_grad = False 53 | embs["word_fix_lut"].requires_grad = False 54 | embs["rel_lut"] = nn.Embedding(dicts["rel_dict"].size(), 55 | opt.rel_dim) 56 | for param in AmrModel.concept_decoder.parameters(): 57 | param.requires_grad = False 58 | if not generate and opt.jamr == 0: 59 | AmrModel.poserior_m.posterior.ST = opt.ST 60 | AmrModel.poserior_m.posterior.sink = opt.sink 61 | AmrModel.poserior_m.posterior.sink_t = opt.sink_t 62 | 63 | if opt.cuda: 64 | AmrModel.cuda() 65 | else: 66 | AmrModel.cpu() 67 | 68 | if not generate and opt.jamr == 0: 69 | if opt.train_posterior: 70 | for param in AmrModel.poserior_m.parameters(): 71 | param.requires_grad = True 72 | AmrModel.poserior_m.apply(lambda x: freeze(x,opt.dropout)) 73 | else: 74 | opt.prior_t = 0 75 | opt.sink_re = 0 76 | for param in AmrModel.poserior_m.parameters(): 77 | param.requires_grad = False 78 | parameters_to_train = [] 79 | if opt.train_all: 80 | for name, param in AmrModel.named_parameters(): 81 | if name != "word_fix_lut": 82 | param.requires_grad = True 83 | parameters_to_train.append(param) 84 | else: 85 | print ("not updating "+name) 86 | 87 | else: 88 | if opt.rel: 89 | for param in AmrModel.concept_decoder.parameters(): 90 | if param.requires_grad: 91 | param.requires_grad = False 92 | print("turing off concept model: ",param) 93 | for name,p in AmrModel.named_parameters(): 94 | if name == "word_fix_lut" or p.size(0) == len(dicts["word_dict"]): 95 | p.requires_grad = False 96 | if p.requires_grad: 97 | parameters_to_train.append(p) 98 | else: 99 | print ([p.size() for p in AmrModel.concept_decoder.parameters()]) 100 | AmrModel.apply(freeze) 101 | for p in AmrModel.concept_decoder.parameters(): 102 | p.requires_grad = True 103 | parameters_to_train.append(p) 104 | print (AmrModel) 105 | print ("training parameters: "+str(len(parameters_to_train))) 106 | return AmrModel,parameters_to_train,optt 107 | -------------------------------------------------------------------------------- /src/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/src/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/src/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /src/data_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Scripts build dictionary and data into numbers 6 | 7 | Data path information should also be specified here for 8 | trainFolderPath, devFolderPath and testFolderPath 9 | as we allow option to choose from two version of data. 
10 | 11 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 12 | @since: 2018-05-30 13 | ''' 14 | 15 | from utility.StringCopyRules import * 16 | from utility.ReCategorization import * 17 | from parser.Dict import * 18 | 19 | import argparse 20 | 21 | 22 | def data_build_parser(): 23 | parser = argparse.ArgumentParser(description='data_build.py') 24 | 25 | ## Data options 26 | parser.add_argument('-threshold', default=10, type=int, 27 | help="""threshold for high frequency concepts""") 28 | 29 | parser.add_argument('-jamr', default=0, type=int, 30 | help="""wheather to add .jamr at the end""") 31 | parser.add_argument('-skip', default=0, type=int, 32 | help="""skip dict build if dictionary already built""") 33 | parser.add_argument('-suffix', default=".txt_pre_processed", type=str, 34 | help="""suffix of files to combine""") 35 | parser.add_argument('-folder', default=allFolderPath, type=str, 36 | help="""the folder""") 37 | return parser 38 | 39 | 40 | parser = data_build_parser() 41 | 42 | opt = parser.parse_args() 43 | 44 | suffix = opt.suffix + "_jamr" if opt.jamr else opt.suffix 45 | with_jamr = "_with_jamr" if opt.jamr else "_without_jamr" 46 | trainFolderPath = opt.folder + "/training/" 47 | trainingFilesPath = folder_to_files_path(trainFolderPath, suffix) 48 | 49 | devFolderPath = opt.folder + "/dev/" 50 | devFilesPath = folder_to_files_path(devFolderPath, suffix) 51 | 52 | testFolderPath = opt.folder + "/test/" 53 | testFilesPath = folder_to_files_path(testFolderPath, suffix) 54 | 55 | 56 | def myamr_to_seq(amr, snt_token, lemma_token, pos, rl, fragment_to_node_converter, 57 | high_freq): # high_freq should be a dict() 58 | 59 | def uni_to_list(uni, can_copy=0): 60 | # if can_copy: print (uni) 61 | le = uni.le 62 | cat = uni.cat # use right category anyway 63 | ner = uni.aux 64 | data = [0, 0, 0, 0, 0] 65 | data[AMR_AUX] = ner 66 | data[AMR_LE_SENSE] = uni.sense 67 | data[AMR_LE] = le 68 | data[AMR_CAT] = cat 69 | data[AMR_CAN_COPY] = 1 if can_copy else 0 70 | return data 71 | 72 | output_concepts = [] 73 | lemma_str = " ".join(lemma_token) 74 | fragment_to_node_converter.convert(amr, rl, snt_token, lemma_token, lemma_str) 75 | concepts, rel, rel_prefix, root_id = amr.node_value(keys=["value", "align"], all=True) 76 | 77 | results = rl.get_matched_concepts(snt_token, concepts, lemma_token, pos, jamr=opt.jamr) 78 | aligned_index = [] 79 | n_amr = len(results) 80 | n_snt = len(snt_token) 81 | l = len(lemma_token) if lemma_token[-1] != "." 
else len(lemma_token) - 1 82 | 83 | # hello, linguistic prior here 84 | old_unaligned_index = [i for i in range(l) if not ( 85 | pos[i] in ["IN", "POS"] or lemma_token[i] == "would" or lemma_token[i] == "will" and pos[i] == "MD" 86 | or lemma_token[i] == "have" and pos[i] not in ["VB", "VBG"]) 87 | or lemma_token[i] in ["although", "while", "of", "if", "in", "per", "like", "by", "for"]] 88 | 89 | for i, n_c_a in enumerate(results): 90 | uni = n_c_a[1] 91 | align = [a[0] for a in n_c_a[2]] if len(n_c_a[2]) > 0 else old_unaligned_index 92 | aligned_index += align 93 | 94 | data = uni_to_list(uni, len(n_c_a[2]) > 0) 95 | data.append(align) 96 | output_concepts.append(data) 97 | if len(aligned_index) == 0: 98 | output_concepts[0][-1] = [int((len(lemma_token) - 1) / 2)] 99 | aligned_index = [int((len(lemma_token) - 1) / 2)] 100 | assert len(aligned_index) > 0, (results, amr._anno, " ".join(lemma_token)) 101 | unaligned_index = [i for i in range(n_snt) if i not in aligned_index] # or [-1 n_snt] for all 102 | if len(unaligned_index) == 0: unaligned_index = [-1, n_snt] 103 | # assert n_snt <= n_amr or unaligned_index != [],(n_amr,n_snt,concepts,snt_token,amr 104 | for i in range(n_amr, n_snt): 105 | output_concepts.append([NULL_WORD, NULL_WORD, NULL_WORD, NULL_WORD, 0, [-1, n_snt]]) # len(amr) >= len(snt) 106 | printed = False 107 | for i in range(len(output_concepts)): 108 | if output_concepts[i][-1] == []: 109 | if not printed: 110 | print(output_concepts[i]) 111 | print (list(zip(snt_token, lemma_token, pos))) 112 | print(concepts, amr) 113 | printed = True 114 | output_concepts[i][-1] = [-1, n_snt] 115 | 116 | rel_feature = [] 117 | rel_tgt = [] 118 | for i, (amr_index, role_list) in enumerate(rel): 119 | amr_concept = uni_to_list(amr_index[ 120 | 0]) # if align else uni_to_list(AMRUniversal(UNK_WORD,output_concepts[amr_index[1]][AMR_CAT],NULL_WORD)) 121 | rel_feature.append(amr_concept[:4] + [amr_index[1]]+[rel_prefix[i]]) 122 | # assert amr_index[1] < len(results), (concepts, rel) 123 | rel_tgt.append(role_list) # [role,rel_index] 124 | return output_concepts, [rel_feature, rel_tgt, root_id], unaligned_index # [[[lemma1,lemma2],category,relation]] 125 | 126 | 127 | def filter_non_aligned(input_concepts, rel, unaligned_index): 128 | rel_feature, rel_tgt, root_id = rel 129 | 130 | filtered_index = {} # original -> filtered 131 | 132 | output_concepts = [] 133 | for i, data in enumerate(input_concepts): 134 | if len(data[-1]) == 0: 135 | output_concepts.append( 136 | [NULL_WORD, NULL_WORD, NULL_WORD, NULL_WORD, 0, unaligned_index]) # len(amr) >= len(snt) 137 | elif len(data[-1]) == 1 or data[AMR_CAT] == NULL_WORD: 138 | output_concepts.append(data) 139 | filtered_index[i] = len(output_concepts) - 1 140 | else: 141 | assert False, (i, data, input_concepts, rel) 142 | out_rel_feature, out_rel_tgt = [], [] 143 | filtered_rel_index = {} # original -> filtered for dependency indexing 144 | for i, data in enumerate(rel_feature): 145 | index = data[-1] 146 | if index in filtered_index: 147 | new_index = filtered_index[index] 148 | out_rel_feature.append(data[:-1] + [new_index]) 149 | filtered_rel_index[i] = len(out_rel_feature) - 1 150 | 151 | for i, roles in enumerate(rel_tgt): 152 | if i in filtered_rel_index: 153 | new_roles = [[role, filtered_rel_index[j]] for role, j in roles if j in filtered_rel_index] 154 | out_rel_tgt.append(new_roles) 155 | 156 | if root_id not in filtered_rel_index: 157 | root_id = 0 158 | 159 | assert len(output_concepts) > 0, (input_concepts, rel, unaligned_index) 160 | 
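    # What is returned below (filter_non_aligned is only called when opt.jamr is set):
    # concepts JAMR left unaligned were replaced by NULL_WORD placeholders and kept out
    # of filtered_index, so relations touching them were dropped; surviving relation
    # features/targets were re-indexed through filtered_rel_index, and root_id falls
    # back to 0 whenever the original root is no longer among the kept nodes.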
161 | return output_concepts, [out_rel_feature, out_rel_tgt, root_id] 162 | 163 | 164 | def add_seq_to_dict(dictionary, seq): 165 | for i in seq: 166 | dictionary.add(i) 167 | 168 | 169 | def aligned(align_list): 170 | return align_list[0] == -1 171 | 172 | 173 | # id_seq : [(lemma,cat,lemma_sensed,ner])] 174 | def amr_seq_to_id(lemma_dict, category_dict, lemma_sensed_dict, aux_dict, amr_seq): 175 | id_seq = [] 176 | for l in amr_seq: 177 | data = [0] * 5 178 | data[AMR_CAT] = category_dict[l[AMR_CAT]] 179 | data[AMR_LE] = lemma_dict[l[AMR_LE]] 180 | data[AMR_AUX] = aux_dict[l[AMR_AUX]] 181 | data[AMR_SENSE] = sensed_dict[l[AMR_SENSE]] 182 | data[AMR_CAN_COPY] = l[AMR_CAN_COPY] 183 | id_seq.append(data) 184 | return id_seq 185 | 186 | 187 | def amr_seq_to_dict(lemma_dict, category_dict, sensed_dict, aux_dict, amr_seq): # le,cat,le_sense,ner,align 188 | for i in amr_seq: 189 | category_dict.add(i[AMR_CAT]) 190 | lemma_dict.add(i[AMR_LE]) 191 | aux_dict.add(i[AMR_NER]) 192 | sensed_dict.add(i[AMR_SENSE]) 193 | 194 | 195 | def rel_seq_to_dict(lemma_dict, category_dict, sensed_dict, rel_dict, rel): # (amr,index,[[role,amr,index]]) 196 | rel_feature, rel_tgt, root_id = rel 197 | for i in rel_feature: 198 | category_dict.add(i[AMR_CAT]) 199 | lemma_dict.add(i[AMR_LE]) 200 | # sensed_dict.add(i[AMR_SENSE]) 201 | for role_list in rel_tgt: 202 | for role_index in role_list: 203 | # assert (role_index[0]==":top"),rel_tgt 204 | rel_dict.add(role_index[0]) 205 | 206 | 207 | def rel_seq_to_id(lemma_dict, category_dict, sensed_dict, rel_dict, rel): 208 | rel_feature, rel_tgt, root_id = rel 209 | feature_seq = [] 210 | index_seq = [] 211 | prefix_seq = [] 212 | roles_mat = [] 213 | for l in rel_feature: 214 | data = [0] * 3 215 | data[0] = category_dict[l[AMR_CAT]] 216 | data[1] = lemma_dict[l[AMR_LE]] 217 | data[2] = sensed_dict[l[AMR_SENSE]] 218 | feature_seq.append(data) 219 | index_seq.append(l[-2]) 220 | prefix_seq.append(l[-1]) 221 | for role_list in rel_tgt: 222 | roles_id = [] 223 | for role_index in role_list: 224 | roles_id.append([role_index[0], role_index[1]]) 225 | roles_mat.append(roles_id) 226 | 227 | return feature_seq, index_seq, roles_mat, root_id,prefix_seq 228 | 229 | 230 | def handle_sentence(data, filepath, build_dict, n, word_only): 231 | if n % 1000 == 0: 232 | print (n) 233 | 234 | ner = data["ner"] 235 | snt_token = data["tok"] 236 | pos = data["pos"] 237 | lemma_token = data["lem"] 238 | amr_t = data["amr_t"] 239 | 240 | if build_dict: 241 | if word_only: 242 | add_seq_to_dict(word_dict, snt_token) 243 | else: 244 | add_seq_to_dict(word_dict, snt_token) 245 | add_seq_to_dict(lemma_dict, lemma_token) 246 | add_seq_to_dict(pos_dict, pos) 247 | add_seq_to_dict(ner_dict, ner) 248 | amr = AMRGraph(amr_t) 249 | amr_seq, rel, unaligned_index = myamr_to_seq(amr, snt_token, lemma_token, pos, rl, 250 | fragment_to_node_converter, high_freq) 251 | amr_seq_to_dict(lemma_dict, category_dict, sensed_dict, aux_dict, amr_seq) 252 | rel_seq_to_dict(lemma_dict, category_dict, sensed_dict, rel_dict, rel) 253 | else: 254 | amr = AMRGraph(amr_t) 255 | amr_seq, rel, unaligned_index = myamr_to_seq(amr, snt_token, lemma_token, pos, rl, fragment_to_node_converter, 256 | high_freq) 257 | if opt.jamr: 258 | amr_seq, rel = filter_non_aligned(amr_seq, rel, unaligned_index) 259 | data["snt_id"] = seq_to_id(word_dict, snt_token)[0] 260 | data["lemma_id"] = seq_to_id(lemma_dict, lemma_token)[0] 261 | data["pos_id"] = seq_to_id(pos_dict, pos)[0] 262 | data["ner_id"] = seq_to_id(ner_dict, ner)[0] 263 | 264 | l 
= len(data["pos_id"]) 265 | if not (l == len(data["snt_id"]) and l == len(data["lemma_id"]) and l == len(data["ner_id"])): 266 | print (l, len(data["snt_id"]), len(data["lemma_id"]), len(data["ner_id"])) 267 | print (data["pos_id"]) 268 | print (data["snt_id"]) 269 | print (data["lemma_id"]) 270 | print (data["ner_id"]) 271 | print (pos) 272 | print (snt_token) 273 | print (lemma_token) 274 | print (ner) 275 | print (data["snt"]) 276 | assert (False) 277 | data["amr_seq"] = amr_seq 278 | data["convertedl_seq"] = amr.node_value() 279 | data["rel_seq"], data["rel_triples"] = amr.get_gold() 280 | data["amr_id"] = amr_seq_to_id(lemma_dict, category_dict, sensed_dict, aux_dict, amr_seq) 281 | data["amr_rel_id"], data["amr_rel_index"], data["roles_mat"], data["root"],data["prefix"] = rel_seq_to_id(lemma_dict, 282 | category_dict, 283 | sensed_dict, 284 | rel_dict, rel) 285 | 286 | for i in data["amr_rel_index"]: 287 | assert i < len(data["amr_id"]), (data["amr_rel_index"], amr_seq, data["amr_id"]) 288 | data["index"] = [all[-1] for all in amr_seq] 289 | 290 | 291 | def readFile(filepath, build_dict=False, word_only=False): 292 | all_data = load_text_jamr(filepath) 293 | 294 | n = 0 295 | for data in all_data: 296 | n = n + 1 297 | handle_sentence(data, filepath, build_dict, n, word_only) 298 | if not build_dict: 299 | outfile = Pickle_Helper(re.sub(end, ".pickle" + with_jamr, filepath)) 300 | outfile.dump(all_data, "data") 301 | outfile.save() 302 | return len(all_data) 303 | 304 | 305 | # Creating ReUsable Object 306 | rl = rules() 307 | rl.load("data/rule_f" + with_jamr) 308 | # initializer = lasagne.init.Uniform() 309 | fragment_to_node_converter = ReCategorizor(from_file=True, path="data/graph_to_node_dict_extended" + with_jamr, 310 | training=False, auto_convert_threshold=opt.threshold) 311 | non_rule_set_f = Pickle_Helper("data/non_rule_set") 312 | non_rule_set = non_rule_set_f.load()["non_rule_set"] 313 | threshold = opt.threshold 314 | high_text_num, high_frequency, low_frequency, low_text_num = unmixe(non_rule_set, threshold) 315 | print ( 316 | "initial converted,threshold,len(non_rule_set),high_text_num,high_frequency,low_frequency,low_text_num,high_freq") 317 | high_freq = {**high_text_num, **high_frequency} 318 | 319 | # high_freq =high_frequency 320 | 321 | print ("initial converted", threshold, len(non_rule_set), len(high_text_num), len(high_frequency), len(low_frequency), 322 | len(low_text_num), len(high_freq)) 323 | 324 | 325 | def initial_dict(filename, with_unk=False): 326 | d = Dict(filename) 327 | d.addSpecial(NULL_WORD) 328 | if with_unk: 329 | d.addSpecial(UNK_WORD) 330 | # d.addSpecial(BOS_WORD) 331 | return d 332 | 333 | 334 | if not opt.skip: 335 | word_dict = initial_dict("data/word_dict", with_unk=True) 336 | pos_dict = initial_dict("data/pos_dict", with_unk=True) 337 | 338 | ner_dict = initial_dict("data/ner_dict", with_unk=True) # from stanford 339 | 340 | high_dict = initial_dict("data/high_dict", with_unk=True) 341 | 342 | lemma_dict = initial_dict("data/lemma_dict", with_unk=True) 343 | 344 | aux_dict = initial_dict("data/aux_dict", with_unk=True) 345 | 346 | rel_dict = initial_dict("data/rel_dict", with_unk=True) 347 | 348 | category_dict = initial_dict("data/category_dict", with_unk=True) 349 | sensed_dict = initial_dict("data/sensed_dict", with_unk=True) 350 | 351 | # print ("high freq") 352 | for uni in high_freq: 353 | le = uni.le 354 | lemma_dict.add(le) 355 | high_dict.add(le) 356 | # print (le,high_freq[uni][0]) 357 | 358 | for filepath in 
trainingFilesPath: 359 | print(("reading " + filepath.split("/")[-1] + "......")) 360 | n = readFile(filepath, build_dict=True) 361 | print(("done reading " + filepath.split("/")[-1] + ", " + str(n) + " sentences processed")) 362 | 363 | # only to allow fixed word embedding to be used for those data, alternatively we can build a huge word_embedding for all words from GLOVE... 364 | for filepath in devFilesPath: 365 | print(("reading " + filepath.split("/")[-1] + "......")) 366 | n = readFile(filepath, build_dict=True, word_only=True) 367 | print(("done reading " + filepath.split("/")[-1] + ", " + str(n) + " sentences processed")) 368 | 369 | for filepath in testFilesPath: 370 | print(("reading " + filepath.split("/")[-1] + "......")) 371 | n = readFile(filepath, build_dict=True, word_only=True) 372 | print(("done reading " + filepath.split("/")[-1] + ", " + str(n) + " sentences processed")) 373 | 374 | print ("len(aux_dict),len(rel_dict),threshold", len(aux_dict), len(rel_dict), threshold) 375 | 376 | rel_dict = rel_dict.pruneByThreshold(threshold) 377 | aux_dict = aux_dict.pruneByThreshold(threshold) 378 | category_dict = category_dict.pruneByThreshold(threshold) 379 | # print (rel_dict) 380 | word_dict.save() 381 | lemma_dict.save() 382 | pos_dict.save() 383 | aux_dict.save() 384 | ner_dict.save() 385 | high_dict.save() 386 | category_dict.save() 387 | rel_dict.save() 388 | sensed_dict.save() 389 | else: 390 | 391 | word_dict = Dict("data/word_dict") 392 | lemma_dict = Dict("data/lemma_dict") 393 | aux_dict = Dict("data/aux_dict") 394 | high_dict = Dict("data/high_dict") 395 | pos_dict = Dict("data/pos_dict") 396 | ner_dict = Dict("data/ner_dict") 397 | rel_dict = Dict("data/rel_dict") 398 | category_dict = Dict("data/category_dict") 399 | sensed_dict = Dict("data/sensed_dict") 400 | 401 | word_dict.load() 402 | lemma_dict.load() 403 | pos_dict.load() 404 | ner_dict.load() 405 | rel_dict.load() 406 | category_dict.load() 407 | high_dict.load() 408 | aux_dict.load() 409 | sensed_dict.save() 410 | 411 | fragment_to_node_converter = ReCategorizor(from_file=True, path="data/graph_to_node_dict_extended" + with_jamr, 412 | training=False, ner_cat_dict=aux_dict) 413 | print("dictionary building done") 414 | print("word_dict \t lemma_dict \tpos_dict \tner_dict \thigh_dict\tsensed_dict \tcategory_dict \taux_dict\trel_dict") 415 | print( 416 | len(word_dict), len(lemma_dict), len(pos_dict), len(ner_dict), len(high_dict), len(sensed_dict), len(category_dict), 417 | len(aux_dict), len(rel_dict)) 418 | 419 | print(("processing development set")) 420 | for filepath in devFilesPath: 421 | print(("reading " + filepath.split("/")[-1] + "......")) 422 | n = readFile(filepath, build_dict=False) 423 | print(("done reading " + filepath.split("/")[-1] + ", " + str(n) + " sentences processed")) 424 | 425 | print(("processing test set")) 426 | for filepath in testFilesPath: 427 | print(("reading " + filepath.split("/")[-1] + "......")) 428 | n = readFile(filepath, build_dict=False) 429 | 430 | print(("processing training set")) 431 | for filepath in trainingFilesPath: 432 | print(("reading " + filepath.split("/")[-1] + "......")) 433 | n = readFile(filepath, build_dict=False) 434 | print(("done reading " + filepath.split("/")[-1] + ", " + str(n) + " sentences processed")) 435 | 436 | print ("initial converted,threshold,len(non_rule_set),high_text_num,high_frequency,low_frequency,low_text_num") 437 | print ("initial converted", threshold, len(non_rule_set), len(high_text_num), len(high_frequency), 
len(low_frequency), 438 | len(low_text_num)) 439 | 440 | -------------------------------------------------------------------------------- /src/generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Scripts to run the model over preprocessed data to generate evaluatable results 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-30 9 | ''' 10 | 11 | from parser.DataIterator import DataIterator,rel_to_batch 12 | import parser 13 | import torch 14 | from torch import cuda 15 | from utility.Naive_Scores import * 16 | from parser.AMRProcessors import graph_to_amr 17 | from utility.data_helper import folder_to_files_path 18 | 19 | from src.train import read_dicts,load_old_model,train_parser 20 | 21 | def generate_parser(): 22 | parser = train_parser() 23 | parser.add_argument('-output', default="_generate") 24 | parser.add_argument('-with_graphs', type=int,default=1) 25 | return parser 26 | 27 | 28 | 29 | def generate_graph(model,AmrDecoder, data_set,dicts,file): 30 | 31 | concept_scores = concept_score_initial(dicts) 32 | 33 | rel_scores = rel_scores_initial() 34 | 35 | model.eval() 36 | AmrDecoder.eval() 37 | output = [] 38 | gold_file = [] 39 | for batchIdx in range(len(data_set)): 40 | order,srcBatch,_,_,_,_,_,gold_roots,sourceBatch =data_set[batchIdx] 41 | 42 | probBatch = model(srcBatch ) 43 | 44 | 45 | 46 | amr_pred_seq,concept_batches,aligns_raw,dependent_mark_batch = AmrDecoder.probAndSourceToAmr(sourceBatch,srcBatch,probBatch,getsense = opt.get_sense ) 47 | 48 | amr_pred_seq = [ [(uni.cat,uni.le,uni.aux,uni.sense,uni) for uni in seq ] for seq in amr_pred_seq ] 49 | 50 | 51 | rel_batch,aligns = rel_to_batch(concept_batches,aligns_raw,data_set,dicts) 52 | rel_prob,roots = model((rel_batch,srcBatch,aligns),rel=True) 53 | graphs,rel_triples = AmrDecoder.relProbAndConToGraph(concept_batches,rel_prob,roots,(dependent_mark_batch,aligns_raw),opt.get_sense,opt.get_wiki) 54 | batch_out = [0]*len(graphs) 55 | for score_h in rel_scores: 56 | if score_h.second_filter: 57 | t,p,tp = score_h.T_P_TP_Batch(rel_triples,list(zip(*sourceBatch))[5],second_filter_material = (concept_batches,list(zip(*sourceBatch))[4])) 58 | else: 59 | t,p,tp = score_h.T_P_TP_Batch(rel_triples,list(zip(*sourceBatch))[5]) 60 | for score_h in concept_scores: 61 | t,p,tp = score_h.T_P_TP_Batch(concept_batches,list(zip(*sourceBatch))[4]) 62 | for i,data in enumerate(zip( sourceBatch,amr_pred_seq,concept_batches,rel_triples,graphs)): 63 | source,amr_pred,concept, rel_triple,graph= data 64 | predicated_graph = graph_to_amr(graph) 65 | 66 | out = [] 67 | out.append( "# ::tok "+" ".join(source[0])+"\n") 68 | out.append( "# ::lem "+" ".join(source[1])+"\n") 69 | out.append( "# ::pos "+" ".join(source[2])+"\n") 70 | out.append( "# ::ner "+" ".join(source[3])+"\n") 71 | out.append( "# ::predicated "+" ".join([str(re_cat[-1]) for re_cat in amr_pred])+"\n") 72 | out.append( "# ::transformed final predication "+" ".join([str(c) for c in concept])+"\n") 73 | out.append( AmrDecoder.nodes_jamr(graph)) 74 | out.append( AmrDecoder.edges_jamr(graph)) 75 | out.append( predicated_graph) 76 | batch_out[order[i]] = "".join(out)+"\n" 77 | output += batch_out 78 | t_p_tp = list(map(lambda a,b:a+b, concept_scores[1].t_p_tp,rel_scores[1].t_p_tp)) 79 | total_out = "Smatch"+"\nT,P,TP: "+ " ".join([str(i) for i in t_p_tp])+"\nPrecesion,Recall,F1: "+ " ".join([str(i)for i in P_R_F1(*t_p_tp)]) 80 | print(total_out) 81 | for score_h 
in rel_scores: 82 | print("") 83 | print(score_h) 84 | file = file.replace(".pickle",".txt") 85 | with open(file+ opt.output, 'w+') as the_file: 86 | for data in output: 87 | the_file.write(data+'\n') 88 | print(file+ opt.output+" written.") 89 | return concept_scores,rel_scores,output 90 | 91 | 92 | def main(opt): 93 | dicts = read_dicts() 94 | assert opt.train_from 95 | with_jamr = "_with_jamr" if opt.jamr else "_without_jamr" 96 | suffix = ".pickle"+with_jamr+"_processed" 97 | trainFolderPath = opt.folder+"/training/" 98 | trainingFilesPath = folder_to_files_path(trainFolderPath,suffix) 99 | 100 | devFolderPath = opt.folder+"/dev/" 101 | devFilesPath = folder_to_files_path(devFolderPath,suffix) 102 | 103 | testFolderPath = opt.folder+"/test/" 104 | testFilesPath = folder_to_files_path(testFolderPath,suffix) 105 | 106 | 107 | 108 | AmrDecoder = parser.AMRProcessors.AMRDecoder(opt,dicts) 109 | AmrDecoder.eval() 110 | AmrModel,parameters,optt = load_old_model(dicts,opt,True) 111 | opt.start_epoch = 1 112 | 113 | out = "/".join(testFilesPath[0].split("/")[:-2])+ "/model" 114 | with open(out, 'w') as outfile: 115 | outfile.write(opt.train_from+"\n") 116 | outfile.write(str(AmrModel)+"\n") 117 | outfile.write(str(optt)+"\n") 118 | outfile.write(str(opt)) 119 | 120 | print('processing testing') 121 | for file in testFilesPath: 122 | dev_data = DataIterator([file],opt,dicts["rel_dict"],volatile = True) 123 | concept_scores,rel_scores,output =generate_graph(AmrModel,AmrDecoder,dev_data,dicts,file) 124 | 125 | print('processing validation') 126 | for file in devFilesPath: 127 | dev_data = DataIterator([file],opt,dicts["rel_dict"],volatile = True) 128 | concept_scores,rel_scores,output =generate_graph(AmrModel,AmrDecoder,dev_data,dicts,file) 129 | 130 | 131 | 132 | print('processing training') 133 | for file in trainingFilesPath: 134 | dev_data = DataIterator([file],opt,dicts["rel_dict"],volatile = True) 135 | concept_scores,rel_scores,output =generate_graph(AmrModel,AmrDecoder,dev_data,dicts,file) 136 | 137 | 138 | if __name__ == "__main__": 139 | print (" ") 140 | print (" ") 141 | global opt 142 | opt = generate_parser().parse_args() 143 | opt.lemma_dim = opt.dim 144 | opt.high_dim = opt.dim 145 | 146 | opt.cuda = len(opt.gpus) 147 | 148 | print(opt) 149 | 150 | if torch.cuda.is_available() and not opt.cuda: 151 | print("WARNING: You have a CUDA device, so you should probably run with -cuda") 152 | 153 | if opt.cuda and opt.gpus[0] != -1: 154 | cuda.set_device(opt.gpus[0]) 155 | main(opt) -------------------------------------------------------------------------------- /src/parse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | 6 | data["ner"] = [] 7 | data["tok"] = [] 8 | data["lem"] = [] 9 | data["pos"] = [] 10 | for snt_tok in snt: 11 | data["ner"].append(snt_tok['ner']) 12 | data["tok"].append(snt_tok['word']) 13 | data["lem"].append(snt_tok['lemma']) 14 | data["pos"].append(snt_tok['pos']) 15 | data["ner"].append(snt_tok['ner']) 16 | data["tok"].append(snt_tok['word']) 17 | data["lem"].append(snt_tok['lemma']) 18 | data["pos"].append(snt_tok['pos']) 19 | 20 | Scripts to run the model to parse a file. Input file should contain each sentence per line 21 | A file containing output will be generated at the same folder unless output is specified. 
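Example invocation (file and checkpoint names below are only placeholders; the model
checkpoint and dictionaries come through the usual options defined in src/train.py,
e.g. -train_from):

    python src/parse.py -train_from <model_checkpoint> -input sentences.txt
    python src/parse.py -train_from <model_checkpoint> -text "The boy wants to go."

With -input, one AMR block per sentence is written to <input>_parsed unless -output is given.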
22 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 23 | @since: 2018-05-30 24 | ''' 25 | 26 | from torch import cuda 27 | from parser.AMRProcessors import * 28 | from src.train import read_dicts,train_parser 29 | 30 | def generate_parser(): 31 | parser = train_parser() 32 | parser.add_argument('-output', default=None) 33 | parser.add_argument('-with_graphs', type=int,default=1) 34 | parser.add_argument("-input",default=None,type=str, 35 | help="""input file path""") 36 | parser.add_argument("-text",default=None,type=str, 37 | help="""a single sentence to parse""") 38 | parser.add_argument("-processed",default=0,type=int, 39 | help="""a single sentence to parse""") 40 | return parser 41 | 42 | if __name__ == "__main__": 43 | global opt 44 | opt = generate_parser().parse_args() 45 | opt.lemma_dim = opt.dim 46 | opt.high_dim = opt.dim 47 | 48 | opt.cuda = len(opt.gpus) 49 | 50 | if opt.cuda and opt.gpus[0] != -1: 51 | cuda.set_device(opt.gpus[0]) 52 | dicts = read_dicts() 53 | processed = opt.processed==1 54 | Parser = AMRParser(opt,dicts,parse_from_processed= processed) 55 | 56 | if opt.input: 57 | 58 | filepath = opt.input 59 | out = opt.output if opt.output else filepath+"_parsed" 60 | print ("processing "+filepath) 61 | n = 0 62 | processed_sentences = 0 63 | with open(out,'w') as out_f: 64 | with open(filepath,'r') as f: 65 | line = f.readline() 66 | batch = [] 67 | while line and line.strip() != "": 68 | while line and line.strip() != "" and len(batch) < opt.batch_size: 69 | batch.append(line.strip()) 70 | line = f.readline() 71 | 72 | output = Parser.parse_batch(batch) 73 | for snt, others in zip(batch,output): 74 | out_f.write("# ::snt "+snt+"\n") 75 | out_f.write(others) 76 | out_f.write("\n") 77 | processed_sentences = processed_sentences + len(batch) 78 | print ("processed_sentences" , processed_sentences) 79 | batch = [] 80 | print ("done processing "+filepath) 81 | print (out +" is generated") 82 | 83 | elif opt.input: 84 | filepath = opt.input 85 | out = opt.output if opt.output else filepath+"_parsed" 86 | print ("processing "+filepath) 87 | n = 0 88 | with open(out,'w') as out_f: 89 | with open(filepath,'r') as f: 90 | line = f.readline() 91 | while line != '' : 92 | if line.strip() != "": 93 | output = Parser.parse_batch([line.strip()]) 94 | out_f.write("# ::snt "+line) 95 | out_f.write(output[0]) 96 | out_f.write("\n") 97 | line = f.readline() 98 | print ("done processing "+filepath) 99 | print (out +" is generated") 100 | elif opt.text: 101 | output = Parser.parse_one(opt.text) 102 | print ("# ::snt "+opt.text) 103 | for i in output: 104 | print (i) 105 | else: 106 | print ("option -input [file] or -text [sentence] is required.") -------------------------------------------------------------------------------- /src/preprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Combine multiple AMR data files in the same directory into a single one 6 | Need to specify folder containing all subfolders of training, dev and test 7 | 8 | Then extract features for futher process based on stanford core nlp tools 9 | 10 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 11 | @since: 2018-06-01 12 | ''' 13 | 14 | from parser.AMRProcessors import * 15 | 16 | import argparse 17 | 18 | 19 | def combine_files(files): 20 | out = "/".join(files[0].split("/")[:-1]) 21 | out = out + "/combined.txt_" 22 | with open(out, 'w+') as outfile: 23 | for fname in files: 24 | with open(fname) as 
infile: 25 | line = infile.readline() 26 | line = infile.readline() 27 | while line != '' : 28 | line = infile.readline() 29 | outfile.write(line) 30 | outfile.write("\n") 31 | 32 | def write_features(filepath,feature_extractor:AMRInputPreprocessor): 33 | out = filepath + "pre_processed" 34 | print ("processing "+filepath) 35 | n = 0 36 | with open(out,'w') as out_f: 37 | with open(filepath,'r') as f: 38 | line = f.readline() 39 | while line != '' : 40 | if line.startswith("# ::snt") or line.startswith("# ::tok"): 41 | text = line[7:] 42 | data = feature_extractor.preprocess(text) 43 | out_f.write(line.replace("# ::tok","# ::snt")) 44 | for key in ["tok","lem","pos","ner"]: 45 | out_f.write("# ::"+key+"\t"+"\t".join(data[key])+"\n") 46 | n = n+1 47 | if n % 500 ==0: 48 | print (str(n)+" sentences processed") 49 | elif not line.startswith("# AMR release; "): 50 | out_f.write(line) 51 | line = f.readline() 52 | print ("done processing "+filepath) 53 | print (out +" is generated") 54 | 55 | def combine_arg(): 56 | parser = argparse.ArgumentParser(description='preprocessing.py') 57 | 58 | ## Data options 59 | parser.add_argument('-suffix', default="txt", type=str, 60 | help="""suffix of files to combine""") 61 | parser.add_argument('-folder', default=allFolderPath, type=str , 62 | help="""the folder""") 63 | return parser 64 | 65 | 66 | parser = combine_arg() 67 | 68 | 69 | opt = parser.parse_args() 70 | feature_extractor = AMRInputPreprocessor() 71 | 72 | trainFolderPath = opt.folder+"/training/" 73 | trainingFilesPath = folder_to_files_path(trainFolderPath,opt.suffix) 74 | combine_files(trainingFilesPath) 75 | write_features(trainFolderPath+"/combined.txt_",feature_extractor) 76 | 77 | devFolderPath = opt.folder+"/dev/" 78 | devFilesPath = folder_to_files_path(devFolderPath,opt.suffix) 79 | combine_files(devFilesPath) 80 | write_features(devFolderPath+"/combined.txt_",feature_extractor) 81 | 82 | testFolderPath = opt.folder+"/test/" 83 | testFilesPath = folder_to_files_path(testFolderPath,opt.suffix) 84 | combine_files(testFilesPath) 85 | write_features(testFolderPath+"/combined.txt_",feature_extractor) 86 | 87 | -------------------------------------------------------------------------------- /src/rule_system_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | Scripts to build StringCopyRules and ReCategorizor 6 | 7 | Data path information should also be specified here for 8 | trainFolderPath, devFolderPath and testFolderPath 9 | as we allow option to choose from two version of data. 
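Example invocation (illustrative path; -folder defaults to allFolderPath from the project
constants, and -suffix should match the files produced by src/preprocessing.py):

    python src/rule_system_build.py -folder /path/to/amr -suffix .txt_pre_processed -jamr 0 -threshold 5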
10 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 11 | @since: 2018-05-30 12 | ''' 13 | 14 | from utility.StringCopyRules import * 15 | from utility.ReCategorization import * 16 | from utility.data_helper import * 17 | 18 | 19 | import argparse 20 | def arg_parser(): 21 | parser = argparse.ArgumentParser(description='rule_system_build.py') 22 | 23 | ## Data options 24 | parser.add_argument('-threshold', default=5, type=int, 25 | help="""threshold for non-aligned high frequency concepts""") 26 | 27 | parser.add_argument('-jamr', default=0, type=int, 28 | help="""wheather to enhance string matching with additional jamr alignment""") 29 | parser.add_argument('-suffix', default=".txt_pre_processed", type=str, 30 | help="""suffix of files to combine""") 31 | parser.add_argument('-folder', default=allFolderPath, type=str , 32 | help="""the folder""") 33 | return parser 34 | parser = arg_parser() 35 | opt = parser.parse_args() 36 | threshold = opt.threshold 37 | suffix = opt.suffix + "_jamr" if opt.jamr else opt.suffix 38 | with_jamr = "_with_jamr" if opt.jamr else "_without_jamr" 39 | trainFolderPath = opt.folder+"/training/" 40 | trainingFilesPath = folder_to_files_path(trainFolderPath,suffix) 41 | 42 | devFolderPath = opt.folder+"/dev/" 43 | devFilesPath = folder_to_files_path(devFolderPath,suffix) 44 | 45 | testFolderPath = opt.folder+"/test/" 46 | testFilesPath = folder_to_files_path(testFolderPath,suffix) 47 | 48 | 49 | lock = threading.Lock() 50 | def add_count(store,new,additional=None): 51 | lock.acquire() 52 | 53 | for i in new: 54 | if not i in store: 55 | store[i] = [1,[additional]] 56 | else: 57 | store[i][0] = store[i][0] + 1 58 | store[i][1].append(additional) 59 | lock.release() 60 | 61 | def handle_sentence(data,n,update_freq,use_template,jamr = False): 62 | 63 | if n % 500 == 0: 64 | print (n) 65 | snt_token = data["tok"] 66 | pos_token = data["pos"] 67 | lemma_token = data["lem"] 68 | amr_t = data["amr_t"] 69 | aligns = data["align"] 70 | v2c = data["node"] 71 | amr = AMRGraph(amr_t,aligns=aligns) 72 | amr.check_consistency(v2c) 73 | lemma_str =" ".join(lemma_token) 74 | if use_template: 75 | fragment_to_node_converter.match(amr,rl ,snt_token,lemma_token,pos_token,lemma_str,jamr=jamr ) 76 | fragment_to_node_converter.convert(amr,rl ,snt_token,lemma_token,pos_token,lemma_str ) 77 | results = rl.get_matched_concepts(snt_token,amr,lemma_token,pos_token,with_target=update_freq,jamr=jamr) 78 | if update_freq: 79 | for n_c_a in results : 80 | for i_le in n_c_a[2]: 81 | rl.add_lemma_freq(i_le[1],n_c_a[1].le,n_c_a[1].cat,sense = n_c_a[1].sense) 82 | 83 | snt_str = " ".join(snt_token) 84 | none_rule = [n_c_a[1] for n_c_a in results if len(n_c_a[2])==0] 85 | add_count(non_rule_set,none_rule,snt_str) 86 | 87 | 88 | def readFile(filepath,update_freq=False,use_template=True): 89 | all_data = load_text_jamr(filepath) 90 | 91 | with open(filepath.replace(".txt",".tok"),'w') as output_file: 92 | n = 0 93 | for data in all_data: 94 | n=n+1 95 | snt_token = data["tok"] 96 | output_file.writelines("\t".join(snt_token)) 97 | if opt.jamr: 98 | handle_sentence(data,n,update_freq,use_template,jamr=True) 99 | else: 100 | handle_sentence(data,n,update_freq,use_template,jamr=False) 101 | return n 102 | 103 | 104 | 105 | rl = rules() 106 | non_rule_set = dict() 107 | fragment_to_node_converter = ReCategorizor(training=True) 108 | # 109 | non_rule_set_last = non_rule_set 110 | rl.build_lemma_cheat() 111 | # 112 | non_rule_set = dict() 113 | #lemmas_to_concept = read_resource_files( f_r.get_frames()) 
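# First pass over the (already combined) training files: with update_freq=True and
# use_template=True, handle_sentence() matches and applies recategorization templates
# through fragment_to_node_converter and accumulates lemma-to-concept counts via
# rl.add_lemma_freq(); concepts that no rule can align are collected in non_rule_set.
# After rl.build_lemma_cheat() and the tables are saved, a second pass further below
# (update_freq=False, use_template=False) rebuilds non_rule_set on the recategorized graphs.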
114 | for filepath in trainingFilesPath: #actually already combined into one 115 | print(("reading "+filepath.split("/")[-1]+"......")) 116 | n = readFile(filepath,update_freq=True,use_template = True) 117 | print(("done reading "+filepath.split("/")[-1]+", "+str(n)+" sentences processed")) 118 | #non_rule_set = non_rule_set_last 119 | high_text_num,high_frequency,low_frequency,low_text_num=unmixe(non_rule_set,threshold ) 120 | print ("initial converted,threshold,len(non_rule_set),high_text_num,high_frequency,low_frequency,low_text_num") 121 | print ("initial converted",threshold,len(non_rule_set),len(high_text_num),len(high_frequency),len(low_frequency),len(low_text_num)) 122 | #print (len(concept_embedding)) 123 | # 124 | # 125 | # 126 | non_rule_set_initial_converted = non_rule_set 127 | rl.build_lemma_cheat() 128 | fragment_to_node_converter.save(path="data/graph_to_node_dict_extended"+with_jamr) 129 | fragment_to_node_converter = ReCategorizor(from_file=False, path="data/graph_to_node_dict_extended"+with_jamr,training=False) 130 | rl.save("data/rule_f"+with_jamr) 131 | non_rule_set = dict() 132 | NERS = {} 133 | 134 | #need to rebuild copying dictionary again based on recategorized graph 135 | for filepath in trainingFilesPath: 136 | print(("reading "+filepath.split("/")[-1]+"......")) 137 | n = readFile(filepath,update_freq=False,use_template=False) 138 | print(("done reading "+filepath.split("/")[-1]+", "+str(n)+" sentences processed")) 139 | 140 | non_rule_set_f = Pickle_Helper("data/non_rule_set") 141 | non_rule_set_f.dump(non_rule_set,"non_rule_set") 142 | non_rule_set_f.save() 143 | 144 | 145 | 146 | #only intermediate data, won't be useful for final parser 147 | non_rule_set_f = Pickle_Helper("data/non_rule_set") 148 | non_rule_set_f.dump(non_rule_set_last,"initial_non_rule_set") 149 | non_rule_set_f.dump(non_rule_set_initial_converted,"initial_converted_non_rule_set") 150 | non_rule_set_f.dump(non_rule_set,"non_rule_set") 151 | non_rule_set_f.save() 152 | 153 | high_text_num,high_frequency,low_frequency,low_text_num=unmixe(non_rule_set,threshold ) 154 | print ("final converted,threshold,len(non_rule_set),high_text_num,high_frequency,low_frequency,low_text_num") 155 | print ("final converted",threshold,len(non_rule_set),len(high_text_num),len(high_frequency),len(low_frequency),len(low_text_num)) -------------------------------------------------------------------------------- /utility/AMRGraph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | AMRGraph builds on top of AMR from amr.py 6 | representing AMR graph as graph, 7 | and extract named entity (t1,..,tn, ner type, wiki) tuple. (we use model predicting for deciding ner type though) 8 | Being able to apply recategorization to original graph, 9 | which involves collapsing nodes for concept identification and unpacking for relation identification. 
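A minimal, untested usage sketch (toy AMR string in standard bracket notation; real input
comes from the preprocessed corpus files):

    from utility.AMRGraph import AMRGraph
    amr = AMRGraph("(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-01 :ARG0 b))")
    concepts, triples = amr.get_gold()   # AMRUniversal concepts and [head, dependent, role] triples
    nodes = amr.node_value()             # [variable, AMRUniversal] pairs used for concept identification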
10 | 11 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 12 | @since: 2018-05-28 13 | ''' 14 | from utility.amr import * 15 | from utility.constants import * 16 | import networkx as nx 17 | 18 | class AMRGraph(AMR): 19 | def __init__(self, anno, normalize_inverses=True, 20 | normalize_mod=False, tokens=None,aligns={}): 21 | ''' 22 | create AMR from text, and convert AMR to AMRGraph of standard representation 23 | ''' 24 | super().__init__(anno, tokens) 25 | self.ners = [] 26 | self.gold_concept = [] 27 | self.gold_triple = [] 28 | self.graph = nx.DiGraph() 29 | self.wikis = [] 30 | for h, r, d in [(h, r, d) for h, r, d in self.triples(normalize_inverses=normalize_inverses, 31 | normalize_mod=normalize_mod) if 32 | (r != ":instance" )]: 33 | if r == ':wiki': 34 | h, h_v = self.var_get_uni(h, True,(h, r, d )) 35 | d, d_v = self.var_get_uni(d) 36 | self.wikis.append(d) 37 | self.ners.append((h,d_v)) 38 | continue 39 | elif r == ':top': 40 | d, d_v = self.var_get_uni(d) 41 | self.root = d 42 | self.graph.add_node(d, value=d_v, align=None,gold=True,prefix = self._index_inv[d]) 43 | else: 44 | h_prefix = self._index_inv[h] 45 | d_prefix = self._index_inv[d] if d in self._index_inv else d #d will be the prefix if it is constant 46 | assert isinstance(h_prefix,str) and isinstance(d_prefix,str) 47 | h, h_v = self.var_get_uni(h, True,(h, r, d )) 48 | d, d_v = self.var_get_uni(d) 49 | self.graph.add_node(h, value=h_v, align=None,gold=True,prefix = h_prefix) 50 | self.graph.add_node(d, value=d_v, align=None,gold=True,prefix = d_prefix) 51 | self.graph.add_edge(h, d, role=r) 52 | self.graph.add_edge(d, h, role=r + "-of") 53 | 54 | # for i in self._triples: 55 | # print(i) 56 | self.read_align(aligns) 57 | 58 | #alignment from copying mechanism 59 | def read_align(self, aligns): 60 | for prefix in aligns: 61 | i = self._index[prefix] 62 | if isinstance(i,Var): 63 | assert i in self.graph.node,(self.graph.nodes(True),self.triples(normalize_inverses=True, 64 | normalize_mod=False),self._anno) 65 | self.graph.node[i]["align"] = aligns[prefix] 66 | else: 67 | if Var(prefix) in self.wikis: continue 68 | assert Var(prefix) in self.graph.node,(prefix,aligns,self._index,self.graph.nodes(True),self._anno) 69 | self.graph.node[Var(prefix)]["align"] = aligns[prefix] 70 | 71 | 72 | def check_consistency(self,pre2c): 73 | for prefix in pre2c: 74 | var = self._index[prefix] 75 | if not isinstance(var,Var): var = Var(prefix) 76 | if var in self.wikis: continue 77 | assert var in self.graph.node,(prefix, "\n",pre2c,"\n",self.graph.node,"\n",self._anno) 78 | amr_c = self.graph.node[var]["value"] 79 | 80 | assert amr_c.gold_str() == pre2c[prefix],(prefix, var,amr_c.gold_str() ,pre2c[prefix],"\n",pre2c,"\n",self.graph.nodes(True)) 81 | 82 | def get_gold(self): 83 | cons = [] 84 | roles = [] 85 | for n, d in self.graph.nodes(True): 86 | if "gold" in d: 87 | v = d["value"] 88 | cons.append(v) 89 | 90 | for h, d, rel in self.graph.edges(data=True): 91 | r = rel["role"] 92 | if self.cannonical(r): 93 | assert "gold" in self.graph.node[h] and "gold" in self.graph.node[d] 94 | h = self.graph.node[h]["value"] 95 | d = self.graph.node[d]["value"] 96 | roles.append([h,d,r]) 97 | 98 | root = self.graph.node[self.root]["value"] 99 | roles.append([AMRUniversal(BOS_WORD,BOS_WORD,NULL_WORD),root,':top']) 100 | return cons,roles 101 | 102 | def get_ners(self): 103 | ners = [] 104 | for v,wiki in self.ners: #v is name variable 105 | name = None 106 | names = [] 107 | for nearb in self.graph[v]: 108 | if self.graph[v][nearb]["role"] == 
":name": 109 | name = nearb 110 | break 111 | if name is None: 112 | print (self.graph[v],self._anno) 113 | continue 114 | ner_type = self.graph.node[v]["value"] 115 | for node in self.graph[name]: 116 | if self.graph.node[node]["value"].cat == Rule_String and ":op" in self.graph[name][node]["role"]: 117 | names.append(( self.graph.node[node]["value"],int(self.graph[name][node]["role"][-1]))) # (role, con,node) 118 | 119 | names = [t[0] for t in sorted(names,key = lambda t: t[1])] 120 | ners.append([names,wiki,ner_type]) 121 | return ners 122 | 123 | 124 | 125 | def rely(self,o_node,n_node): 126 | if "rely" in self.graph.node[o_node]: 127 | return 128 | self.graph.node[o_node].setdefault("rely",n_node) 129 | 130 | #link old node to new node 131 | def link(self,o_node,n_node,rel): 132 | self.graph.node[o_node].setdefault("original-of",[]).append( n_node ) # for storing order of replacement 133 | if n_node: 134 | self.graph.node[n_node]["has-original"] = o_node # for storing order of replacement 135 | self.graph.node[n_node]["align"] = self.graph.node[o_node]["align"] 136 | if rel: self.rely(o_node,n_node) 137 | 138 | def replace(self,node,cat_or_uni,aux=None,rel=False): 139 | 140 | aux_le = self.graph.node[aux]['value'].le if aux else None 141 | 142 | if isinstance(cat_or_uni,AMRUniversal): 143 | universal = cat_or_uni 144 | else: 145 | le = self.graph.node[node]['value'].le 146 | universal = AMRUniversal(le, cat_or_uni, None, aux_le) #aux_le is usually named entity type 147 | # create a new recategorized node 148 | # gold is not marked, so new recategorized node won't be used for relation identification 149 | var = Var(node._name+"_"+universal.cat) 150 | self.graph.add_node(var, value=universal, align=None) 151 | self.link(node,var,rel) 152 | 153 | return var 154 | 155 | 156 | #get a amr universal node from a variable in AMR or a constant in AMR 157 | def var_get_uni(self, a, head=False,tri=None): 158 | if isinstance(a,Var): 159 | return a, AMRUniversal(concept=self._v2c[a]) 160 | else: 161 | if head: 162 | assert False, "constant as head" + "\n" + a + self._anno+"\n"+str(tri) 163 | return Var(a), AMRUniversal(concept=self._index[a]) 164 | 165 | 166 | 167 | def __getitem__(self, item): 168 | return self.graph.node[item] 169 | 170 | #check whether the relation is in the cannonical direction 171 | def cannonical(self,r): 172 | return "-of" in r and not self.is_core(r) or "-of" not in r and self.is_core(r) 173 | 174 | def getRoles(self,node,index_dict,rel_index,relyed = None): 175 | # (amr,index,[[role,rel_index]]) 176 | if relyed and relyed not in index_dict: 177 | print ("rely",node,relyed,self.graph.node[relyed]["value"],index_dict,self._anno) 178 | elif relyed is None and node not in index_dict: print (self.graph.node[node]["value"]) 179 | index = index_dict[node] if relyed is None else index_dict[relyed] 180 | out = [] 181 | # if self.graph.node[node]["value"].le != "name": 182 | for n2 in self.graph[node]: 183 | r = self.graph[node][n2]["role"] 184 | if self.cannonical(r): 185 | if n2 not in rel_index: 186 | print(node,n2) 187 | print(self._anno) 188 | out.append([r,rel_index[n2]]) 189 | return [[self.graph.node[node]["value"],index], out] 190 | 191 | #return data for training concept identification or relation identification 192 | def node_value(self, keys=["value"], all=False): 193 | def concept_concept(): 194 | out = [] 195 | index = 0 196 | index_dict ={} 197 | for n, d in self.graph.nodes(True): 198 | if "original-of"in d: 199 | comps = d["original-of"] 200 | for comp in comps: 201 
| if comp is None: 202 | continue 203 | comp_d = self.graph.node[comp] 204 | out.append([comp] + [comp_d[k] for k in keys]) 205 | index_dict[comp] = index 206 | index += 1 207 | elif not ("has-original" in d or "rely" in d): 208 | out.append([n] + [d[k] for k in keys]) 209 | index_dict[n] = index 210 | index += 1 211 | return out,index_dict 212 | def rel_concept(): 213 | index = 0 214 | rel_index ={} 215 | rel_prefix = [] 216 | rel_out = [] 217 | for n, d in self.graph.nodes(True): 218 | if "gold" in d: 219 | rel_out.append([n,d]) 220 | rel_index[n] = index 221 | rel_prefix.append( d["prefix"]) 222 | index += 1 223 | return rel_out,rel_index,rel_prefix 224 | 225 | out,index_dict = concept_concept() 226 | if all: 227 | rel_out, rel_index, rel_prefix = rel_concept() 228 | for i, n_d in enumerate( rel_out): 229 | n,d = n_d 230 | if "rely" in d: 231 | rel_out[i] =self.getRoles(n,index_dict,rel_index,d["rely"]) 232 | elif not ("has-original" in d or "original-of" in d): 233 | rel_out[i] = self.getRoles(n,index_dict,rel_index) 234 | else: 235 | assert False , (self._anno,n,d["value"]) 236 | assert (self.root in rel_index),(self.graph.nodes[self.root],rel_index,self._anno) 237 | return out,rel_out,rel_prefix, rel_index[self.root] 238 | else: 239 | return out -------------------------------------------------------------------------------- /utility/Naive_Scores.py: -------------------------------------------------------------------------------- 1 | __author__ = 's1544871' 2 | from utility.constants import * 3 | from utility.amr import AMRUniversal 4 | 5 | class ScoreHelper: 6 | 7 | def __init__(self,name, filter ,second_filter=None): 8 | self.t_p_tp = [0,0,0] 9 | self.name = name 10 | self.f = filter 11 | self.second_filter = second_filter 12 | self.false_positive = {} 13 | self.false_negative = {} 14 | 15 | def T_P_TP_Batch(self,hypos,golds,accumulate=True,second_filter_material =None): 16 | if self.second_filter: 17 | T,P,TP,fp,fn = T_P_TP_Batch(hypos,golds,self.f,self.second_filter,second_filter_material) 18 | else: 19 | # assert self.name != "Unlabled SRL Triple",(hypos[-20],"STOP!",golds[-20]) 20 | T,P,TP,fp,fn = T_P_TP_Batch(hypos,golds,self.f) 21 | if accumulate: 22 | self.add_t_p_tp(T,P,TP) 23 | self.add_content(fp,fn) 24 | return T,P,TP 25 | 26 | def add_t_p_tp(self,T,P,TP): 27 | self.t_p_tp[0] += T 28 | self.t_p_tp[1] += P 29 | self.t_p_tp[2] += TP 30 | 31 | def add_content(self,fp,fn ): 32 | for i in fp: 33 | self.false_positive[i] = self.false_positive.setdefault(i,0)+1 34 | for i in fn: 35 | self.false_negative[i] = self.false_negative.setdefault(i,0)+1 36 | 37 | def show_error(self,t = 5): 38 | print ("false_positive",[(k,self.false_positive[k]) for k in sorted(self.false_positive,key=self.false_positive.get) if self.false_positive[k]> t]) 39 | print ("") 40 | print ("false_negative",[(k,self.false_negative[k]) for k in sorted(self.false_negative,key=self.false_negative.get) if self.false_negative[k]>t]) 41 | def __str__(self): 42 | s = self.name+"\nT,P,TP: "+ " ".join([str(i) for i in self.t_p_tp])+"\nPrecesion,Recall,F1: "+ " ".join([str(i)for i in P_R_F1(*self.t_p_tp)]) 43 | return s 44 | 45 | 46 | 47 | def filter_mutual(hypo,gold,mutual_filter): 48 | filtered_hypo = [item for sublist in filter_seq(mutual_filter,hypo) for item in sublist] 49 | out_hypo = [] 50 | filtered_gold = [item for sublist in filter_seq(mutual_filter,gold) for item in sublist] 51 | out_gold = [] 52 | 53 | for data in hypo: 54 | d1,d2 = mutual_filter(data) 55 | if d1 in filtered_gold and d2 in filtered_gold: 
56 | out_hypo.append(data) 57 | 58 | 59 | for data in gold: 60 | d1,d2 = mutual_filter(data) 61 | if d1 in filtered_hypo and d2 in filtered_hypo: 62 | out_gold.append(data) 63 | 64 | return out_hypo,out_gold 65 | 66 | def list_to_mulset(l): 67 | s = dict() 68 | for i in l: 69 | if isinstance(i,AMRUniversal) and i.le == "i"and i.cat == Rule_Concept : 70 | s[i] = 1 71 | else: 72 | s[i] = s.setdefault(i,0)+1 73 | return s 74 | 75 | def legal_concept(uni): 76 | if isinstance(uni,AMRUniversal): 77 | return (uni.cat,uni.le,uni.sense) if not uni.le in Special and not uni.cat in Special else None 78 | else: 79 | return uni 80 | 81 | def nonsense_concept(uni): 82 | return (uni.cat,uni.le) if not uni.le in Special and not uni.cat in Special else None 83 | 84 | def dynamics_filter(triple,concept_seq): 85 | if triple[0] in concept_seq and triple[1] in concept_seq or BOS_WORD in triple[0]: 86 | return triple[:3] 87 | 88 | # print (triple,concept_seq[0]) 89 | return None 90 | 91 | def filter_seq(filter,seq): 92 | out = [] 93 | for t in seq: 94 | filtered = filter(t) 95 | if filtered and filtered[0] != BOS_WORD and filtered != BOS_WORD: 96 | out.append(filtered) 97 | return out 98 | 99 | def remove_sense(uni): 100 | return (uni.cat,uni.le) 101 | 102 | def T_TP_Seq(hypo,gold,filter,second_filter = None,second_filter_material = None): 103 | gold = filter_seq(filter,gold) 104 | hypo = filter_seq(filter,hypo) 105 | fp = [] 106 | fn = [] 107 | if second_filter: #only for triple given concept 108 | 109 | 110 | second_filter_predicated = filter_seq(legal_concept,second_filter_material[0]) 111 | second_filter_with_material = lambda x: second_filter(x,second_filter_predicated) 112 | gold = filter_seq(second_filter_with_material,gold) 113 | 114 | 115 | second_filter_gold = filter_seq(legal_concept,second_filter_material[1]) 116 | second_filter_with_material = lambda x: second_filter(x,second_filter_gold) 117 | 118 | hypo = filter_seq(second_filter_with_material,hypo) 119 | 120 | if len(gold)>0 and isinstance(gold[0],tuple) and len(gold[0])==3 and False: 121 | print ("") 122 | print ("source based prediction") 123 | for t in hypo: 124 | print (t) 125 | print ("") 126 | print ("source gold seq") 127 | for t in gold: 128 | print (t) 129 | print ("") 130 | TP = 0 131 | T = len(gold) 132 | P = len(hypo) 133 | gold = list_to_mulset(gold) 134 | hypo = list_to_mulset(hypo) 135 | for d_g in gold: 136 | if d_g in hypo : 137 | TP += min(gold[d_g],hypo[d_g]) 138 | fn = fn + [d_g] *min(gold[d_g]-hypo[d_g],0) 139 | else: 140 | fn = fn + [d_g] *gold[d_g] 141 | for d_g in hypo: 142 | if d_g in gold : 143 | fp = fp + [d_g] *min(hypo[d_g]-gold[d_g],0) 144 | else: 145 | fp = fp + [d_g] *hypo[d_g] 146 | return T,P,TP,fp,fn 147 | 148 | def T_P_TP_Batch(hypos,golds,filter=legal_concept,second_filter=None,second_filter_material_batch = None): 149 | TP,T,P = 0,0,0 150 | FP,FN = [],[] 151 | assert hypos, golds 152 | for i in range(len(hypos)): 153 | if second_filter: 154 | t,p,tp,fp,fn = T_TP_Seq(hypos[i],golds[i],filter,second_filter,(second_filter_material_batch[0][i],second_filter_material_batch[1][i])) 155 | else: 156 | t,p,tp,fp,fn = T_TP_Seq(hypos[i],golds[i],filter) 157 | T += t 158 | P +=p 159 | TP += tp 160 | FP += fp 161 | FN += fn 162 | return T,P,TP,FP,FN 163 | 164 | 165 | def P_R_F1(T,P,TP): 166 | if TP == 0: 167 | return 0,0,0 168 | P = TP/P 169 | R = TP/T 170 | F1 = 2.0/(1.0/P+1.0/R) 171 | return P,R,F1 172 | 173 | 174 | 175 | #naive set overlapping for different kinds of relations 176 | def rel_scores_initial(): 177 | 178 
| 179 | root_filter = lambda t:(legal_concept(t[0]),legal_concept(t[1]),t[2]) if legal_concept(t[0]) and legal_concept(t[1]) and nonsense_concept(t[0]) == (BOS_WORD,BOS_WORD) else None 180 | 181 | root_score = ScoreHelper("Root",filter=root_filter) 182 | 183 | rel_filter = lambda t:(legal_concept(t[0]),legal_concept(t[1]),t[2]) if legal_concept(t[0]) and legal_concept(t[1]) else None 184 | rel_score = ScoreHelper("REL Triple",filter=rel_filter) 185 | 186 | non_sense_rel_filter = lambda t:(nonsense_concept(t[0]),nonsense_concept(t[1]),t[2]) if legal_concept(t[0]) and legal_concept(t[1]) else None 187 | nonsense_rel_score = ScoreHelper("Nonsense REL Triple",filter=non_sense_rel_filter) 188 | 189 | unlabeled_filter =lambda t:(legal_concept(t[0]),legal_concept(t[1])) if legal_concept(t[0]) and legal_concept(t[1]) else None 190 | 191 | unlabeled_rel_score = ScoreHelper("Unlabeled Rel Triple",filter=unlabeled_filter) 192 | 193 | labeled_rel_score_given_concept = ScoreHelper("REL Triple given concept",filter = rel_filter, second_filter=dynamics_filter) 194 | 195 | 196 | un_srl_filter =lambda t:(legal_concept(t[0]),legal_concept(t[1])) if legal_concept(t[0]) and legal_concept(t[1]) and t[2].startswith(':ARG') else None 197 | 198 | un_frame_score = ScoreHelper("Unlabled SRL Triple",filter=un_srl_filter) 199 | 200 | srl_filter = lambda t:(legal_concept(t[0]),legal_concept(t[1]),t[2]) if legal_concept(t[0]) and legal_concept(t[1]) and t[2].startswith(':ARG') else None 201 | frame_score = ScoreHelper("SRL Triple",filter=srl_filter) 202 | 203 | labeled_srl_score_given_concept = ScoreHelper("SRL Triple given concept",filter = srl_filter, second_filter=dynamics_filter) 204 | 205 | unlabeled_srl_score_given_concept = ScoreHelper("Unlabeled SRL Triple given concept",filter = un_srl_filter, second_filter=dynamics_filter) 206 | 207 | return [nonsense_rel_score,rel_score,root_score,unlabeled_rel_score,labeled_rel_score_given_concept,frame_score,un_frame_score,labeled_srl_score_given_concept,unlabeled_srl_score_given_concept] 208 | 209 | 210 | #naive set overlapping for different kinds of concepts 211 | def concept_score_initial(dicts): 212 | 213 | Non_Sense = ScoreHelper("Non_Sense",filter=nonsense_concept) 214 | concept_score = ScoreHelper("Full Concept",filter=legal_concept) 215 | category_score = ScoreHelper("Category Only",filter=lambda uni:(uni.cat) 216 | if legal_concept(uni) else None) 217 | lemma_score = ScoreHelper("Lemma Only",filter=lambda uni: (uni.le) 218 | if legal_concept(uni) else None) 219 | frame_score = ScoreHelper("Frame Only",filter=lambda uni: (uni.le) 220 | if legal_concept(uni) and uni.cat==Rule_Frame else None) 221 | frame_sense_score = ScoreHelper("Frame Sensed Only",filter=lambda uni: (uni.le,uni.sense) 222 | if legal_concept(uni) and uni.cat==Rule_Frame else None) 223 | frame_non_91_score = ScoreHelper("Frame non 91 Only",filter=lambda uni: (uni.le,uni.sense) 224 | if legal_concept(uni) and uni.cat==Rule_Frame and "91" not in uni.sense else None) 225 | high_score = ScoreHelper("High Freq Only",filter=lambda uni: (uni.le,uni.cat) 226 | if uni.le in dicts["high_dict"] and legal_concept(uni) else None) 227 | default_score = ScoreHelper("Copy Only",filter=lambda uni: (uni.le,uni.cat) 228 | if uni.le not in dicts["high_dict"] and legal_concept(uni) else None) 229 | return [Non_Sense,concept_score,category_score,frame_score,frame_sense_score,frame_non_91_score,lemma_score,high_score,default_score] 230 | -------------------------------------------------------------------------------- 
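# Worked example for the scoring helpers above (illustrative counts): with T=10 gold items,
# P=8 predictions and TP=6 true positives, P_R_F1(10, 8, 6) gives precision 6/8 = 0.75,
# recall 6/10 = 0.6 and F1 = 2/(1/0.75 + 1/0.6) ≈ 0.667; each ScoreHelper accumulates these
# counts across batches through T_P_TP_Batch and reports them via __str__.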
/utility/PropbankReader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # coding=utf-8 3 | ''' 4 | 5 | This reader reads all amr propbank file, 6 | and add possible cannonical amr lemma 7 | to the corresponding copying dictionary of a word and aliases of the word 8 | 9 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 10 | @since: 2018-05-28 11 | ''' 12 | 13 | import xml.etree.ElementTree as ET 14 | from nltk.stem import WordNetLemmatizer 15 | from utility.amr import * 16 | from utility.data_helper import folder_to_files_path 17 | wordnet_lemmatizer = WordNetLemmatizer() 18 | 19 | def add_concept(lemmas_to_concept,le,con): 20 | 21 | if not le in lemmas_to_concept: 22 | lemmas_to_concept[le]= set([con]) 23 | else: 24 | lemmas_to_concept[le].add(con) 25 | 26 | 27 | 28 | class PropbankReader: 29 | def parse(self): 30 | self.frames = dict() 31 | self.non_sense_frames = dict() 32 | self.frame_lemmas = set() 33 | self.joints = set() 34 | for f in self.frame_files_path: 35 | self.parse_file(f) 36 | 37 | def __init__(self, folder_path=frame_folder_path): 38 | self.frame_files_path = folder_to_files_path(folder_path,".xml") 39 | self.parse() 40 | 41 | def parse_file(self,f): 42 | tree = ET.parse(f) 43 | root = tree.getroot() 44 | for child in root: 45 | if child.tag == "predicate": 46 | self.add_lemma(child) 47 | 48 | #add cannonical amr lemma to possible set of words including for aliases of the words 49 | def add_lemma(self,node): 50 | lemma = node.attrib["lemma"].replace("_","-") 51 | self.frames.setdefault(lemma,set()) 52 | self.non_sense_frames.setdefault(lemma,set()) 53 | # self.frames[lemma] = set() 54 | for child in node: 55 | if child.tag == "roleset": 56 | if "." not in child.attrib["id"]: 57 | if len(child.attrib["id"].split("-")) == 1: 58 | le,sense = child.attrib["id"],NULL_WORD 59 | else: 60 | le,sense = child.attrib["id"].split("-") 61 | # print (child.attrib["id"],lemma) 62 | else: 63 | le,sense = child.attrib["id"].replace("_","-").split(".") 64 | self.frame_lemmas.add(le) 65 | role = AMRUniversal(le,Rule_Frame,"-"+sense) 66 | if len(role.le.split("-")) == 2: 67 | k,v = role.le.split("-") 68 | self.joints.add((k,v)) 69 | no_sense_con = AMRUniversal(role.le,role.cat,None) 70 | add_concept(self.frames,lemma,role) 71 | add_concept(self.non_sense_frames,lemma,no_sense_con) 72 | aliases = child.find('aliases') 73 | if aliases: 74 | for alias in aliases.findall('alias'): 75 | if alias.text != le and alias.text not in self.frames: 76 | alias_t = alias.text.replace("_","-") 77 | add_concept(self.frames,alias_t,role) 78 | add_concept(self.non_sense_frames,alias_t,no_sense_con) 79 | #print (le, self.frames[le]) 80 | def get_frames(self): 81 | return self.frames 82 | def main(): 83 | f_r = PropbankReader() 84 | for k,v in f_r.joints: 85 | print (k+" "+v) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /utility/StringCopyRules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | ''' 4 | Building and hanlding category based dictionary for copying mechanism 5 | Also used by ReCategorization to preduce training set, and templates (which partially rely on string matching). 
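For example (values computed with the helpers defined below):

    >>> disMatch("teacher", "teach")   # substring/inflection variants count as a match
    0
    >>> text2int("twenty three")       # number words are normalised for the Rule_Num rule
    23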
6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-28 9 | ''' 10 | 11 | import threading 12 | from utility.data_helper import * 13 | from utility.AMRGraph import * 14 | from utility.constants import * 15 | from utility.PropbankReader import PropbankReader 16 | 17 | from nltk.metrics.distance import edit_distance 18 | 19 | 20 | def de_polarity(lemma): 21 | if len(lemma) == 0: return None 22 | if lemma[0] == "a" and len(lemma)> 5: 23 | return lemma[1:] 24 | if (lemma[:2]) in ["in","un","il","ir","im"] and len(lemma)>5: 25 | return lemma[2:] 26 | if lemma[:3] in ["dis","non"] and len(lemma)>6: 27 | return lemma[3:] 28 | if lemma[-4:] in ["less"] and len(lemma)>6: 29 | return lemma[:-4] 30 | return None 31 | def polarity_match(con_lemma,lemma): 32 | lemma = de_polarity(lemma) 33 | if lemma is not None: 34 | if disMatch(lemma,con_lemma )<1: 35 | return True 36 | return False 37 | 38 | 39 | #computing string dissimilarity (e.g. 0 means perfect match) 40 | def disMatch(lemma,con_lemma,t=0.5): 41 | # if (con_lemma == "and" and lemma == ";" ): return True 42 | # if (con_lemma == "multi-sentence" and lemma in [".",";"]): return True 43 | if lemma == con_lemma: return 0 44 | if de_polarity(lemma) == con_lemma: return 1 #not match if depolaritized matched 45 | if (con_lemma in lemma or lemma in con_lemma and "-role" not in con_lemma) and len(lemma)>2 and len(con_lemma)>2 : 46 | return 0 47 | if lemma.endswith("ily") and lemma[:-3]+"y"==con_lemma: 48 | return 0 49 | if lemma.endswith("ing") and (lemma[:-3]+"e"==con_lemma or lemma[:-3]==con_lemma): 50 | return 0 51 | if lemma.endswith("ical") and lemma[:-4]+"y"==con_lemma: 52 | return 0 53 | if lemma.endswith("ially") and lemma[:-5] in con_lemma: 54 | return 0 55 | if lemma.endswith("ion") and (lemma[:-3]+"e"==con_lemma or lemma[:-3]==con_lemma): 56 | return 0 57 | if lemma in con_lemma and len(lemma)>3 and len(con_lemma)-len(lemma)<5: 58 | return 0 59 | if lemma.endswith("y") and lemma[:-1]+"ize"==con_lemma: 60 | return 0 61 | if lemma.endswith("er") and (lemma[:-2]==con_lemma or lemma[:-3]==con_lemma or lemma[:-1]==con_lemma): 62 | return 0 63 | dis = 1.0*edit_distance(lemma,con_lemma)/min(12,max(len(lemma),len(con_lemma))) 64 | if (dis < t ) : 65 | return dis 66 | return 1 67 | 68 | import calendar 69 | month_abbr = {name: num for num, name in enumerate(calendar.month_abbr) if num} 70 | 71 | _float_regexp = re.compile(r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$") 72 | def is_float_re(str): 73 | return re.match(_float_regexp, str) 74 | super_scripts = '⁰¹²³⁴⁵⁶⁷⁸⁹' 75 | def parseStr(x): 76 | if x.isdigit(): 77 | if x in super_scripts: 78 | return super_scripts.find(x) 79 | return int(x) 80 | elif is_float_re(x): 81 | return float(x) 82 | return None 83 | 84 | units = [ 85 | "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", 86 | "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", 87 | "sixteen", "seventeen", "eighteen", "nineteen", 88 | ] 89 | tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] 90 | months = ["","january","february","march","april","may","june","july","august","september","october","november","december"] 91 | scales = ["hundred", "thousand", "million", "billion", "trillion"] 92 | scaless = ["hundreds", "thousands", "millions", "billions", "trillions"] 93 | numwords = {} 94 | numwords["and"] = (1, 0) 95 | for idx, word in enumerate(units): numwords[word] = (1, idx) 96 | for idx, word in enumerate(months): 
numwords[word] = (1, idx) 97 | for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) 98 | for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) 99 | for idx, word in enumerate(scaless): numwords[word] = (10 ** (idx * 3 or 2), 0) 100 | ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12} 101 | ordinal_endings = [('ieth', 'y'), ('th', ''), ('st', ''), ('nd', ''), ('rd', '')] 102 | 103 | def text2int(textnum): 104 | for k, v in month_abbr.items(): 105 | if textnum == k.lower(): 106 | return v 107 | if " and " in textnum : 108 | textnums = textnum.split(" and ") 109 | out = [ j for j in [text2int(i) for i in textnums ] if j ] 110 | if len(out) > 1: return sum(out) 111 | else: return None 112 | textnum = textnum.replace(',', ' ') 113 | textnum = textnum.replace('-', ' ') 114 | textnum = textnum.replace('@-@', ' ') 115 | current = result = 0 116 | for word in textnum.split(): 117 | w_num = parseStr(word) 118 | if word in ordinal_words: 119 | scale, increment = (1, ordinal_words[word]) 120 | elif w_num: 121 | scale, increment = (1,w_num) 122 | else: 123 | for ending, replacement in ordinal_endings: 124 | if word.endswith(ending): 125 | word = "%s%s" % (word[:-len(ending)], replacement) 126 | if word not in numwords: 127 | return None 128 | scale, increment = numwords[word] 129 | current = current * scale + increment 130 | if scale > 100: 131 | result += current 132 | current = 0 133 | return int(result + current) 134 | 135 | 136 | 137 | 138 | 139 | 140 | def unmixe(mixed,threshold = 50): 141 | high_frequency = dict() 142 | low_frequency = dict() 143 | low_text_num = dict() 144 | high_text_num = dict() 145 | for i in mixed: 146 | cat = i.cat 147 | if mixed[i][0] >= threshold and ( cat in Rule_All_Constants) : 148 | high_text_num[i] = mixed[i] 149 | elif mixed[i][0] >= threshold: 150 | high_frequency[i] = mixed[i] 151 | elif (cat in Rule_All_Constants): 152 | low_text_num[i] = mixed[i] 153 | else: 154 | low_frequency[i] = mixed[i] 155 | 156 | return high_text_num,high_frequency,low_frequency,low_text_num 157 | 158 | class rules: 159 | RE_FRAME_NUM = re.compile(r'-\d\d$') 160 | NUM = re.compile(r'[-]?[1-9][0-9]*[:.]?[0-9]*') 161 | Pure_NUM = re.compile(r'[-]?[1-9][0-9]*[,]?[0-9]*') 162 | 163 | def save(self,filepath="data/rule_f"): 164 | pickle_helper= Pickle_Helper(filepath) 165 | pickle_helper.dump(self.lemma_back,"lemma_back") 166 | pickle_helper.dump([k for k in self.lemma_freq_cat.keys()],"keys") 167 | for cat in self.lemma_freq_cat: 168 | pickle_helper.dump(self.lemma_freq_cat[cat] ,cat) 169 | pickle_helper.save() 170 | 171 | self.load(filepath) 172 | 173 | lock = threading.Lock() 174 | def load(self,filepath="data/rule_f"): 175 | pickle_helper= Pickle_Helper(filepath) 176 | data = pickle_helper.load() 177 | keys = data["keys"] 178 | self.lemma_freq_cat = {} 179 | self.lemma_back = data["lemma_back"] 180 | for key in keys: 181 | self.lemma_freq_cat[key] = data[key] 182 | self.build_lemma_cheat() 183 | return self 184 | 185 | def set_rules(self): 186 | self.rules = {} 187 | self.rules[Rule_Frame]= lambda _,l,con_l ,sense: self.standard_rule(l,Rule_Frame,con_l,sense) 188 | self.rules[Rule_String]= lambda x,_,con_l ,__: self.standard_rule(x,Rule_String,con_l) 189 | self.rules[Rule_Ner]= lambda x,_ ,con_l,__:self.standard_rule(x,Rule_Ner,con_l) 190 | self.rules[Rule_B_Ner]= lambda x,_ ,con_l,__:self.standard_rule(x,Rule_B_Ner,con_l) 191 | self.rules[Rule_Constant]= lambda _,l,con_l,__: 
self.standard_rule(l,Rule_Constant,con_l) 192 | self.rules[Rule_Concept]= lambda _,l,con_l,__: self.standard_rule(l,Rule_Concept,con_l) 193 | self.rules[Rule_Num]= lambda _,l ,con_l,__: self.num(l) 194 | 195 | def entity(self,lemma,cat,con_lemma = None): 196 | num = self.num(lemma) 197 | if num is not None and num.le != NULL_WORD: 198 | num.cat = cat 199 | if cat == "date-entity" and self.Pure_NUM.search(lemma) and len(lemma) == 6: 200 | num.le = lemma 201 | return num 202 | return self.standard_rule(lemma,cat,con_lemma) 203 | 204 | def read_veb(self): 205 | RE_FRAME_NUM = re.compile(r'-\d\d$') 206 | f = open(verbalization,"r") 207 | f.readline() 208 | line = f.readline() 209 | while line != '' : 210 | tokens = line.replace("\n","").split(" ") 211 | if len(tokens)<= 4 and (tokens[0] =="VERBALIZE" or tokens[0] =="MAYBE-VERBALIZE"): 212 | old_lemma = tokens[1] 213 | amr_lemma = re.sub(RE_FRAME_NUM, '', tokens[3]) 214 | if tokens[0] =="MAYBE-VERBALIZE": 215 | self.add_lemma_freq(old_lemma,amr_lemma,Rule_Frame,freq=1,sense = tokens[3][-3:]) 216 | else: 217 | self.add_lemma_freq(old_lemma,amr_lemma,Rule_Frame,freq=100,sense = tokens[3][-3:]) 218 | 219 | line = f.readline() 220 | f.close() 221 | 222 | 223 | def read_frame(self): 224 | f_r = PropbankReader() 225 | f_r = f_r.frames 226 | for le,concepts in f_r.items(): 227 | i=0 228 | for c in concepts: 229 | self.add_lemma_freq(le,c.le,Rule_Frame,freq=10,sense = c.sense) 230 | i = i+1 231 | 232 | def __init__(self): 233 | self.lemma_freq_cat = {} 234 | self.lemmatize_cheat = {} 235 | self.lemma_back = {} 236 | self.read_frame() 237 | self.read_veb() 238 | self.frame_lemmas = PropbankReader().frame_lemmas 239 | self.build_lemma_cheat() 240 | self.set_rules() 241 | # self.rules[Rule_Re]= lambda _,l = wordnet.NOUN: self.re(l) 242 | 243 | def standard_rule(self,lemma,cat,con_lemma=None,sense=NULL_WORD): 244 | if con_lemma is None: #testing 245 | if cat in [Rule_Ner,Rule_B_Ner ]and len(lemma)>3: 246 | lemma = lemma.capitalize() 247 | if (lemma,cat) in self.lemmatize_cheat: 248 | # if lemma == "cooperation" and cat == Rule_Frame: 249 | # print ("before cooperation",self.lemmatize_cheat[(lemma,cat)],AMRUniversal(lemma,cat,sense)) 250 | lemma = self.lemmatize_cheat[(lemma,cat)] 251 | # if lemma == "cooperate" and cat == Rule_Frame: 252 | # print ("after cooperate",AMRUniversal(lemma,cat,sense)) 253 | # elif lemma == "cooperation" and cat == Rule_Frame: 254 | # print ("after cooperation",AMRUniversal(lemma,cat,sense)) 255 | return AMRUniversal(lemma,cat,sense) 256 | return AMRUniversal(lemma,cat,sense) 257 | else: #training 258 | if cat in [Rule_Ner,Rule_B_Ner ] and len(lemma)>3: 259 | lemma = lemma.capitalize() 260 | if cat not in self.lemma_freq_cat or lemma not in self.lemma_freq_cat[cat]: 261 | return AMRUniversal(lemma,cat,sense) 262 | candidates = self.lemma_freq_cat[cat][lemma] 263 | if con_lemma in candidates.keys(): 264 | return AMRUniversal(con_lemma,cat,sense) 265 | return AMRUniversal(lemma,cat,sense) 266 | 267 | def clear_freq(self): 268 | self.lemma_freq_cat = {} 269 | self.lemmatize_cheat = {} 270 | 271 | def add_lemma_freq(self,old_lemma,amr_lemma,cat,freq=1,sense=NULL_WORD): 272 | # if cat in Rule_All_Constants: 273 | # return 274 | self.lock.acquire() 275 | if old_lemma == amr_lemma: freq *= 10 276 | amr_con = amr_lemma 277 | self.lemma_back[amr_con][old_lemma] = self.lemma_back.setdefault(amr_con,{}).setdefault(old_lemma,0)+freq 278 | lemma_freq = self.lemma_freq_cat.setdefault(cat,{}).setdefault(old_lemma,{}) 279 | lemma_freq[amr_con] 
= lemma_freq.setdefault(amr_con,0)+freq 280 | self.lock.release() 281 | 282 | 283 | def build_lemma_cheat(self): 284 | for cat in self.lemma_freq_cat: 285 | lemma_freqs = self.lemma_freq_cat[cat] 286 | for word in lemma_freqs: 287 | max_score = 0 288 | max_lemma = word 289 | for arm_le in lemma_freqs[word]: 290 | score = 1.0*lemma_freqs[word][arm_le] 291 | assert (score > 0) 292 | if score >max_score: 293 | max_score = score 294 | max_lemma = arm_le 295 | self.lemmatize_cheat[(word,cat)] = max_lemma 296 | 297 | # print (self.lemmatize_cheat) 298 | 299 | # fragments_to_break = set(["up","down","make"]) 300 | 301 | def num(self,lemma): 302 | r = text2int(lemma) 303 | if r is None and self.Pure_NUM.search(lemma) is not None: 304 | lemma = lemma.replace(",","") 305 | return AMRUniversal(lemma,Rule_Num,None) 306 | if r is not None: 307 | return AMRUniversal(str(r),Rule_Num,None) 308 | return AMRUniversal(NULL_WORD,Rule_Num,None) 309 | 310 | #old_ids : batch x (cat,le,lemma,word) only cat is id 311 | def toAmrSeq(self,cats,snt,lemma,high,auxs,senses = None,ners = None): 312 | out = [] 313 | for i in range(len(snt)): 314 | sense = senses[i] if senses else None 315 | txt, le,cat,h,aux = snt[i],lemma[i],cats[i],high[i],auxs[i] 316 | assert h is None or isinstance(h,str) or isinstance(h,tuple)and isinstance(cat,str) ,(txt, le,cat,h) 317 | if h and h != UNK_WORD: 318 | if cat == Rule_Num: 319 | uni = self.to_concept(txt,h,Rule_Num,sense) 320 | if uni.le == NULL_WORD: 321 | uni = AMRUniversal(h,Rule_Concept,sense) 322 | else: 323 | uni = AMRUniversal(h,cat,sense) 324 | else: 325 | try_num = self.to_concept(txt,le,Rule_Num,sense) 326 | if " " in le and try_num.le != NULL_WORD and cat not in [Rule_String,Rule_B_Ner,Rule_Ner] and "entity" not in cat: 327 | uni = try_num 328 | else: 329 | uni = self.to_concept(txt,le,cat,sense) 330 | 331 | if cat == Rule_B_Ner: 332 | if not aux in [UNK_WORD,NULL_WORD]: 333 | uni.aux = aux 334 | elif ners[i] == "PERSON": 335 | uni.aux = "person" 336 | elif ners[i] == "LOCATION": 337 | uni.aux = "location" 338 | elif ners[i] == "ORGANIZATION": 339 | uni.aux = "organization" 340 | else: 341 | uni.aux = UNK_WORD 342 | assert isinstance(uni.le,str) and isinstance(uni.cat,str ),(txt, le,cat,h,uni.le,uni,cat) 343 | 344 | 345 | if ners[i] == "URL": #over write ML decision, otherwise creating bug 346 | uni = AMRUniversal(le,"url-entity",None) 347 | 348 | out.append(uni) 349 | 350 | return out 351 | 352 | 353 | def to_concept(self,txt,le,cat,con_lemma=None,sense=NULL_WORD): 354 | if cat in self.rules: 355 | return self.rules[cat](txt,le,con_lemma,sense) 356 | elif cat.endswith("-entity"): # entity 357 | return self.entity(le,cat,con_lemma) 358 | else: 359 | return self.standard_rule(le,cat,con_lemma) 360 | 361 | 362 | #amr is myamr 363 | def get_matched_concepts(self,snt_token,amr_or_node_value,lemma,pos,with_target = False,jamr=False,full=1): 364 | results = [] 365 | node_value = amr_or_node_value.node_value(keys=["value","align"]) if isinstance(amr_or_node_value,AMRGraph) else amr_or_node_value 366 | for n,c,a in node_value: 367 | if full == 1: 368 | align = self.match_concept(snt_token,c,lemma,pos,with_target) 369 | if jamr and a is not None: 370 | exist = False 371 | for i,l,p in align: 372 | if i == a: 373 | exist = True 374 | if not exist: 375 | align += [(a,lemma[a],pos[a])] 376 | results.append([n,c,align]) 377 | else: 378 | if jamr and a is not None: 379 | align = [(a,lemma[a],pos[a])] 380 | else: 381 | align = self.match_concept(snt_token,c,lemma,pos,with_target) 382 | 
results.append([n,c,align]) 383 | return results 384 | 385 | def match_concept(self,snt_token,concept,lemma,pos,with_target = False,candidate = None): 386 | if len(lemma) == 1: return [[0,lemma[0],pos[0]]] 387 | le,cat,sense = decompose(concept) 388 | align = [] 389 | if candidate is None: 390 | candidate = range(len(snt_token)) 391 | for i in candidate: 392 | if with_target and disMatch(lemma[i],le) <1: # and pos[i] not in ["IN"]: 393 | align.append((i,lemma[i],pos[i])) 394 | continue 395 | if with_target: 396 | amr_c = self.to_concept(snt_token[i],lemma[i],cat,le,sense) 397 | else: 398 | amr_c = self.to_concept(snt_token[i],lemma[i],cat) 399 | if amr_c is None: 400 | continue 401 | le_i,cat_i,sen_i = decompose(amr_c) 402 | 403 | assert cat == cat_i, "cat mismatch "+ snt_token[i]+" "+lemma[i]+" "+cat+" "+le+" "+cat_i+" "+le_i+"\n"+" ".join(snt_token) 404 | if amr_c.non_sense_equal(concept): # and pos[i] not in ["IN"]: 405 | align.append((i,lemma[i],pos[i])) 406 | 407 | if le == "and" and len(align) == 0: 408 | for i in range(len(lemma)): 409 | if lemma[i] == ";" or lemma[i] == "and": 410 | align.append((i,lemma[i],pos[i])) 411 | if len(align)>0: return [align[-1]] 412 | 413 | # if len(align) > 0 : print (le,align,lemma) 414 | 415 | if le == "multi-sentence" and len(align) == 0 and False: 416 | for i in range(len(lemma)): 417 | if lemma[i] in [".",";","?","!"]: 418 | align.append((i,lemma[i],pos[i])) 419 | return align 420 | # if len(align) > 0 : print (le,align,lemma) 421 | return align 422 | 423 | -------------------------------------------------------------------------------- /utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__init__.py -------------------------------------------------------------------------------- /utility/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__init__.pyc -------------------------------------------------------------------------------- /utility/__pycache__/AMRGraph.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/AMRGraph.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/Naive_Scores.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/Naive_Scores.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/PropbankReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/PropbankReader.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/ReCategorization.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/ReCategorization.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/StringCopyRules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/StringCopyRules.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/amr.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/amr.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/constants.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/constants.cpython-36.pyc -------------------------------------------------------------------------------- /utility/__pycache__/data_helper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/__pycache__/data_helper.cpython-36.pyc -------------------------------------------------------------------------------- /utility/amr.peg: -------------------------------------------------------------------------------- 1 | # PEG (parsing expression grammar) for a single AMR annotation. 2 | # Designed for the Parsimonious library (https://github.com/erikrose/parsimonious), 3 | # though a bit of automatic cleanup is required when loading this file. 4 | # Nathan Schneider, 2015-05-05 5 | 6 | ALL = ~r"\s*" X ~r"\s*$" 7 | 8 | X = "(" ` VAR _ "/" _ CONCEPT (_ REL _ Y)* ` ")" 9 | Y = X / NAMEDCONST / VAR / STR / NUM 10 | VAR = ~r"[a-z]+[0-9]*" ALIGNMENT? 11 | NAMEDCONST = ~r"[a-z]{2,}\b" ALIGNMENT? # aside from + and -, named constants must have at least 2 letters (to distinguish from variable names) 12 | STR = "\"" ~r"[^\"\s]([^\"\n\r]*[^\"\s])?" "\"" ALIGNMENT? # quoted string literal. nonempty; may not start or end with whitespace 13 | CONCEPT = ~r"[A-Za-z0-9.\!\?,:;'][A-Za-z0-9.i\!\?.;:'-]*" ALIGNMENT? # seen in data: :li (x3 / 3) and :quant (x / 355.02) and :mod (x / friggin') 14 | REL = ~r":[A-Za-z][A-Za-z0-9-]*" ALIGNMENT? 15 | NUM = ~r"[+-]?\d*(\.\d+)?" ALIGNMENT?
16 | ALIGNMENT = "~" ~r"[A-Za-z0-9\.]+(\,[0-9]+)*" 17 | # TODO: the regexes, especially NUM, need checking 18 | 19 | _ = ~r"([ \t]*[\n\r][ \t]*)|[ \t]+" 20 | ` = ~r"[ \t]*[\n\r]?[ \t]*" 21 | -------------------------------------------------------------------------------- /utility/amr.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/amr.pyc -------------------------------------------------------------------------------- /utility/constants.py: -------------------------------------------------------------------------------- 1 | '''Global constants, and file paths''' 2 | import os,re 3 | 4 | # Change the path according to your system 5 | 6 | save_to = '/disk/scratch/s1544871/model/' #the folder amr model will be saved to (model name is parameterized by some hyper parameter) 7 | train_from = '/disk/scratch/s1544871/model/gpus_0valid_best.pt' #default model loading 8 | embed_path = "/disk/scratch/s1544871/glove.840B.300d.txt" #file containing glove embedding 9 | core_nlp_url = 'http://localhost:9000' #local host url of standford corenlp server 10 | root_path = "/disk/scratch/s1544871" 11 | allFolderPath = root_path + "/amr_annotation_r2/data/alignments/split" 12 | resource_folder_path = root_path +"/amr_annotation_r2/" 13 | frame_folder_path = resource_folder_path+"data/frames/propbank-frames-xml-2016-03-08/" 14 | have_org_role = resource_folder_path+"have-org-role-91-roles-v1.06.txt" #not used 15 | have_rel_role = resource_folder_path+"have-rel-role-91-roles-v1.06.txt" #not used 16 | morph_verbalization = resource_folder_path+"morph-verbalization-v1.01.txt" #not used 17 | verbalization = resource_folder_path+"verbalization-list-v1.06.txt" 18 | 19 | 20 | PAD = 0 21 | UNK = 1 22 | 23 | PAD_WORD = '' 24 | UNK_WORD = '' 25 | BOS_WORD = '' 26 | EOS_WORD = '' 27 | NULL_WORD = "" 28 | UNK_WIKI = '' 29 | Special = [NULL_WORD,UNK_WORD,PAD_WORD] 30 | #Categories 31 | Rule_Frame = "Frame" 32 | Rule_Constant = "Constant" 33 | Rule_String = "String" 34 | Rule_Concept = "Concept" 35 | Rule_Comp = "COMPO" 36 | Rule_Num = "Num" 37 | Rule_Re = "Re" #corenference 38 | Rule_Ner = "Ner" 39 | Rule_B_Ner = "B_Ner" 40 | Rule_Other = "Entity" 41 | Other_Cats = {"person","thing",} 42 | COMP = "0" 43 | Rule_All_Constants = [Rule_Num,Rule_Constant,Rule_String,Rule_Ner] 44 | Splish = "$£%%££%£%£%£%" 45 | Rule_Basics = Rule_All_Constants + [Rule_Frame,Rule_Concept,UNK_WORD,BOS_WORD,EOS_WORD,NULL_WORD,PAD_WORD] 46 | 47 | RULE = 0 48 | HIGH = 1 49 | LOW = 2 50 | 51 | RE_FRAME_NUM = re.compile(r'-\d\d$') 52 | RE_COMP = re.compile(r'_\d$') 53 | end= re.compile(".txt\_[a-z]*") 54 | epsilon = 1e-8 55 | 56 | TXT_WORD = 0 57 | TXT_LEMMA = 1 58 | TXT_POS = 2 59 | TXT_NER = 3 60 | 61 | 62 | AMR_CAT = 0 63 | AMR_LE = 1 64 | AMR_NER = 2 65 | AMR_AUX = 2 66 | AMR_LE_SENSE = 3 67 | AMR_SENSE = 3 68 | AMR_CAN_COPY = 4 69 | 70 | threshold = 5 71 | 72 | 73 | -------------------------------------------------------------------------------- /utility/constants.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/3375123c6b00bdfbe3395706769175073716b699/utility/constants.pyc -------------------------------------------------------------------------------- /utility/data_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | 
# coding=utf-8 3 | ''' 4 | 5 | Some helper functions for storing and reading data 6 | 7 | @author: Chunchuan Lyu (chunchuan.lv@gmail.com) 8 | @since: 2018-05-29 9 | ''' 10 | import json,os,re 11 | import pickle 12 | 13 | 14 | class Pickle_Helper: 15 | 16 | def __init__(self, filePath): 17 | self.path = filePath 18 | self.objects = dict() 19 | 20 | def dump(self,obj,name): 21 | self.objects[name] = obj 22 | 23 | def save(self): 24 | f = open(self.path , "wb") 25 | pickle.dump(self.objects ,f,protocol=pickle.HIGHEST_PROTOCOL) 26 | f.close() 27 | 28 | def load(self): 29 | f = open(self.path , "rb") 30 | self.objects = pickle.load(f) 31 | f.close() 32 | return self.objects 33 | 34 | def get_path(self): 35 | return self.path 36 | 37 | class Json_Helper: 38 | 39 | def __init__(self, filePath): 40 | self.path = filePath 41 | self.objects = dict() 42 | 43 | def dump(self,obj,name): 44 | self.objects[name] = obj 45 | 46 | def save(self): 47 | if not os.path.exists(self.path): 48 | os.makedirs(self.path) 49 | for name in self.objects: 50 | with open(self.path+"/"+name+".json", 'w+') as fp: 51 | json.dump(self.objects[name], fp) 52 | 53 | def load(self): 54 | files_path = folder_to_files_path(self.path,ends =".json") 55 | for f in files_path: 56 | name = f.split("/")[-1].replace(".json","") 57 | with open(f) as data_file: 58 | data = json.load(data_file) 59 | self.objects[name] = data 60 | return self.objects 61 | 62 | def get_path(self): 63 | return self.path 64 | 65 | def folder_to_files_path(folder,ends =".txt"): 66 | files = os.listdir(folder ) 67 | files_path = [] 68 | for f in files: 69 | if f.endswith(ends): 70 | files_path.append(folder+f) 71 | # break 72 | return files_path 73 | def load_line(line,data): 74 | if "\t" in line: 75 | tokens = line[4:].split("\t") 76 | else: 77 | tokens = line[4:].split() 78 | if tokens[0] == "root": return 79 | 80 | if tokens[0] == "node": 81 | data["node"][tokens[1]] = tokens[2] 82 | if tokens.__len__() > 3: 83 | data["align"][tokens[1]] = int(tokens[3].split("-")[0]) 84 | return 85 | if tokens[0] == "edge": 86 | data["edge"][tokens[4],tokens[5]] = tokens[2] 87 | return 88 | data[tokens[0]] = tokens[1:] 89 | def asserting_equal_length(data): 90 | assert len(data["tok"]) ==len(data["lem"]) , ( len(data["tok"]) ,len(data["lem"]),"\n",list(zip(data["tok"],data["lem"])) ,data["tok"],data["lem"]) 91 | assert len(data["tok"]) ==len(data["ner"]) , ( len(data["tok"]) ,len(data["ner"]),"\n",list(zip(data["tok"],data["ner"])) ,data["tok"],data["ner"]) 92 | assert len(data["tok"]) ==len(data["pos"]) , ( len(data["tok"]) ,len(data["pos"]),"\n",list(zip(data["tok"],data["pos"])) ,data["tok"],data["pos"]) 93 | 94 | def load_text_jamr(filepath): 95 | all_data = [] 96 | with open(filepath,'r') as f: 97 | line = f.readline() 98 | while line != '' : 99 | while line != '' and not line.startswith("# ::") : 100 | line = f.readline() 101 | 102 | if line == "": return all_data 103 | 104 | data = {} 105 | data.setdefault("align",{}) 106 | data.setdefault("node",{}) 107 | data.setdefault("edge",{}) 108 | while line.startswith("# ::"): 109 | load_line(line.replace("\n","").strip(),data) 110 | line = f.readline() 111 | amr_t = "" 112 | while line.strip() != '' and not line.startswith("# AMR release"): 113 | amr_t = amr_t+line 114 | line = f.readline() 115 | data["amr_t"] = amr_t 116 | asserting_equal_length(data) 117 | all_data.append(data) 118 | line = f.readline() 119 | return all_data 120 | 121 | 122 | def load_text_input(filepath): 123 | all_data = [] 124 | with open(filepath,'r') as f: 125 | line =
f.readline() 126 | while line != '' : 127 | while line != '' and not line.startswith("# ::"): 128 | line = f.readline() 129 | 130 | if line == "": return all_data 131 | 132 | data = {} 133 | while line.startswith("# ::"): 134 | load_line(line.replace("\n","").strip(),data) 135 | line = f.readline() 136 | all_data.append(data) 137 | line = f.readline() 138 | return all_data --------------------------------------------------------------------------------
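Illustrative usage sketch (not part of the repository): one way the helpers in utility/data_helper.py above might be driven, assuming a JAMR-style "# ::"-annotated AMR file at the placeholder path sample.aligned.txt and a writable placeholder cache file cache.pickle.

    # hypothetical_example.py -- illustrative only, not a file from the repository
    from utility.data_helper import Pickle_Helper, load_text_jamr

    # Read every "# ::"-annotated block: each entry collects the tok/lem/pos/ner
    # metadata lines, the node/edge/alignment records, and the raw AMR string
    # stored under "amr_t".
    blocks = load_text_jamr("sample.aligned.txt")  # placeholder path
    for data in blocks[:1]:
        print(data["tok"])    # tokens from the "# ::tok" line
        print(data["node"])   # node id -> concept label
        print(data["align"])  # node id -> index of the aligned token
        print(data["amr_t"])  # the AMR graph as annotated text

    # Cache the parsed blocks so later runs can skip re-reading the text file.
    cache = Pickle_Helper("cache.pickle")  # placeholder path
    cache.dump(blocks, "blocks")
    cache.save()
    loaded = cache.load()["blocks"]
    assert len(loaded) == len(blocks)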