├── .gitmodules
├── ner-system
│   ├── CMakeLists.txt
│   ├── attach_prediction.py
│   ├── convert-conll2trans.pl
│   ├── conll2parser.py
│   ├── example.conll-2003
│   ├── README.md
│   ├── c2.h
│   └── lstm-parse.cc
├── .gitignore
├── CMakeLists.txt
├── INSTALL.md
├── cmake
│   └── FindEigen3.cmake
└── README.md

/.gitmodules:
--------------------------------------------------------------------------------
[submodule "dynet"]
	path = dynet
	url = https://github.com/clab/dynet.git
	branch = master
--------------------------------------------------------------------------------
/ner-system/CMakeLists.txt:
--------------------------------------------------------------------------------
PROJECT(dynet:parser)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)

ADD_EXECUTABLE(lstm-parse lstm-parse.cc)
target_link_libraries(lstm-parse dynet ${Boost_LIBRARIES})

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app
--------------------------------------------------------------------------------
/ner-system/attach_prediction.py:
--------------------------------------------------------------------------------
import sys
import argparse

parser = argparse.ArgumentParser(description='Give Me Conll data.')
parser.add_argument('-p', type=str, help='prediction file')
parser.add_argument('-t', type=str, help='test file')
parser.add_argument('-o', type=str, help='output destination')
args = parser.parse_args()

f = open(args.o,'w')

for test_line, pred_line in zip(open(args.t), open(args.p)):
    test_line = test_line.strip().split()
    pred_line = pred_line.strip().split()

    if len(test_line) > 0:
        #assert test_line[0] == pred_line[0], "your prediction is not aligned to test file"
        test_line.append(pred_line[-1])
        f.write('{}\n'.format(" ".join(test_line)))
    else:
        f.write("\n")
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
project(dynet)
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
set(CMAKE_CXX_FLAGS "-Wall -std=c++11 -O3 -g")

enable_testing()

#include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dynet)
set(WITH_EIGEN_BACKEND 1)

# look for Boost
if(DEFINED ENV{BOOST_ROOT})
  set(Boost_NO_SYSTEM_PATHS ON)
endif()
set(Boost_REALPATH ON)
find_package(Boost COMPONENTS program_options serialization REQUIRED)
include_directories(${Boost_INCLUDE_DIR})
set(LIBS ${LIBS} ${Boost_LIBRARIES})

# look for Eigen
find_package(Eigen3 REQUIRED)
include_directories(${EIGEN3_INCLUDE_DIR})

#configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)

add_subdirectory(dynet/dynet)
# add_subdirectory(dynet/examples)
add_subdirectory(ner-system)

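# A typical out-of-source configure and build against this file (a sketch of
# the steps documented in INSTALL.md and README.md; the Eigen path below is an
# example, not a requirement):
#   export BOOST_ROOT=/path/to/boost   # only if Boost lives in a non-standard prefix
#   mkdir build && cd build
#   cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3
#   make -j2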
--------------------------------------------------------------------------------
/ner-system/convert-conll2trans.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my @toks;
my @tags;

while(<>) {
  chomp;
  if (/^\s*$/) {
    print "@toks ||| ";
    my $len = scalar @toks;
    my $i = 0;
    while ($i < $len) {
      if ($tags[$i] eq 'O') {
        print "OUT ";
        $i++;
      } elsif ($tags[$i] =~ /^(B|I)-(.+)$/) {
        my $tt = $2;
        my $x = "I-$tt";
        my $j = $i + 1;
        while ($j < $len && $tags[$j] eq $x) { $j++; }
        my @span = ();
        for (my $k = $i; $k < $j; $k++) {
          print "SHIFT ";
        }
        print "REDUCE($tt) ";
        $i = $j;
      } else {
        die "Bad input: $_\n";
      }
    }
    @toks = ();
    @tags = ();
    print "\n";
  } else {
    my @fields = split /\s+/;
    push @toks, "$fields[0]-$fields[1]";
    push @tags, $fields[-1];
  }
}

--------------------------------------------------------------------------------
/ner-system/conll2parser.py:
--------------------------------------------------------------------------------
import sys
import argparse

parser = argparse.ArgumentParser(description='Give Me Conll data.')
parser.add_argument('-f', type=str, help='conll file')
args = parser.parse_args()

for line in open(args.f):
    sentence, actions = map(lambda x: x.strip().split(), line.strip().split('|||'))
    stack_state, buffer_state = [], sentence
    #print sentence, actions

    sys.stdout.write('\n')
    for action in actions:
        sys.stdout.write('{}{}\n'.format(str(stack_state), str(buffer_state)))
        if action[0] == 'O':
            assert len(stack_state) == 0
            buffer_state = buffer_state[1:]
        elif action[0] == 'S':
            stack_state.append(buffer_state[0])
            buffer_state = buffer_state[1:]
        elif action[0] == 'R':
            stack_state = []
        sys.stdout.write('{}\n'.format(action))
    assert len(stack_state) == 0 and len(buffer_state) == 0
    sys.stdout.write('{}{}\n'.format(str(stack_state), str(buffer_state)))

--------------------------------------------------------------------------------
/ner-system/example.conll-2003:
--------------------------------------------------------------------------------
John NNP I-NP I-PER
Smith NNP N-NP I-PER
went VBD I-VP O
to TO I-PP O
Pittsburgh NNP I-NP I-LOC
. . O O

Refusing VBG I-VP O
to TO I-VP O
go VB I-VP O
quietly RB I-ADVP O
in IN I-PP O
the DT I-NP O
night NN I-NP O
, , O O
Stefan NNP I-NP I-PER
Edberg NNP I-NP I-PER
extended VBD I-VP O
his PRP$ I-NP O
stay NN I-NP O
at IN I-PP O
his PRP$ I-NP O
14th JJ I-NP O
and CC I-NP O
last JJ I-NP O
U.S. NNP I-NP I-MISC
Open NNP I-NP I-MISC
when WRB I-ADVP O
Bernd NNP I-NP I-PER
Karbacher NNP I-NP I-PER
, , O O
trailing VBG I-VP O
and CC O O
hurting VBG I-VP O
, , O O
quit VB I-VP O
in IN I-PP O
the DT I-NP O
fourth JJ I-NP O
set NN I-NP O
of IN I-PP O
their PRP$ I-NP O
second-round JJ I-NP O
match NN I-NP O
Friday NNP B-NP O
. . O O

AUGUST RB I-NP O
1996 CD I-NP O
CDU NNP I-NP I-ORG
/ SYM O I-ORG
CSU NNP I-NP I-ORG
SPD NNP I-NP B-ORG
FDP NNP I-NP B-ORG
Greens NNP I-NP B-ORG
PDS NNP I-NP B-ORG

--------------------------------------------------------------------------------
/ner-system/README.md:
--------------------------------------------------------------------------------
## Example

#### Desired labeling

    John Smith went to Pittsburgh .
    PER----- O O LOC O

Corresponding sequence of operations (generated by `convert-conll2trans.pl`)

    SHIFT
    SHIFT
    REDUCE(PER)
    OUT
    OUT
    SHIFT
    REDUCE(LOC)
    OUT

#### Data structures

* **buffer** - sequence of tokens, read from left to right
* **stack** - working memory
* **output buffer** - sequence of labeled segments constructed from left to right

#### Operations

* `SHIFT` - move word from the **buffer** to the top of the **stack**
* `REDUCE(X)` - all words on the **stack** are popped, combined into a single segment, labeled with `X`, and copied to the **output buffer**
* `OUT` - move one token from the **buffer** to the **output buffer**

#### Dataset & Preprocessing

Datasets are in /usr0/home/kkawakam/conll2003

Convert the CoNLL format to NER transition actions (`convert-conll2trans.pl`), then convert those into the parser-friendly format (`conll2parser.py`):

```bash
perl convert-conll2trans.pl conll2003/train > conll2003/train.trans
python conll2parser.py -f conll2003/train.trans > conll2003/train.parser
```

#### Training

    ./lstm-parse -T /usr0/home/kkawakam/conll2003/train.parser -d /usr0/home/kkawakam/conll2003/dev.parser --hidden_dim 100 --lstm_input_dim 100 -w /usr3/home/lingwang/chris/sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -t -S -D 0.3 > logNERYesCharNoPosYesEmbeddingsD0.3.txt &

#### Decoding

    ./lstm-parse -T /usr0/home/kkawakam/conll2003/train.parser -d /usr0/home/kkawakam/conll2003/test.parser --hidden_dim 100 --lstm_input_dim 100 -w /usr3/home/lingwang/chris/sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -m latest_model -S > output.txt
    python attach_prediction.py -p output.txt -t /usr0/home/kkawakam/conll2003/test -o evaloutput.txt

#### Evaluation

Attach your predictions to the test file:

```bash
python attach_prediction.py -p (prediction) -t /path/to/conll2003/test -o (output file)
./conlleval < (output file)
```
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
Minimum Requirements
======================
1. CMake 2.8.7
2. [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page)
3. `dynet` (does not have to be installed separately; it is installed below)

Installation Steps
======================
These steps have been tested on Ubuntu 16.04 and macOS Sierra.

1. Once this repository has been cloned, the `dynet/` submodule needs to be synced:

```bash
git submodule init
git submodule update
```
2. This downloads the required files into the `dynet` directory. Let this directory be `PATH_TO_DYNET`.

```bash
PATH_TO_DYNET=/dynet/
```
3. 
Download the C++ library `eigen`, which is used by dynet:

```bash
cd $HOME
hg clone https://bitbucket.org/eigen/eigen/
cd eigen
```
**Note:** There were compilation issues with some versions of `eigen`. This installation has been successful with `Eigen v3.3.1`.
4. Now, create a `build` directory and install eigen:

```bash
mkdir build
cd build
cmake ..
```
5. Run `sudo make install`. This will push the library files to the local `include` directory. On Ubuntu 16.04 and macOS Sierra, they are copied to `/usr/local/include/eigen3`.
6. Go back to the `dynet` directory in `stack-lstm-ner` and build `dynet`. Modify the code below with your `eigen3` `include` location and Boost location.

```bash
cd $PATH_TO_DYNET
mkdir build
cd build
cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3
make -j 2
```
**Note:** If DyNet fails to compile and throws an error like this:
```bash
$ make -j 2
Scanning dependencies of target dynet
Scanning dependencies of target dynet_shared
[ 1%] [ 2%] Building CXX object dynet/CMakeFiles/dynet.dir/cfsm-builder.cc.o
Building CXX object dynet/CMakeFiles/dynet_shared.dir/cfsm-builder.cc.o
In file included from /home/user/dynet/dynet/dynet.h:13:0,
                 from /home/user/dynet/dynet/cfsm-builder.h:6,
                 from /home/user/dynet/dynet/cfsm-builder.cc:1:
/home/user/dynet/dynet/tensor.h:22:42: fatal error: unsupported/Eigen/CXX11/Tensor: No such file or directory
 #include <unsupported/Eigen/CXX11/Tensor>
                                          ^
compilation terminated.
```
Then, download and install a stable version of Eigen and rebuild DyNet:

```bash
cd $HOME
wget u.cs.biu.ac.il/~yogo/eigen.tgz
tar zxvf eigen.tgz
cd eigen
```
Repeat step 4 and run:

```bash
cd $PATH_TO_DYNET/build
rm -rf *
```
Now, rebuild DyNet again.
7. Go back to the `stack-lstm-ner` root directory, create a `build` directory there, and inside it run `cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3` followed by `make`. This will build `lstm-parse` in `ner-system`.

Debugging build errors
========================
If you want to see the compile commands that are used, you can run

```bash
make VERBOSE=1
```
--------------------------------------------------------------------------------
/cmake/FindEigen3.cmake:
--------------------------------------------------------------------------------
# - Try to find Eigen3 lib
#
# This module supports requiring a minimum version, e.g. you can do
#   find_package(Eigen3 3.1.2)
# to require version 3.1.2 or newer of Eigen3.
#
# Once done this will define
#
#  EIGEN3_FOUND - system has eigen lib with correct version
#  EIGEN3_INCLUDE_DIR - the eigen include directory
#  EIGEN3_VERSION - eigen version

# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
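# In this repository the module is picked up via CMAKE_MODULE_PATH, which the
# top-level CMakeLists.txt points at the cmake/ directory; a minimal use,
# assuming that setup, looks like:
#   find_package(Eigen3 REQUIRED)
#   include_directories(${EIGEN3_INCLUDE_DIR})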

if(NOT Eigen3_FIND_VERSION)
  if(NOT Eigen3_FIND_VERSION_MAJOR)
    set(Eigen3_FIND_VERSION_MAJOR 2)
  endif(NOT Eigen3_FIND_VERSION_MAJOR)
  if(NOT Eigen3_FIND_VERSION_MINOR)
    set(Eigen3_FIND_VERSION_MINOR 91)
  endif(NOT Eigen3_FIND_VERSION_MINOR)
  if(NOT Eigen3_FIND_VERSION_PATCH)
    set(Eigen3_FIND_VERSION_PATCH 0)
  endif(NOT Eigen3_FIND_VERSION_PATCH)

  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
endif(NOT Eigen3_FIND_VERSION)

macro(_eigen3_check_version)
  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)

  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")

  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
    set(EIGEN3_VERSION_OK FALSE)
  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
    set(EIGEN3_VERSION_OK TRUE)
  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})

  if(NOT EIGEN3_VERSION_OK)

    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
                   "but at least version ${Eigen3_FIND_VERSION} is required")
  endif(NOT EIGEN3_VERSION_OK)
endmacro(_eigen3_check_version)

if (EIGEN3_INCLUDE_DIR)

  # in cache already
  _eigen3_check_version()
  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})

else (EIGEN3_INCLUDE_DIR)

  find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
      PATHS
      ${CMAKE_INSTALL_PREFIX}/include
      ${KDE4_INCLUDE_DIR}
      PATH_SUFFIXES eigen3 eigen
    )

  if(EIGEN3_INCLUDE_DIR)
    _eigen3_check_version()
  endif(EIGEN3_INCLUDE_DIR)

  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)

  mark_as_advanced(EIGEN3_INCLUDE_DIR)

endif(EIGEN3_INCLUDE_DIR)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Transition-based NER system

This system is part of a paper accepted at the NAACL-HLT 2016 conference.
See the paper here: http://arxiv.org/pdf/1603.01360v1.pdf

#### Desired labeling

    John Smith went to Pittsburgh .
    PER----- O O LOC O

Corresponding sequence of operations (generated by `convert-conll2trans.pl`)

    SHIFT
    SHIFT
    REDUCE(PER)
    OUT
    OUT
    SHIFT
    REDUCE(LOC)
    OUT

#### Data structures

* **buffer** - sequence of tokens, read from left to right
* **stack** - working memory
* **output buffer** - sequence of labeled segments constructed from left to right

#### Operations

* `SHIFT` - move word from the **buffer** to the top of the **stack**
* `REDUCE(X)` - all words on the **stack** are popped, combined into a single segment, labeled with `X`, and copied to the **output buffer**
* `OUT` - move one token from the **buffer** to the **output buffer**

A toy Python sketch of this transition scheme is included as an appendix near the end of this README.

#### Dataset & Preprocessing

We use the CoNLL 2002 and CoNLL 2003 datasets.

Convert the CoNLL format to NER transition actions (`convert-conll2trans.pl`), then convert those into the parser-friendly format (`conll2parser.py`):

```bash
perl convert-conll2trans.pl conll2003/train > conll2003/train.trans
python conll2parser.py -f conll2003/train.trans > conll2003/train.parser
```

If the words in the oracle carry the ' symbol, strip it from the training/test/dev datasets as follows:
```
[]['Peter-NNP', 'Blackburn-NNP'] --> [][Peter-NNP, Blackburn-NNP]
```

Link to the word vectors that we used in the NAACL 2016 paper for English: [sskip.100.vectors](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing).


#### Build the system

The first time you clone the repository, you need to sync the `dynet/` submodule:
```
git submodule init
git submodule update

mkdir build
cd build
cmake .. -DEIGEN3_INCLUDE_DIR=/path/to/eigen
make -j2
```

#### Training

    ./lstm-parse -T conll2003/train.parser -d conll2003/dev.parser --hidden_dim 100 --lstm_input_dim 100 -w sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -t -S -D 0.3 > logNERYesCharNoPosYesEmbeddingsD0.3.txt &

#### Decoding

    ./lstm-parse -T conll2003/train.parser -d conll2003/test.parser --hidden_dim 100 --lstm_input_dim 100 -w sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -m latest_model -S > output.txt
    python attach_prediction.py -p output.txt -t conll2003/test -o evaloutput.txt

#### Evaluation

Attach your predictions to the test file:

```bash
python attach_prediction.py -p (prediction) -t /path/to/conll2003/test -o (output file)
./conlleval < (output file)
```
#### Citation

If you make use of this software, please cite the following:

    @inproceedings{2016naacl,
      author={Guillaume Lample and Miguel Ballesteros and Kazuya Kawakami and Sandeep Subramanian and Chris Dyer},
      title={Neural Architectures for Named Entity Recognition},
      booktitle={Proc. NAACL-HLT},
      year=2016,
    }

#### License

This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
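
#### Appendix: transition scheme sketch

The mapping from BIO tags to the action sequence described under Operations is deterministic. The sketch below is ours rather than part of the toolkit: `bio_to_actions` is a hypothetical helper that mirrors the per-sentence logic of `convert-conll2trans.pl`, ignoring the word/POS pairing and the ` ||| ` separator that the real script emits.

```python
def bio_to_actions(tags):
    """Map one sentence's BIO tags to SHIFT/REDUCE(X)/OUT actions."""
    actions, i = [], 0
    while i < len(tags):
        if tags[i] == 'O':
            actions.append('OUT')                # token goes straight to the output buffer
            i += 1
        else:
            etype = tags[i].split('-', 1)[1]     # 'I-PER' or 'B-PER' -> 'PER'
            j = i + 1
            while j < len(tags) and tags[j] == 'I-' + etype:
                j += 1                           # consume the rest of the entity span
            actions.extend(['SHIFT'] * (j - i))  # push each entity word onto the stack
            actions.append('REDUCE({})'.format(etype))  # pop, label, move to output
            i = j
    return actions

print(bio_to_actions(['I-PER', 'I-PER', 'O', 'O', 'I-LOC', 'O']))
# ['SHIFT', 'SHIFT', 'REDUCE(PER)', 'OUT', 'OUT', 'SHIFT', 'REDUCE(LOC)', 'OUT']
```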
101 | 102 | #### Contact 103 | 104 | For questions and usage issues, please contact miguel.ballesteros@upf.edu 105 | 106 | -------------------------------------------------------------------------------- /ner-system/c2.h: -------------------------------------------------------------------------------- 1 | #ifndef CPYPDICT_H_ 2 | #define CPYPDICT_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace cpyp { 18 | 19 | class Corpus { 20 | //typedef std::unordered_map > Map; 21 | // typedef std::unordered_map > ReverseMap; 22 | public: 23 | bool USE_SPELLING=false; 24 | 25 | std::map> correct_act_sent; 26 | std::map> sentences; 27 | std::map> sentencesPos; 28 | 29 | std::map> correct_act_sentDev; 30 | std::map> sentencesDev; 31 | std::map> sentencesPosDev; 32 | std::map> sentencesStrDev; 33 | unsigned nsentencesDev; 34 | 35 | unsigned nsentences; 36 | unsigned nwords; 37 | unsigned nactions; 38 | unsigned npos; 39 | 40 | unsigned nsentencestest; 41 | unsigned nsentencesdev; 42 | int max; 43 | int maxPos; 44 | 45 | std::map wordsToInt; 46 | std::map intToWords; 47 | std::vector actions; 48 | 49 | std::map posToInt; 50 | std::map intToPos; 51 | 52 | int maxChars; 53 | std::map charsToInt; 54 | std::map intToChars; 55 | 56 | // String literals 57 | static constexpr const char* UNK = "UNK"; 58 | static constexpr const char* BAD0 = ""; 59 | 60 | /* std::map* headsTraining; 61 | std::map* labelsTraining; 62 | 63 | std::map* headsParsing; 64 | std::map* labelsParsing;*/ 65 | 66 | 67 | 68 | public: 69 | Corpus() { 70 | max = 0; 71 | maxPos = 0; 72 | maxChars=0; //Miguel 73 | } 74 | 75 | 76 | inline unsigned UTF8Len(unsigned char x) { 77 | if (x < 0x80) return 1; 78 | else if ((x >> 5) == 0x06) return 2; 79 | else if ((x >> 4) == 0x0e) return 3; 80 | else if ((x >> 3) == 0x1e) return 4; 81 | else if ((x >> 2) == 0x3e) return 5; 82 | else if ((x >> 1) == 0x7e) return 6; 83 | else return 0; 84 | } 85 | 86 | 87 | 88 | 89 | inline void load_correct_actions(std::string file){ 90 | 91 | std::ifstream actionsFile(file); 92 | //correct_act_sent=new vector>(); 93 | std::string lineS; 94 | 95 | int count=-1; 96 | int sentence=-1; 97 | bool initial=false; 98 | bool first=true; 99 | wordsToInt[Corpus::BAD0] = 0; 100 | intToWords[0] = Corpus::BAD0; 101 | wordsToInt[Corpus::UNK] = 1; // unknown symbol 102 | intToWords[1] = Corpus::UNK; 103 | assert(max == 0); 104 | assert(maxPos == 0); 105 | max=2; 106 | maxPos=1; 107 | 108 | charsToInt[BAD0]=1; 109 | intToChars[1]="BAD0"; 110 | maxChars=1; 111 | 112 | std::vector current_sent; 113 | std::vector current_sent_pos; 114 | while (getline(actionsFile, lineS)){ 115 | //istringstream iss(line); 116 | //string lineS; 117 | //iss>>lineS; 118 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_"); 119 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_"); 120 | if (lineS.empty()) { 121 | count = 0; 122 | if (!first) { 123 | sentences[sentence] = current_sent; 124 | sentencesPos[sentence] = current_sent_pos; 125 | } 126 | 127 | sentence++; 128 | nsentences = sentence; 129 | 130 | initial = true; 131 | current_sent.clear(); 132 | current_sent_pos.clear(); 133 | } else if (count == 0) { 134 | first = false; 135 | //stack and buffer, for now, leave it like this. 
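// Oracle files alternate a state line (the stack/buffer dump) with an action
// line; having just read a state line, the next line is expected to be an action.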
136 | count = 1; 137 | if (initial) { 138 | // the initial line in each sentence may look like: 139 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT] 140 | // first, get rid of the square brackets. 141 | lineS = lineS.substr(3, lineS.size() - 4); 142 | // read the initial line, token by token "the-det," "cat-noun," ... 143 | std::istringstream iss(lineS); 144 | do { 145 | std::string word; 146 | iss >> word; 147 | if (word.size() == 0) { continue; } 148 | // remove the trailing comma if need be. 149 | if (word[word.size() - 1] == ',') { 150 | word = word.substr(0, word.size() - 1); 151 | } 152 | // split the string (at '-') into word and POS tag. 153 | size_t posIndex = word.rfind('-'); 154 | if (posIndex == std::string::npos) { 155 | std::cerr << "cant find the dash in '" << word << "'" << std::endl; 156 | } 157 | assert(posIndex != std::string::npos); 158 | std::string pos = word.substr(posIndex + 1); 159 | word = word.substr(0, posIndex); 160 | // new POS tag 161 | if (posToInt[pos] == 0) { 162 | posToInt[pos] = maxPos; 163 | intToPos[maxPos] = pos; 164 | npos = maxPos; 165 | maxPos++; 166 | } 167 | 168 | // new word 169 | if (wordsToInt[word] == 0) { 170 | wordsToInt[word] = max; 171 | intToWords[max] = word; 172 | nwords = max; 173 | max++; 174 | 175 | unsigned j = 0; 176 | while(j < word.length()) { 177 | std::string wj = ""; 178 | for (unsigned h = j; h < j + UTF8Len(word[j]); h++) { 179 | wj += word[h]; 180 | } 181 | if (charsToInt[wj] == 0) { 182 | charsToInt[wj] = maxChars; 183 | intToChars[maxChars] = wj; 184 | maxChars++; 185 | } 186 | j += UTF8Len(word[j]); 187 | } 188 | } 189 | 190 | current_sent.push_back(wordsToInt[word]); 191 | current_sent_pos.push_back(posToInt[pos]); 192 | } while(iss); 193 | } 194 | initial=false; 195 | } 196 | else if (count==1){ 197 | int i=0; 198 | bool found=false; 199 | for (auto a: actions) { 200 | if (a==lineS) { 201 | std::vector a=correct_act_sent[sentence]; 202 | a.push_back(i); 203 | correct_act_sent[sentence]=a; 204 | found=true; 205 | } 206 | i++; 207 | } 208 | if (!found) { 209 | actions.push_back(lineS); 210 | std::vector a=correct_act_sent[sentence]; 211 | a.push_back(actions.size()-1); 212 | correct_act_sent[sentence]=a; 213 | } 214 | count=0; 215 | } 216 | } 217 | 218 | // Add the last sentence. 219 | if (current_sent.size() > 0) { 220 | sentences[sentence] = current_sent; 221 | sentencesPos[sentence] = current_sent_pos; 222 | sentence++; 223 | nsentences = sentence; 224 | } 225 | 226 | actionsFile.close(); 227 | /* std::string oov="oov"; 228 | posToInt[oov]=maxPos; 229 | intToPos[maxPos]=oov; 230 | npos=maxPos; 231 | maxPos++; 232 | wordsToInt[oov]=max; 233 | intToWords[max]=oov; 234 | nwords=max; 235 | max++;*/ 236 | 237 | std::cerr<<"done"<<"\n"; 238 | for (auto a: actions) { 239 | std::cerr< 1); 267 | assert(max > 3); 268 | int count = -1; 269 | int sentence = -1; 270 | bool initial = false; 271 | bool first = true; 272 | std::vector current_sent; 273 | std::vector current_sent_pos; 274 | std::vector current_sent_str; 275 | while (getline(actionsFile, lineS)) { 276 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_"); 277 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_"); 278 | if (lineS.empty()) { 279 | // an empty line marks the end of a sentence. 
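// Resetting 'count' means the next non-empty line is read as the initial
// stack/buffer state of the following sentence.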
280 | count = 0; 281 | if (!first) { 282 | sentencesDev[sentence] = current_sent; 283 | sentencesPosDev[sentence] = current_sent_pos; 284 | sentencesStrDev[sentence] = current_sent_str; 285 | } 286 | 287 | sentence++; 288 | nsentencesDev = sentence; 289 | 290 | initial = true; 291 | current_sent.clear(); 292 | current_sent_pos.clear(); 293 | current_sent_str.clear(); 294 | } else if (count == 0) { 295 | first = false; 296 | //stack and buffer, for now, leave it like this. 297 | count = 1; 298 | if (initial) { 299 | // the initial line in each sentence may look like: 300 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT] 301 | // first, get rid of the square brackets. 302 | lineS = lineS.substr(3, lineS.size() - 4); 303 | // read the initial line, token by token "the-det," "cat-noun," ... 304 | std::istringstream iss(lineS); 305 | do { 306 | std::string word; 307 | iss >> word; 308 | if (word.size() == 0) { continue; } 309 | // remove the trailing comma if need be. 310 | if (word[word.size() - 1] == ',') { 311 | word = word.substr(0, word.size() - 1); 312 | } 313 | // split the string (at '-') into word and POS tag. 314 | size_t posIndex = word.rfind('-'); 315 | assert(posIndex != std::string::npos); 316 | std::string pos = word.substr(posIndex + 1); 317 | word = word.substr(0, posIndex); 318 | // new POS tag 319 | if (posToInt[pos] == 0) { 320 | posToInt[pos] = maxPos; 321 | intToPos[maxPos] = pos; 322 | npos = maxPos; 323 | maxPos++; 324 | } 325 | // add an empty string for any token except OOVs (it is easy to 326 | // recover the surface form of non-OOV using intToWords(id)). 327 | current_sent_str.push_back(""); 328 | // OOV word 329 | if (wordsToInt[word] == 0) { 330 | if (USE_SPELLING) { 331 | max = nwords + 1; 332 | //std::cerr<< "max:" << max << "\n"; 333 | wordsToInt[word] = max; 334 | intToWords[max] = word; 335 | nwords = max; 336 | } else { 337 | // save the surface form of this OOV before overwriting it. 338 | current_sent_str[current_sent_str.size()-1] = word; 339 | word = Corpus::UNK; 340 | } 341 | } 342 | current_sent.push_back(wordsToInt[word]); 343 | current_sent_pos.push_back(posToInt[pos]); 344 | } while(iss); 345 | } 346 | initial = false; 347 | } else if (count == 1) { 348 | auto actionIter = std::find(actions.begin(), actions.end(), lineS); 349 | if (actionIter != actions.end()) { 350 | unsigned actionIndex = std::distance(actions.begin(), actionIter); 351 | correct_act_sentDev[sentence].push_back(actionIndex); 352 | } else { 353 | // TODO: right now, new actions which haven't been observed in training 354 | // are not added to correct_act_sentDev. This may be a problem if the 355 | // training data is little. 356 | } 357 | count=0; 358 | } 359 | } 360 | 361 | // Add the last sentence. 
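// Guards against oracle files that lack a trailing blank line, which would
// otherwise drop their final sentence.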
362 | if (current_sent.size() > 0) { 363 | sentencesDev[sentence] = current_sent; 364 | sentencesPosDev[sentence] = current_sent_pos; 365 | sentencesStrDev[sentence] = current_sent_str; 366 | sentence++; 367 | nsentencesDev = sentence; 368 | } 369 | 370 | actionsFile.close(); 371 | } 372 | 373 | void ReplaceStringInPlace(std::string& subject, const std::string& search, 374 | const std::string& replace) { 375 | size_t pos = 0; 376 | while ((pos = subject.find(search, pos)) != std::string::npos) { 377 | subject.replace(pos, search.length(), replace); 378 | pos += replace.length(); 379 | } 380 | } 381 | 382 | 383 | /* inline unsigned max() const { return words_.size(); } 384 | inline unsigned size() const { return words_.size(); } 385 | inline unsigned count(const std::string& word) const { return d_.count(word); }*/ 386 | 387 | /* static bool is_ws(char x) { 388 | return (x == ' ' || x == '\t'); 389 | } 390 | 391 | inline void ConvertWhitespaceDelimitedLine(const std::string& line, std::vector* out) { 392 | size_t cur = 0; 393 | size_t last = 0; 394 | int state = 0; 395 | out->clear(); 396 | while(cur < line.size()) { 397 | if (is_ws(line[cur++])) { 398 | if (state == 0) continue; 399 | out->push_back(Convert(line.substr(last, cur - last - 1))); 400 | state = 0; 401 | } else { 402 | if (state == 1) continue; 403 | last = cur - 1; 404 | state = 1; 405 | } 406 | } 407 | if (state == 1) 408 | out->push_back(Convert(line.substr(last, cur - last))); 409 | } 410 | 411 | inline unsigned Convert(const std::string& word, bool frozen = false) { 412 | Map::iterator i = d_.find(word); 413 | if (i == d_.end()) { 414 | if (frozen) 415 | return 0; 416 | words_.push_back(word); 417 | d_[word] = words_.size(); 418 | return words_.size(); 419 | } else { 420 | return i->second; 421 | } 422 | } 423 | 424 | inline const std::string& Convert(const unsigned id) const { 425 | if (id == 0) return b0_; 426 | return words_[id-1]; 427 | } 428 | template void serialize(Archive& ar, const unsigned int version) { 429 | ar & b0_; 430 | ar & words_; 431 | ar & d_; 432 | } 433 | private: 434 | std::string b0_; 435 | std::vector words_; 436 | Map d_;*/ 437 | }; 438 | 439 | /*void ReadFromFile(const std::string& filename, 440 | Corpus* d, 441 | std::vector >* src, 442 | std::set* src_vocab) { 443 | std::cerr << "Reading from " << filename << std::endl; 444 | std::ifstream in(filename); 445 | assert(in); 446 | std::string line; 447 | int lc = 0; 448 | while(getline(in, line)) { 449 | ++lc; 450 | src->push_back(std::vector()); 451 | d->ConvertWhitespaceDelimitedLine(line, &src->back()); 452 | for (unsigned i = 0; i < src->back().size(); ++i) src_vocab->insert(src->back()[i]); 453 | } 454 | } 455 | 456 | void ReadParallelCorpusFromFile(const std::string& filename, 457 | Corpus* d, 458 | std::vector >* src, 459 | std::vector >* trg, 460 | std::set* src_vocab, 461 | std::set* trg_vocab) { 462 | src->clear(); 463 | trg->clear(); 464 | std::cerr << "Reading from " << filename << std::endl; 465 | std::ifstream in(filename); 466 | assert(in); 467 | std::string line; 468 | int lc = 0; 469 | std::vector v; 470 | const unsigned kDELIM = d->Convert("|||"); 471 | while(getline(in, line)) { 472 | ++lc; 473 | src->push_back(std::vector()); 474 | trg->push_back(std::vector()); 475 | d->ConvertWhitespaceDelimitedLine(line, &v); 476 | unsigned j = 0; 477 | while(j < v.size() && v[j] != kDELIM) { 478 | src->back().push_back(v[j]); 479 | src_vocab->insert(v[j]); 480 | ++j; 481 | } 482 | if (j >= v.size()) { 483 | std::cerr << "Malformed input in 
parallel corpus: " << filename << ":" << lc << std::endl; 484 | abort(); 485 | } 486 | ++j; 487 | while(j < v.size()) { 488 | trg->back().push_back(v[j]); 489 | trg_vocab->insert(v[j]); 490 | ++j; 491 | } 492 | } 493 | }*/ 494 | 495 | } // namespace 496 | 497 | #endif 498 | -------------------------------------------------------------------------------- /ner-system/lstm-parse.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | // #include 19 | // #include 20 | #include 21 | 22 | #include "dynet/training.h" 23 | #include "dynet/dynet.h" 24 | #include "dynet/expr.h" 25 | #include "dynet/nodes.h" 26 | #include "dynet/lstm.h" 27 | #include "dynet/rnn.h" 28 | #include "dynet/io.h" 29 | #include "c2.h" 30 | 31 | cpyp::Corpus corpus; 32 | volatile bool requested_stop = false; 33 | unsigned LAYERS = 2; 34 | unsigned INPUT_DIM = 40; 35 | unsigned HIDDEN_DIM = 60; 36 | unsigned ACTION_DIM = 36; 37 | unsigned PRETRAINED_DIM = 50; 38 | unsigned LSTM_INPUT_DIM = 60; 39 | unsigned POS_DIM = 10; 40 | unsigned REL_DIM = 8; 41 | 42 | 43 | unsigned LSTM_CHAR_OUTPUT_DIM = 100; //Miguel 44 | bool USE_SPELLING = false; 45 | 46 | float DROPOUT = 0.0f; 47 | 48 | bool USE_POS = false; 49 | 50 | constexpr const char* ROOT_SYMBOL = "ROOT"; 51 | unsigned kROOT_SYMBOL = 0; 52 | unsigned ACTION_SIZE = 0; 53 | unsigned VOCAB_SIZE = 0; 54 | unsigned POS_SIZE = 0; 55 | 56 | unsigned CHAR_SIZE = 255; //size of ascii chars... Miguel 57 | 58 | using namespace dynet; 59 | using namespace std; 60 | namespace po = boost::program_options; 61 | 62 | vector possible_actions; 63 | unordered_map> pretrained; 64 | 65 | void InitCommandLine(int argc, char** argv, po::variables_map* conf) { 66 | po::options_description opts("Configuration options"); 67 | opts.add_options() 68 | ("training_data,T", po::value(), "List of Transitions - Training corpus") 69 | ("dev_data,d", po::value(), "Development corpus") 70 | ("test_data,p", po::value(), "Test corpus") 71 | ("dropout,D", po::value(), "Dropout rate") 72 | ("unk_strategy,o", po::value()->default_value(1), "Unknown word strategy: 1 = singletons become UNK with probability unk_prob") 73 | ("unk_prob,u", po::value()->default_value(0.2), "Probably with which to replace singletons with UNK in training data") 74 | ("model,m", po::value(), "Load saved model from this file") 75 | ("use_pos_tags,P", "make POS tags visible to parser") 76 | ("beam_size,b", po::value()->default_value(1), "beam size") 77 | ("layers", po::value()->default_value(2), "number of LSTM layers") 78 | ("action_dim", po::value()->default_value(16), "action embedding size") 79 | ("input_dim", po::value()->default_value(32), "input embedding size") 80 | ("hidden_dim", po::value()->default_value(64), "hidden dimension") 81 | ("pretrained_dim", po::value()->default_value(50), "pretrained input dimension") 82 | ("pos_dim", po::value()->default_value(12), "POS dimension") 83 | ("rel_dim", po::value()->default_value(10), "relation dimension") 84 | ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") 85 | ("train,t", "Should training be run?") 86 | ("words,w", po::value(), "Pretrained word embeddings") 87 | ("use_spelling,S", "Use spelling model") //Miguel. 
Spelling model 88 | ("help,h", "Help"); 89 | po::options_description dcmdline_options; 90 | dcmdline_options.add(opts); 91 | po::store(parse_command_line(argc, argv, dcmdline_options), *conf); 92 | if (conf->count("help")) { 93 | cerr << dcmdline_options << endl; 94 | exit(1); 95 | } 96 | if (conf->count("training_data") == 0) { 97 | cerr << "Please specify --traing_data (-T): this is required to determine the vocabulary mapping, even if the parser is used in prediction mode.\n"; 98 | exit(1); 99 | } 100 | } 101 | 102 | struct ParserBuilder { 103 | 104 | LSTMBuilder stack_lstm; // (layers, input, hidden, trainer) 105 | LSTMBuilder output_lstm; // (layers, input, hidden, trainer) 106 | LSTMBuilder buffer_lstm; 107 | LSTMBuilder action_lstm; 108 | 109 | 110 | LSTMBuilder ent_lstm_fwd; 111 | LSTMBuilder ent_lstm_rev; 112 | 113 | LookupParameter p_w; // word embeddings 114 | LookupParameter p_t; // pretrained word embeddings (not updated) 115 | LookupParameter p_a; // input action embeddings 116 | LookupParameter p_r; // relation embeddings 117 | LookupParameter p_p; // pos tag embeddings 118 | Parameter p_pbias; // parser state bias 119 | Parameter p_A; // action lstm to parser state 120 | Parameter p_B; // buffer lstm to parser state 121 | Parameter p_O; // output lstm to parser state 122 | 123 | Parameter p_S; // stack lstm to parser state 124 | Parameter p_H; // head matrix for composition function 125 | Parameter p_D; // dependency matrix for composition function 126 | Parameter p_R; // relation matrix for composition function 127 | Parameter p_w2l; // word to LSTM input 128 | Parameter p_p2l; // POS to LSTM input 129 | Parameter p_t2l; // pretrained word embeddings to LSTM input 130 | Parameter p_ib; // LSTM input bias 131 | Parameter p_cbias; // composition function bias 132 | Parameter p_p2a; // parser state to action 133 | Parameter p_action_start; // action bias 134 | Parameter p_abias; // action bias 135 | Parameter p_buffer_guard; // end of buffer 136 | Parameter p_stack_guard; // end of stack 137 | Parameter p_output_guard; // end of output buffer 138 | 139 | Parameter p_start_of_word;//Miguel -->dummy symbol 140 | Parameter p_end_of_word; //Miguel --> dummy symbol 141 | LookupParameter char_emb; //Miguel-> mapping of characters to vectors 142 | 143 | 144 | LSTMBuilder fw_char_lstm; // Miguel 145 | LSTMBuilder bw_char_lstm; //Miguel 146 | 147 | Parameter p_cW; 148 | 149 | 150 | explicit ParserBuilder(ParameterCollection & model, const unordered_map>& pretrained) : 151 | stack_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 152 | output_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 153 | buffer_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 154 | action_lstm(LAYERS, ACTION_DIM, HIDDEN_DIM, model), 155 | ent_lstm_fwd(LAYERS, LSTM_INPUT_DIM, LSTM_INPUT_DIM, model), 156 | ent_lstm_rev(LAYERS, LSTM_INPUT_DIM, LSTM_INPUT_DIM, model), 157 | p_w(model.add_lookup_parameters(VOCAB_SIZE, {INPUT_DIM})), 158 | p_a(model.add_lookup_parameters(ACTION_SIZE, {ACTION_DIM})), 159 | p_r(model.add_lookup_parameters(ACTION_SIZE, {REL_DIM})), 160 | p_pbias(model.add_parameters({HIDDEN_DIM})), 161 | p_A(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 162 | p_B(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 163 | p_O(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 164 | p_S(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 165 | p_H(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM})), 166 | p_D(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM})), 167 | 
p_R(model.add_parameters({LSTM_INPUT_DIM, REL_DIM})), 168 | p_w2l(model.add_parameters({LSTM_INPUT_DIM, INPUT_DIM})), 169 | p_ib(model.add_parameters({LSTM_INPUT_DIM})), 170 | p_cbias(model.add_parameters({LSTM_INPUT_DIM})), 171 | p_p2a(model.add_parameters({ACTION_SIZE, HIDDEN_DIM})), 172 | p_action_start(model.add_parameters({ACTION_DIM})), 173 | p_abias(model.add_parameters({ACTION_SIZE})), 174 | 175 | p_buffer_guard(model.add_parameters({LSTM_INPUT_DIM})), 176 | p_stack_guard(model.add_parameters({LSTM_INPUT_DIM})), 177 | p_output_guard(model.add_parameters({LSTM_INPUT_DIM})), 178 | 179 | p_start_of_word(model.add_parameters({LSTM_INPUT_DIM})), //Miguel 180 | p_end_of_word(model.add_parameters({LSTM_INPUT_DIM})), //Miguel 181 | 182 | char_emb(model.add_lookup_parameters(CHAR_SIZE, {INPUT_DIM})),//Miguel 183 | 184 | // fw_char_lstm(LAYERS, LSTM_CHAR_OUTPUT_DIM, LSTM_INPUT_DIM, model), //Miguel 185 | // bw_char_lstm(LAYERS, LSTM_CHAR_OUTPUT_DIM, LSTM_INPUT_DIM, model), //Miguel 186 | 187 | fw_char_lstm(LAYERS, LSTM_INPUT_DIM, LSTM_CHAR_OUTPUT_DIM/2, model), //Miguel 188 | bw_char_lstm(LAYERS, LSTM_INPUT_DIM, LSTM_CHAR_OUTPUT_DIM/2, model), /*Miguel*/ 189 | p_cW(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM * 2})) { //ner. { 190 | if (USE_POS) { 191 | p_p = model.add_lookup_parameters(POS_SIZE, {POS_DIM}); 192 | p_p2l = model.add_parameters({LSTM_INPUT_DIM, POS_DIM}); 193 | } 194 | if (pretrained.size() > 0) { 195 | p_t = model.add_lookup_parameters(VOCAB_SIZE, {PRETRAINED_DIM}); 196 | for (auto it : pretrained) 197 | p_t.initialize(it.first, it.second); 198 | p_t2l = model.add_parameters({LSTM_INPUT_DIM, PRETRAINED_DIM}); 199 | } else { 200 | p_t = nullptr; 201 | p_t2l = nullptr; 202 | } 203 | } 204 | 205 | static bool IsActionForbidden(const string& a, unsigned bsize, unsigned ssize, vector stacki) { 206 | 207 | bool is_shift = (a[0] == 'S'); //MIGUEL 208 | bool is_reduce = (a[0] == 'R'); 209 | bool is_output = (a[0] == 'O'); 210 | // std::cout<<"is red:"< 2 && // there is more than a single element on the stack 225 | is_shift) return true; 226 | // only attach left to ROOT 227 | if (bsize == 1 && ssize == 3 && a[0] == 'R') return true; 228 | return false; 229 | }*/ 230 | 231 | map compute_ents(unsigned sent_len, const vector& actions, const vector& setOfActions, map* pr = nullptr) { 232 | map r; 233 | map& rels = (pr ? 
*pr : r); 234 | for(unsigned i=0;i bufferi(sent_len + 1, 0), stacki(1, -999), outputi(1, -999); 236 | for (unsigned i = 0; i < sent_len; ++i) 237 | bufferi[sent_len - i] = i; 238 | bufferi[0] = -999; 239 | 240 | for (auto action: actions) { // loop over transitions for sentence 241 | const string& actionString=setOfActions[action]; 242 | // std::cout<<"int"< 1); // dummy symbol means > 1 (not >= 1) 246 | stacki.push_back(bufferi.back()); 247 | bufferi.pop_back(); 248 | } 249 | 250 | else if (ac=='R') { // REDUCE 251 | assert(stacki.size() > 1); // dummy symbol means > 2 (not >= 2) 252 | while(stacki.size()>1) { 253 | rels[stacki.back()] = actionString; 254 | outputi.push_back(stacki.back()); 255 | stacki.pop_back(); 256 | } 257 | } 258 | else if (ac =='O') { 259 | assert(bufferi.size() > 1); // dummy symbol means > 1 (not >= 1) 260 | outputi.push_back(bufferi.back()); 261 | rels[bufferi.back()] = "0"; 262 | bufferi.pop_back(); 263 | 264 | } 265 | } 266 | assert(bufferi.size() == 1); 267 | //assert(stacki.size() == 2); 268 | return rels; 269 | } 270 | 271 | 272 | // given the first character of a UTF8 block, find out how wide it is 273 | // see http://en.wikipedia.org/wiki/UTF-8 for more info 274 | inline unsigned int UTF8Len(unsigned char x) { 275 | if (x < 0x80) return 1; 276 | else if ((x >> 5) == 0x06) return 2; 277 | else if ((x >> 4) == 0x0e) return 3; 278 | else if ((x >> 3) == 0x1e) return 4; 279 | else if ((x >> 2) == 0x3e) return 5; 280 | else if ((x >> 1) == 0x7e) return 6; 281 | else return 0; 282 | } 283 | 284 | 285 | // *** if correct_actions is empty, this runs greedy decoding *** 286 | // returns parse actions for input sentence (in training just returns the reference) 287 | // OOV handling: raw_sent will have the actual words 288 | // sent will have words replaced by appropriate UNK tokens 289 | // this lets us use pretrained embeddings, when available, for words that were OOV in the 290 | // parser training data 291 | pair, Expression> log_prob_parser(ComputationGraph* hg, 292 | const vector& raw_sent, // raw sentence 293 | const vector& sent, // sent with oovs replaced 294 | const vector& sentPos, 295 | const vector& correct_actions, 296 | const vector& setOfActions, 297 | const map& intToWords, 298 | bool is_evaluation, 299 | double *right) { 300 | //for (unsigned i = 0; i < sent.size(); ++i) cerr << ' ' << intToWords.find(sent[i])->second; 301 | //cerr << endl; 302 | vector results; 303 | const bool build_training_graph = correct_actions.size() > 0; 304 | //std::cout<<"****************"<<"\n"; 305 | bool apply_dropout = (DROPOUT && !is_evaluation); 306 | 307 | stack_lstm.new_graph(*hg); 308 | buffer_lstm.new_graph(*hg); 309 | output_lstm.new_graph(*hg); 310 | action_lstm.new_graph(*hg); 311 | 312 | ent_lstm_fwd.new_graph(*hg); 313 | ent_lstm_rev.new_graph(*hg); 314 | 315 | 316 | 317 | if (apply_dropout) { 318 | stack_lstm.set_dropout(DROPOUT); 319 | action_lstm.set_dropout(DROPOUT); 320 | buffer_lstm.set_dropout(DROPOUT); 321 | ent_lstm_fwd.set_dropout(DROPOUT); 322 | ent_lstm_rev.set_dropout(DROPOUT); 323 | } else { 324 | stack_lstm.disable_dropout(); 325 | action_lstm.disable_dropout(); 326 | buffer_lstm.disable_dropout(); 327 | ent_lstm_fwd.disable_dropout(); 328 | ent_lstm_rev.disable_dropout(); 329 | } 330 | 331 | stack_lstm.start_new_sequence(); 332 | buffer_lstm.start_new_sequence(); 333 | output_lstm.start_new_sequence(); 334 | action_lstm.start_new_sequence(); 335 | // variables in the computation graph representing the parameters 336 | Expression pbias = 
parameter(*hg, p_pbias); 337 | // Expression H = parameter(*hg, p_H); 338 | // Expression D = parameter(*hg, p_D); 339 | Expression R = parameter(*hg, p_R); 340 | Expression cbias = parameter(*hg, p_cbias); 341 | Expression S = parameter(*hg, p_S); 342 | Expression B = parameter(*hg, p_B); 343 | Expression O = parameter(*hg, p_O); 344 | 345 | Expression A = parameter(*hg, p_A); 346 | Expression ib = parameter(*hg, p_ib); 347 | Expression w2l = parameter(*hg, p_w2l); 348 | Expression p2l; 349 | if (USE_POS) 350 | p2l = parameter(*hg, p_p2l); 351 | Expression t2l; 352 | if (p_t2l.p) 353 | t2l = parameter(*hg, p_t2l); 354 | Expression p2a = parameter(*hg, p_p2a); 355 | Expression abias = parameter(*hg, p_abias); 356 | Expression action_start = parameter(*hg, p_action_start); 357 | 358 | action_lstm.add_input(action_start); 359 | 360 | Expression cW = parameter(*hg, p_cW); 361 | 362 | vector buffer(sent.size() + 1); // variables representing word embeddings (possibly including POS info) 363 | vector bufferi(sent.size() + 1); // position of the words in the sentence 364 | // precompute buffer representation from left to right 365 | 366 | 367 | Expression word_end = parameter(*hg, p_end_of_word); //Miguel 368 | Expression word_start = parameter(*hg, p_start_of_word); //Miguel 369 | 370 | if (USE_SPELLING){ 371 | fw_char_lstm.new_graph(*hg); 372 | // fw_char_lstm.add_parameter_edges(hg); 373 | 374 | bw_char_lstm.new_graph(*hg); 375 | // bw_char_lstm.add_parameter_edges(hg); 376 | } 377 | 378 | 379 | 380 | for (unsigned i = 0; i < sent.size(); ++i) { 381 | assert(sent[i] < VOCAB_SIZE); 382 | //Expression w = lookup(*hg, p_w, sent[i]); 383 | 384 | unsigned wi=sent[i]; 385 | std::string ww=intToWords.at(wi); 386 | Expression w; 387 | /**********SPELLING MODEL*****************/ 388 | if (USE_SPELLING) { 389 | //std::cout<<"using spelling"<<"\n"; 390 | if (ww.length()==4 && ww[0]=='R' && ww[1]=='O' && ww[2]=='O' && ww[3]=='T'){ 391 | w=lookup(*hg, p_w, sent[i]); //we do not need a LSTM encoding for the root word, so we put it directly-. 392 | } 393 | else { 394 | 395 | fw_char_lstm.start_new_sequence(); 396 | //cerr<<"start_new_sequence done"<<"\n"; 397 | 398 | fw_char_lstm.add_input(word_start); 399 | //cerr<<"added start of word symbol"<<"\n"; 400 | /*for (unsigned j=0;jincremental_forward(); 408 | 409 | }*/ 410 | std::vector strevbuffer; 411 | for (unsigned j=0;jincremental_forward(); 425 | 426 | } 427 | fw_char_lstm.add_input(word_end); 428 | //cerr<<"added end of word symbol"<<"\n"; 429 | 430 | 431 | 432 | Expression fw_i=fw_char_lstm.back(); 433 | 434 | //cerr<<"fw_char_lstm.back() done"<<"\n"; 435 | 436 | bw_char_lstm.start_new_sequence(); 437 | //cerr<<"bw start new sequence done"<<"\n"; 438 | 439 | bw_char_lstm.add_input(word_end); 440 | //for (unsigned j=w.length()-1;j>=0;j--){ 441 | /*for (unsigned j=w.length();j-->0;){ 442 | //cerr<0;j=j-UTF8Len(w[j])) { 456 | 457 | //cerr<incremental_forward(); 467 | 468 | }*/ 469 | bw_char_lstm.add_input(word_start); 470 | //cerr<<"start symbol in bw seq"<<"\n"; 471 | 472 | Expression bw_i=bw_char_lstm.back(); 473 | 474 | vector tt = {fw_i, bw_i}; 475 | w=concatenate(tt); //and this goes into the buffer... 
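// i.e. the word representation is the concatenation of the final forward and
// backward character-LSTM states (each LSTM_CHAR_OUTPUT_DIM/2 wide).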
476 | //cerr<<"fw and bw done"<<"\n"; 477 | } 478 | 479 | } 480 | /**************************************************/ 481 | //cerr<<"concatenate?"<<"\n"; 482 | 483 | /***************NO SPELLING*************************************/ 484 | 485 | // Expression w = lookup(*hg, p_w, sent[i]); 486 | else { //NO SPELLING 487 | //Don't use SPELLING 488 | //std::cout<<"don't use spelling"<<"\n"; 489 | w=lookup(*hg, p_w, sent[i]); 490 | } 491 | 492 | Expression i_i; 493 | if (USE_POS) { 494 | Expression p = lookup(*hg, p_p, sentPos[i]); 495 | i_i = affine_transform({ib, w2l, w, p2l, p}); 496 | } else { 497 | i_i = affine_transform({ib, w2l, w}); 498 | } 499 | if (p_t.p && pretrained.count(raw_sent[i])) { 500 | Expression t = const_lookup(*hg, p_t, raw_sent[i]); 501 | i_i = affine_transform({i_i, t2l, t}); 502 | } 503 | buffer[sent.size() - i] = rectify(i_i); 504 | bufferi[sent.size() - i] = i; 505 | } 506 | // dummy symbol to represent the empty buffer 507 | buffer[0] = parameter(*hg, p_buffer_guard); 508 | bufferi[0] = -999; 509 | for (auto& b : buffer) 510 | buffer_lstm.add_input(b); 511 | 512 | vector stack; // variables representing subtree embeddings 513 | vector stacki; // position of words in the sentence of head of subtree 514 | stack.push_back(parameter(*hg, p_stack_guard)); 515 | stacki.push_back(-999); // not used for anything 516 | 517 | vector output; // variables representing subtree embeddings 518 | vector outputi; 519 | output.push_back(parameter(*hg, p_output_guard)); 520 | outputi.push_back(-999); // not used for anything 521 | // drive dummy symbol on stack through LSTM 522 | stack_lstm.add_input(stack.back()); 523 | output_lstm.add_input(output.back()); 524 | vector log_probs; 525 | string rootword; 526 | unsigned action_count = 0; // incremented at each prediction 527 | while(buffer.size() > 1 || stack.size()>1) { 528 | 529 | // get list of possible actions for the current parser state 530 | vector current_valid_actions; 531 | for (auto a: possible_actions) { 532 | if (IsActionForbidden(setOfActions[a], buffer.size(), stack.size(), stacki)) 533 | continue; 534 | current_valid_actions.push_back(a); 535 | } 536 | 537 | // p_t = pbias + S * slstm + B * blstm + A * almst 538 | Expression p_t = affine_transform({pbias, O, output_lstm.back(), S, stack_lstm.back(), B, buffer_lstm.back(), A, action_lstm.back()}); 539 | Expression nlp_t = rectify(p_t); 540 | // r_t = abias + p2a * nlp 541 | Expression r_t = affine_transform({abias, p2a, nlp_t}); 542 | 543 | // adist = log_softmax(r_t, current_valid_actions) 544 | Expression adiste = log_softmax(r_t, current_valid_actions); 545 | vector adist = as_vector(hg->incremental_forward(adiste)); 546 | double best_score = adist[current_valid_actions[0]]; 547 | unsigned best_a = current_valid_actions[0]; 548 | for (unsigned i = 1; i < current_valid_actions.size(); ++i) { 549 | if (adist[current_valid_actions[i]] > best_score) { 550 | best_score = adist[current_valid_actions[i]]; 551 | best_a = current_valid_actions[i]; 552 | } 553 | } 554 | unsigned action = best_a; 555 | if (build_training_graph) { // if we have reference actions (for training) use the reference action 556 | action = correct_actions[action_count]; 557 | if (best_a == action) { (*right)++; } 558 | } 559 | ++action_count; 560 | // action_log_prob = pick(adist, action) 561 | log_probs.push_back(pick(adiste, action)); 562 | results.push_back(action); 563 | //std::cout<<"action:"< 1); // dummy symbol means > 1 (not >= 1) 583 | stack.push_back(buffer.back()); 584 | 
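// mirror the SHIFT on the stack LSTM so its state tracks the new stack top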
stack_lstm.add_input(buffer.back()); 585 | stacki.push_back(bufferi.back()); 586 | buffer.pop_back(); 587 | buffer_lstm.rewind_one_step(); 588 | bufferi.pop_back(); 589 | } 590 | else if (ac=='R') { // REDUCE 591 | Expression previous; 592 | Expression comp; 593 | vector entities(stacki.size()); 594 | ent_lstm_fwd.start_new_sequence(); 595 | ent_lstm_rev.start_new_sequence(); 596 | for (unsigned i = 0; i < stacki.size(); ++i) { 597 | ent_lstm_fwd.add_input(stack[i]); 598 | ent_lstm_rev.add_input(stack[stacki.size() - i - 1]); 599 | } 600 | while(stacki.size()>1) { 601 | outputi.push_back(stacki.back()); 602 | stack_lstm.rewind_one_step(); 603 | stack.pop_back(); 604 | stacki.pop_back(); 605 | //COMPOSITION FUNCTION!! ?? 606 | //Expression composed = affine_transform({cbias, H, head, D, dep, R, relation}); 607 | //Expression nlcomposed = tanh(composed); 608 | } 609 | Expression efwd = ent_lstm_fwd.back(); 610 | Expression erev = ent_lstm_rev.back(); 611 | if (apply_dropout) { 612 | efwd = dropout(efwd, DROPOUT); 613 | erev = dropout(erev, DROPOUT); 614 | } 615 | Expression c = concatenate({efwd, erev}); 616 | //Expression c = concatenate({ent_lstm_fwd.back(), ent_lstm_rev.back()}); 617 | Expression composed = rectify(affine_transform({cbias, cW, c, R, relation})); 618 | output.push_back(composed); 619 | output_lstm.add_input(composed); 620 | 621 | } 622 | else if (ac =='O') { 623 | assert(bufferi.size() > 1); // dummy symbol means > 1 (not >= 1) 624 | outputi.push_back(bufferi.back()); 625 | output.push_back(buffer.back()); 626 | output_lstm.add_input(buffer.back()); 627 | buffer.pop_back(); 628 | bufferi.pop_back(); 629 | buffer_lstm.rewind_one_step(); 630 | 631 | } 632 | 633 | } 634 | assert(stack.size() == 1); // guard symbol, root 635 | assert(stacki.size() == 1); 636 | assert(buffer.size() == 1); // guard symbol 637 | assert(bufferi.size() == 1); 638 | Expression tot_neglogprob = -sum(log_probs); 639 | assert(tot_neglogprob.pg != nullptr); 640 | return std::make_pair(results, tot_neglogprob); 641 | } 642 | 643 | }; 644 | 645 | void signal_callback_handler(int /* signum */) { 646 | if (requested_stop) { 647 | cerr << "\nReceived SIGINT again, quitting.\n"; 648 | _exit(1); 649 | } 650 | cerr << "\nReceived SIGINT terminating optimization early...\n"; 651 | requested_stop = true; 652 | } 653 | 654 | unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { 655 | unsigned res = 0; 656 | for (unsigned i = 0; i < len; ++i) { 657 | auto ri = ref.find(i); 658 | auto hi = hyp.find(i); 659 | assert(ri != ref.end()); 660 | assert(hi != hyp.end()); 661 | if (ri->second.compare(hi->second)==0) ++res; 662 | } 663 | return res; 664 | } 665 | 666 | void output_conll(const vector& sentence, const vector& pos, 667 | const vector& sentenceUnkStrings, 668 | const map& intToWords, 669 | const map& intToPos, 670 | const map& rel_ref, 671 | const map& rel_hyp) { 672 | for (unsigned i = 0; i < (sentence.size()); ++i) { 673 | // auto index = i + 1; 674 | assert(i < sentenceUnkStrings.size() && 675 | ((sentence[i] == corpus.get_or_add_word(cpyp::Corpus::UNK) && 676 | sentenceUnkStrings[i].size() > 0) || 677 | (sentence[i] != corpus.get_or_add_word(cpyp::Corpus::UNK) && 678 | sentenceUnkStrings[i].size() == 0 && 679 | intToWords.find(sentence[i]) != intToWords.end()))); 680 | string wit = (sentenceUnkStrings[i].size() > 0)? 
681 | sentenceUnkStrings[i] : intToWords.find(sentence[i])->second; 682 | auto pit = intToPos.find(pos[i]); 683 | //assert(hyp.find(i) != hyp.end()); 684 | //auto hyp_head = hyp.find(i)->second + 1; 685 | //if (hyp_head == (int)sentence.size()) hyp_head = 0; 686 | auto hyp_rel_it = rel_hyp.find(i); 687 | auto ref_rel_it = rel_ref.find(i); 688 | assert(hyp_rel_it != rel_hyp.end()); 689 | auto hyp_rel = hyp_rel_it->second; 690 | auto ref_rel = ref_rel_it->second; 691 | size_t first_char_in_rel = hyp_rel.find('(') + 1; 692 | size_t last_char_in_rel = hyp_rel.rfind(')') - 1; 693 | size_t first_char_in_ref = ref_rel.find('(') + 1; 694 | size_t last_char_in_ref = ref_rel.rfind(')') - 1; 695 | 696 | hyp_rel = hyp_rel.substr(first_char_in_rel, last_char_in_rel - first_char_in_rel + 1); 697 | ref_rel = ref_rel.substr(first_char_in_ref, last_char_in_ref - first_char_in_ref + 1); 698 | if (hyp_rel.compare("0")!=0){ 699 | hyp_rel="I-"+hyp_rel; 700 | } 701 | else hyp_rel="O"; 702 | if (ref_rel.compare("0")!=0){ 703 | ref_rel="I-"+ref_rel; 704 | } 705 | else ref_rel="O"; 706 | 707 | 708 | //cout << index << '\t' // 1. ID 709 | cout << wit << ' ' // 2. FORM 710 | // << "_" << '\t' // 3. LEMMA 711 | // << "_" << '\t' // 4. CPOSTAG 712 | << pit->second << ' ' // 5. POSTAG 713 | << "_" << ' ' // 6. tree. _ empty? 714 | // << hyp_head << '\t' // 7. HEAD 715 | << ref_rel << ' ' // 8. DEPREL 716 | << hyp_rel << endl; // 8. DEPREL 717 | // << "_" << '\t' // 9. PHEAD 718 | // << "_" << endl; // 10. PDEPREL 719 | } 720 | cout << endl; 721 | } 722 | 723 | int main(int argc, char** argv) { 724 | dynet::initialize(argc, argv); 725 | 726 | cerr << "COMMAND:"; 727 | for (unsigned i = 0; i < static_cast(argc); ++i) cerr << ' ' << argv[i]; 728 | cerr << endl; 729 | unsigned status_every_i_iterations = 100; 730 | 731 | po::variables_map conf; 732 | InitCommandLine(argc, argv, &conf); 733 | USE_POS = conf.count("use_pos_tags"); 734 | if (conf.count("dropout")) 735 | DROPOUT = conf["dropout"].as(); 736 | 737 | USE_SPELLING=conf.count("use_spelling"); //Miguel 738 | corpus.USE_SPELLING=USE_SPELLING; 739 | 740 | LAYERS = conf["layers"].as(); 741 | INPUT_DIM = conf["input_dim"].as(); 742 | PRETRAINED_DIM = conf["pretrained_dim"].as(); 743 | HIDDEN_DIM = conf["hidden_dim"].as(); 744 | ACTION_DIM = conf["action_dim"].as(); 745 | LSTM_INPUT_DIM = conf["lstm_input_dim"].as(); 746 | POS_DIM = conf["pos_dim"].as(); 747 | REL_DIM = conf["rel_dim"].as(); 748 | // const unsigned beam_size = conf["beam_size"].as(); 749 | const unsigned unk_strategy = conf["unk_strategy"].as(); 750 | cerr << "Unknown word strategy: "; 751 | if (unk_strategy == 1) { 752 | cerr << "STOCHASTIC REPLACEMENT\n"; 753 | } else { 754 | abort(); 755 | } 756 | const double unk_prob = conf["unk_prob"].as(); 757 | assert(unk_prob >= 0.); assert(unk_prob <= 1.); 758 | ostringstream os; 759 | os << "parser_" << (USE_POS ? 
"pos" : "nopos") 760 | << '_' << LAYERS 761 | << '_' << INPUT_DIM 762 | << '_' << HIDDEN_DIM 763 | << '_' << ACTION_DIM 764 | << '_' << LSTM_INPUT_DIM 765 | << '_' << POS_DIM 766 | << '_' << REL_DIM 767 | << "-pid" << getpid() << ".params"; 768 | // int best_correct_heads = 0; 769 | double best_f1_score=-1.0; 770 | const string fname = os.str(); 771 | cerr << "Writing parameters to file: " << fname << endl; 772 | bool softlinkCreated = false; 773 | corpus.load_correct_actions(conf["training_data"].as()); 774 | const unsigned kUNK = corpus.get_or_add_word(cpyp::Corpus::UNK); 775 | kROOT_SYMBOL = corpus.get_or_add_word(ROOT_SYMBOL); 776 | 777 | if (conf.count("words")) { 778 | pretrained[kUNK] = vector(PRETRAINED_DIM, 0); 779 | cerr << "Loading from " << conf["words"].as() << " with" << PRETRAINED_DIM << " dimensions\n"; 780 | ifstream in(conf["words"].as().c_str()); 781 | string line; 782 | getline(in, line); 783 | vector v(PRETRAINED_DIM, 0); 784 | string word; 785 | while (getline(in, line)) { 786 | istringstream lin(line); 787 | lin >> word; 788 | for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; 789 | unsigned id = corpus.get_or_add_word(word); 790 | pretrained[id] = v; 791 | } 792 | } 793 | 794 | set training_vocab; // words available in the training corpus 795 | set singletons; 796 | { // compute the singletons in the parser's training data 797 | map counts; 798 | for (auto sent : corpus.sentences) 799 | for (auto word : sent.second) { training_vocab.insert(word); counts[word]++; } 800 | for (auto wc : counts) 801 | if (wc.second == 1) singletons.insert(wc.first); 802 | } 803 | 804 | // correct ner label set (not action) 805 | set label; 806 | map label2id; 807 | for (unsigned i = 0; i < corpus.actions.size(); ++i) { 808 | if (corpus.actions[i][0] == 'R') { 809 | label2id[corpus.actions[i]] = label.size(); 810 | 811 | cerr << corpus.actions[i] << " :: "<< label.size() << endl; 812 | label.insert(corpus.actions[i]); 813 | } 814 | } 815 | 816 | 817 | cerr << "Number of words: " << corpus.nwords << endl; 818 | VOCAB_SIZE = corpus.nwords + 1; 819 | 820 | cerr << "Number of UTF8 chars: " << corpus.maxChars << endl; 821 | if (corpus.maxChars>255) CHAR_SIZE=corpus.maxChars; 822 | 823 | ACTION_SIZE = corpus.nactions + 1; 824 | //POS_SIZE = corpus.npos + 1; 825 | POS_SIZE = corpus.npos + 10; 826 | possible_actions.resize(corpus.nactions); 827 | for (unsigned i = 0; i < corpus.nactions; ++i) 828 | possible_actions[i] = i; 829 | 830 | ParameterCollection model; 831 | ParserBuilder parser(model, pretrained); 832 | if (conf.count("model")) { 833 | TextFileLoader loader(conf["model"].as()); 834 | loader.populate(model); 835 | } 836 | 837 | // OOV words will be replaced by UNK tokens 838 | corpus.load_correct_actionsDev(conf["dev_data"].as()); 839 | if (USE_SPELLING) VOCAB_SIZE = corpus.nwords + 1; 840 | //TRAINING 841 | if (conf.count("train")) { 842 | signal(SIGINT, signal_callback_handler); 843 | SimpleSGDTrainer sgd(model); 844 | //MomentumSGDTrainer sgd(&model); 845 | float eta_decay = 0.08; 846 | //sgd.eta_decay = 0.05; 847 | cerr << "Training started."<<"\n"; 848 | vector order(corpus.nsentences); 849 | for (unsigned i = 0; i < corpus.nsentences; ++i) 850 | order[i] = i; 851 | double tot_seen = 0; 852 | status_every_i_iterations = min(status_every_i_iterations, corpus.nsentences); 853 | unsigned si = corpus.nsentences; 854 | cerr << "NUMBER OF TRAINING SENTENCES: " << corpus.nsentences << endl; 855 | unsigned trs = 0; 856 | double right = 0; 857 | double llh = 0; 858 | bool first = 
    bool first = true;
    int iter = -1;
    while (!requested_stop) {
      ++iter;
      for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) {
        if (si == corpus.nsentences) {
          si = 0;
          if (first) { first = false; } else { sgd.learning_rate *= 1 - eta_decay; }
          cerr << "**SHUFFLE\n";
          random_shuffle(order.begin(), order.end());
        }
        tot_seen += 1;
        const vector<unsigned>& sentence = corpus.sentences[order[si]];
        vector<unsigned> tsentence = sentence;
        if (unk_strategy == 1) {
          for (auto& w : tsentence)
            if (singletons.count(w) && dynet::rand01() < unk_prob) w = kUNK;
        }
        const vector<unsigned>& sentencePos = corpus.sentencesPos[order[si]];
        const vector<unsigned>& actions = corpus.correct_act_sent[order[si]];
        ComputationGraph hg;
        auto pred_loss = parser.log_prob_parser(&hg, sentence, tsentence, sentencePos, actions, corpus.actions, corpus.intToWords, false, &right);
        double lp = as_scalar(hg.incremental_forward(pred_loss.second));
        if (lp < 0) {
          cerr << "Log prob < 0 on sentence " << order[si] << ": lp=" << lp << endl;
          assert(lp >= 0.0);
        }
        hg.backward(pred_loss.second);
        sgd.update();
        llh += lp;
        ++si;
        trs += actions.size();
      }
      sgd.status();
      cerr << "update #" << iter << " (epoch " << (tot_seen / corpus.nsentences) << ")\tllh: " << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << endl;
      llh = trs = right = 0;

      static int logc = 0;
      ++logc;
      if (logc % 25 == 1) { // report on dev set
        unsigned dev_size = corpus.nsentencesDev;
        // dev_size = 100;
        double llh = 0;
        double trs = 0;
        double right = 0;
        double correct_heads = 0;
        double total_heads = 0;

        // TODO: make this robust to a variable number of labels
        int confusion[4][4] = {{0}};
        int class_tp[4] = {0};
        int class_fp[4] = {0};
        int class_fn[4] = {0};
        assert(label.size() <= 4); // guard: the fixed-size arrays above assume at most 4 labels

        auto t_start = std::chrono::high_resolution_clock::now();
        for (unsigned sii = 0; sii < dev_size; ++sii) {
          const vector<unsigned>& sentence = corpus.sentencesDev[sii];
          const vector<unsigned>& sentencePos = corpus.sentencesPosDev[sii];
          const vector<unsigned>& actions = corpus.correct_act_sentDev[sii];
          vector<unsigned> tsentence = sentence;
          if (!USE_SPELLING) {
            for (auto& w : tsentence)
              if (training_vocab.count(w) == 0) w = kUNK;
          }

          ComputationGraph hg;
          auto pred_loss = parser.log_prob_parser(&hg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
          vector<unsigned>& pred = pred_loss.first;
          double lp = 0;
          //vector<unsigned> pred = parser.log_prob_parser_beam(&hg, sentence, sentencePos, corpus.actions, beam_size, &lp);
          llh -= lp;
          trs += actions.size();
          map<int, string> ref = parser.compute_ents(sentence.size(), actions, corpus.actions);
          map<int, string> hyp = parser.compute_ents(sentence.size(), pred, corpus.actions);

          // update the confusion matrix (rows = predicted, columns = reference);
          // note: a predicted "0" (no entity) maps to id 0 here, conflating it with the first label
          for (unsigned i = 0; i < sentence.size() - 1; ++i) {
            auto ri = ref.find(i);
            auto hi = hyp.find(i);
            assert(ri != ref.end());
            assert(hi != hyp.end());

            if (ri->second != "0") {
              int pr = label2id[hi->second];
              int tr = label2id[ri->second];
              confusion[pr][tr]++;
            }
          }

          correct_heads += compute_correct(ref, hyp, sentence.size() - 1);
          total_heads += sentence.size() - 1;
        }

        // compute tp, fp and fn for each class
        cerr << "confusion matrix" << endl;
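        // NOTE: confusion[p][t] counts tokens predicted as class p whose true
        // class is t, so the diagonal holds the true positives, off-diagonal
        // row sums give false positives, and off-diagonal column sums give
        // false negatives, which is exactly how class_tp/fp/fn are filled in
        // the loop below.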
        for (unsigned i = 0; i < label.size(); ++i) {
          for (unsigned j = 0; j < label.size(); ++j) {
            cerr << i << " " << j << " " << confusion[i][j] << endl;
            if (i == j) {
              class_tp[i] = confusion[i][j];
            } else {
              class_fp[i] += confusion[i][j];
              class_fn[i] += confusion[j][i];
            }
          }
        }

        // compute precision, recall and F1, guarding against division by zero
        double global_f1_score = 0;
        for (unsigned i = 0; i < label.size(); ++i) {
          double precision = (class_tp[i] + class_fp[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fp[i]) : 0.0;
          double recall = (class_tp[i] + class_fn[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fn[i]) : 0.0;
          double f1_score = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
          global_f1_score += f1_score / label.size(); // macro-averaged F1

          cerr << i << " class Precision " << precision << " Recall " << recall << " F1 " << f1_score << endl;
        }
        cerr << "F1 >> " << global_f1_score << endl;

        auto t_end = std::chrono::high_resolution_clock::now();
        cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " f1: " << global_f1_score << "\t[" << dev_size << " sents in " << std::chrono::duration<double, std::milli>(t_end - t_start).count() << " ms]" << endl;
        if (global_f1_score > best_f1_score) {
          best_f1_score = global_f1_score;
          TextFileSaver saver(fname);
          saver.save(model);
          // Create a soft link to the most recent model in order to make it
          // easier to refer to it in a shell script.
          if (!softlinkCreated) {
            string softlink = " latest_model";
            if (system((string("rm -f ") + softlink).c_str()) == 0 &&
                system((string("ln -s ") + fname + softlink).c_str()) == 0) {
              cerr << "Created " << softlink << " as a soft link to " << fname
                   << " for convenience." << endl;
            }
            softlinkCreated = true;
          }
        }
      }
    }
  } // end of training
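  // NOTE (sketch): the per-class scores above follow the usual definitions,
  //   precision_i = tp_i / (tp_i + fp_i),  recall_i = tp_i / (tp_i + fn_i),
  //   F1_i = 2 * precision_i * recall_i / (precision_i + recall_i),
  // and the reported "F1 >>" figure is the macro average over the label set,
  //   F1 = (1 / |labels|) * sum_i F1_i.
  // The evaluation block below repeats the same computation while also
  // emitting CoNLL-format output.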

  if (true) { // do test evaluation
    double llh = 0;
    double trs = 0;
    double right = 0;
    double correct_heads = 0;
    double total_heads = 0;
    // double f1score = 0.0;

    // TODO: make this robust to a variable number of labels
    int confusion[4][4] = {{0}};
    int class_tp[4] = {0};
    int class_fp[4] = {0};
    int class_fn[4] = {0};

    auto t_start = std::chrono::high_resolution_clock::now();
    unsigned corpus_size = corpus.nsentencesDev;
    for (unsigned sii = 0; sii < corpus_size; ++sii) {
      const vector<unsigned>& sentence = corpus.sentencesDev[sii];
      const vector<unsigned>& sentencePos = corpus.sentencesPosDev[sii];
      const vector<string>& sentenceUnkStr = corpus.sentencesStrDev[sii];
      const vector<unsigned>& actions = corpus.correct_act_sentDev[sii];
      vector<unsigned> tsentence = sentence;
      if (!USE_SPELLING) {
        for (auto& w : tsentence)
          if (training_vocab.count(w) == 0) w = kUNK;
      }
      ComputationGraph cg;
      double lp = 0;
      auto pred_loss = parser.log_prob_parser(&cg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
      vector<unsigned>& pred = pred_loss.first;
      // if (beam_size == 1)
      //   pred = parser.log_prob_parser(&cg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
      // else
      //   pred = parser.log_prob_parser_beam(&cg, sentence, tsentence, sentencePos, corpus.actions, beam_size, &lp);
      llh -= lp;
      trs += actions.size();
      map<int, string> rel_ref, rel_hyp;
      map<int, string> ref = parser.compute_ents(sentence.size(), actions, corpus.actions, &rel_ref);
      map<int, string> hyp = parser.compute_ents(sentence.size(), pred, corpus.actions, &rel_hyp);
      output_conll(sentence, sentencePos, sentenceUnkStr, corpus.intToWords, corpus.intToPos, ref, hyp);

      // update the confusion matrix (rows = predicted, columns = reference)
      for (unsigned i = 0; i < sentence.size() - 1; ++i) {
        auto ri = ref.find(i);
        auto hi = hyp.find(i);
        assert(ri != ref.end());
        assert(hi != hyp.end());

        if (ri->second != "0") {
          int pr = label2id[hi->second];
          int tr = label2id[ri->second];
          confusion[pr][tr]++;
        }
      }

      correct_heads += compute_correct(ref, hyp, sentence.size() - 1);
      total_heads += sentence.size() - 1;
    }

    // compute tp, fp and fn for each class
    cerr << "confusion matrix" << endl;
    for (unsigned i = 0; i < label.size(); ++i) {
      for (unsigned j = 0; j < label.size(); ++j) {
        cerr << i << " " << j << " " << confusion[i][j] << endl;
        if (i == j) {
          class_tp[i] = confusion[i][j];
        } else {
          class_fp[i] += confusion[i][j];
          class_fn[i] += confusion[j][i];
        }
      }
    }

    // compute precision, recall and F1, guarding against division by zero
    double global_f1_score = 0;
    for (unsigned i = 0; i < label.size(); ++i) {
      double precision = (class_tp[i] + class_fp[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fp[i]) : 0.0;
      double recall = (class_tp[i] + class_fn[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fn[i]) : 0.0;
      double f1_score = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
      global_f1_score += f1_score / label.size(); // macro-averaged F1

      cerr << i << " class Precision " << precision << " Recall " << recall << " F1 " << f1_score << endl;
    }
    cerr << "F1 >> " << global_f1_score << endl;

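    // NOTE: in this greedy-decoding path lp is never populated (only the
    // commented-out beam decoder sets it), so llh stays 0 and the llh/ppl
    // fields reported below are uninformative; the macro F1 is the metric
    // to watch here.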
    auto t_end = std::chrono::high_resolution_clock::now();
    cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " f1: " << global_f1_score << "\t[" << corpus_size << " sents in " << std::chrono::duration<double, std::milli>(t_end - t_start).count() << " ms]" << endl;
  }
  for (unsigned i = 0; i < corpus.actions.size(); ++i) {
    //cerr << corpus.actions[i] << '\t' << parser.p_r->values[i].transpose() << endl;
    //cerr << corpus.actions[i] << '\t' << parser.p_p2a->values.col(i).transpose() << endl;
  }
}
--------------------------------------------------------------------------------