├── .gitmodules
├── ner-system
│   ├── CMakeLists.txt
│   ├── attach_prediction.py
│   ├── convert-conll2trans.pl
│   ├── conll2parser.py
│   ├── example.conll-2003
│   ├── README.md
│   ├── c2.h
│   └── lstm-parse.cc
├── .gitignore
├── CMakeLists.txt
├── INSTALL.md
├── cmake
│   └── FindEigen3.cmake
└── README.md

/.gitmodules:
--------------------------------------------------------------------------------
[submodule "dynet"]
	path = dynet
	url = https://github.com/clab/dynet.git
	branch = master
--------------------------------------------------------------------------------
/ner-system/CMakeLists.txt:
--------------------------------------------------------------------------------
PROJECT(dynet:parser)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)

ADD_EXECUTABLE(lstm-parse lstm-parse.cc)
target_link_libraries(lstm-parse dynet ${Boost_LIBRARIES})

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app
--------------------------------------------------------------------------------
/ner-system/attach_prediction.py:
--------------------------------------------------------------------------------
import sys
import argparse

parser = argparse.ArgumentParser(description='Give Me Conll data.')
parser.add_argument('-p', type=str, help='prediction file')
parser.add_argument('-t', type=str, help='test file')
parser.add_argument('-o', type=str, help='output destination')
args = parser.parse_args()

f = open(args.o,'w')

for test_line, pred_line in zip(open(args.t), open(args.p)):
    test_line = test_line.strip().split()
    pred_line = pred_line.strip().split()

    if len(test_line) > 0:
        #assert test_line[0] == pred_line[0], "your prediction is not aligned to test file"
        test_line.append(pred_line[-1])
        f.write('{}\n'.format(" ".join(test_line)))
    else:
        f.write("\n")
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
project(dynet)
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
set(CMAKE_CXX_FLAGS "-Wall -std=c++11 -O3 -g")

enable_testing()

#include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dynet)
set(WITH_EIGEN_BACKEND 1)

# look for Boost
if(DEFINED ENV{BOOST_ROOT})
  set(Boost_NO_SYSTEM_PATHS ON)
endif()
set(Boost_REALPATH ON)
find_package(Boost COMPONENTS program_options serialization REQUIRED)
include_directories(${Boost_INCLUDE_DIR})
set(LIBS ${LIBS} ${Boost_LIBRARIES})

# look for Eigen
find_package(Eigen3 REQUIRED)
include_directories(${EIGEN3_INCLUDE_DIR})

#configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)

add_subdirectory(dynet/dynet)
# add_subdirectory(dynet/examples)
add_subdirectory(ner-system)

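# A typical out-of-source configure and build against this file (a sketch of
# the steps documented in INSTALL.md and README.md; the Eigen path below is an
# example, not a requirement):
#   export BOOST_ROOT=/path/to/boost   # only if Boost lives in a non-standard prefix
#   mkdir build && cd build
#   cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3
#   make -j2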
--------------------------------------------------------------------------------
/ner-system/convert-conll2trans.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;

my @toks;
my @tags;

while(<>) {
  chomp;
  if (/^\s*$/) {
    print "@toks ||| ";
    my $len = scalar @toks;
    my $i = 0;
    while ($i < $len) {
      if ($tags[$i] eq 'O') {
        print "OUT ";
        $i++;
      } elsif ($tags[$i] =~ /^(B|I)-(.+)$/) {
        my $tt = $2;
        my $x = "I-$tt";
        my $j = $i + 1;
        while ($j < $len && $tags[$j] eq $x) { $j++; }
        my @span = ();
        for (my $k = $i; $k < $j; $k++) {
          print "SHIFT ";
        }
        print "REDUCE($tt) ";
        $i = $j;
      } else {
        die "Bad input: $_\n";
      }
    }
    @toks = ();
    @tags = ();
    print "\n";
  } else {
    my @fields = split /\s+/;
    push @toks, "$fields[0]-$fields[1]";
    push @tags, $fields[-1];
  }
}

--------------------------------------------------------------------------------
/ner-system/conll2parser.py:
--------------------------------------------------------------------------------
import sys
import argparse

parser = argparse.ArgumentParser(description='Give Me Conll data.')
parser.add_argument('-f', type=str, help='conll file')
args = parser.parse_args()

for line in open(args.f):
    sentence, actions = map(lambda x: x.strip().split(), line.strip().split('|||'))
    stack_state, buffer_state = [], sentence
    #print sentence, actions

    sys.stdout.write('\n')
    for action in actions:
        sys.stdout.write('{}{}\n'.format(str(stack_state), str(buffer_state)))
        if action[0] == 'O':
            assert len(stack_state) == 0
            buffer_state = buffer_state[1:]
        elif action[0] == 'S':
            stack_state.append(buffer_state[0])
            buffer_state = buffer_state[1:]
        elif action[0] == 'R':
            stack_state = []
        sys.stdout.write('{}\n'.format(action))
    assert len(stack_state) == 0 and len(buffer_state) == 0
    sys.stdout.write('{}{}\n'.format(str(stack_state), str(buffer_state)))

--------------------------------------------------------------------------------
/ner-system/example.conll-2003:
--------------------------------------------------------------------------------
John NNP I-NP I-PER
Smith NNP N-NP I-PER
went VBD I-VP O
to TO I-PP O
Pittsburgh NNP I-NP I-LOC
. . O O

Refusing VBG I-VP O
to TO I-VP O
go VB I-VP O
quietly RB I-ADVP O
in IN I-PP O
the DT I-NP O
night NN I-NP O
, , O O
Stefan NNP I-NP I-PER
Edberg NNP I-NP I-PER
extended VBD I-VP O
his PRP$ I-NP O
stay NN I-NP O
at IN I-PP O
his PRP$ I-NP O
14th JJ I-NP O
and CC I-NP O
last JJ I-NP O
U.S. NNP I-NP I-MISC
Open NNP I-NP I-MISC
when WRB I-ADVP O
Bernd NNP I-NP I-PER
Karbacher NNP I-NP I-PER
, , O O
trailing VBG I-VP O
and CC O O
hurting VBG I-VP O
, , O O
quit VB I-VP O
in IN I-PP O
the DT I-NP O
fourth JJ I-NP O
set NN I-NP O
of IN I-PP O
their PRP$ I-NP O
second-round JJ I-NP O
match NN I-NP O
Friday NNP B-NP O
. . O O

AUGUST RB I-NP O
1996 CD I-NP O
CDU NNP I-NP I-ORG
/ SYM O I-ORG
CSU NNP I-NP I-ORG
SPD NNP I-NP B-ORG
FDP NNP I-NP B-ORG
Greens NNP I-NP B-ORG
PDS NNP I-NP B-ORG

--------------------------------------------------------------------------------
/ner-system/README.md:
--------------------------------------------------------------------------------
## Example

#### Desired labeling

    John Smith went to Pittsburgh .
    PER----- O O LOC O

Corresponding sequence of operations (generated by `convert-conll2trans.pl`)

    SHIFT
    SHIFT
    REDUCE(PER)
    OUT
    OUT
    SHIFT
    REDUCE(LOC)
    OUT

#### Data structures

* **buffer** - sequence of tokens, read from left to right
* **stack** - working memory
* **output buffer** - sequence of labeled segments constructed from left to right

#### Operations

* `SHIFT` - move word from the **buffer** to the top of the **stack**
* `REDUCE(X)` - all words on the **stack** are popped, combined into a single segment, labeled with `X`, and copied to the **output buffer**
* `OUT` - move one token from the **buffer** to the **output buffer**

#### Dataset & Preprocessing

Datasets are in /usr0/home/kkawakam/conll2003

Convert the CoNLL format to NER transition actions (`convert-conll2trans.pl`), then convert those into the parser-friendly format (`conll2parser.py`):

```bash
perl convert-conll2trans.pl conll2003/train > conll2003/train.trans
python conll2parser.py -f conll2003/train.trans > conll2003/train.parser
```

#### Training

    ./lstm-parse -T /usr0/home/kkawakam/conll2003/train.parser -d /usr0/home/kkawakam/conll2003/dev.parser --hidden_dim 100 --lstm_input_dim 100 -w /usr3/home/lingwang/chris/sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -t -S -D 0.3 > logNERYesCharNoPosYesEmbeddingsD0.3.txt &

#### Decoding

    ./lstm-parse -T /usr0/home/kkawakam/conll2003/train.parser -d /usr0/home/kkawakam/conll2003/test.parser --hidden_dim 100 --lstm_input_dim 100 -w /usr3/home/lingwang/chris/sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -m latest_model -S > output.txt
    python attach_prediction.py -p output.txt -t /usr0/home/kkawakam/conll2003/test -o evaloutput.txt

#### Evaluation

Attach your predictions to the test file:

```bash
python attach_prediction.py -p (prediction) -t /path/to/conll2003/test -o (output file)
./conlleval < (output file)
```
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
Minimum Requirements
======================
1. CMake 2.8.7
2. [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page)
3. `dynet` (does not have to be installed separately; it is installed below)

Installation Steps
======================
These steps have been tested on Ubuntu 16.04 and macOS Sierra.

1. Once this repository has been cloned, the `dynet/` submodule needs to be synced:

```bash
git submodule init
git submodule update
```
2. This downloads the required files into the `dynet` directory. Let this directory be `PATH_TO_DYNET`.

```bash
PATH_TO_DYNET=/dynet/
```
3. 
Download the C++ library `eigen`, which is used by dynet:

```bash
cd $HOME
hg clone https://bitbucket.org/eigen/eigen/
cd eigen
```
**Note:** There were compilation issues with some versions of `eigen`. This installation has been successful with `Eigen v3.3.1`.
4. Now, create a `build` directory and install eigen:

```bash
mkdir build
cd build
cmake ..
```
5. Run `sudo make install`. This will push the library files to the local `include` directory. On Ubuntu 16.04 and macOS Sierra, they are copied to `/usr/local/include/eigen3`.
6. Go back to the `dynet` directory in `stack-lstm-ner` and build `dynet`. Modify the code below with your `eigen3` `include` location and Boost location.

```bash
cd $PATH_TO_DYNET
mkdir build
cd build
cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3
make -j 2
```
**Note:** If DyNet fails to compile and throws an error like this:
```bash
$ make -j 2
Scanning dependencies of target dynet
Scanning dependencies of target dynet_shared
[ 1%] [ 2%] Building CXX object dynet/CMakeFiles/dynet.dir/cfsm-builder.cc.o
Building CXX object dynet/CMakeFiles/dynet_shared.dir/cfsm-builder.cc.o
In file included from /home/user/dynet/dynet/dynet.h:13:0,
                 from /home/user/dynet/dynet/cfsm-builder.h:6,
                 from /home/user/dynet/dynet/cfsm-builder.cc:1:
/home/user/dynet/dynet/tensor.h:22:42: fatal error: unsupported/Eigen/CXX11/Tensor: No such file or directory
 #include <unsupported/Eigen/CXX11/Tensor>
                                          ^
compilation terminated.
```
Then, download and install a stable version of Eigen and rebuild DyNet:

```bash
cd $HOME
wget u.cs.biu.ac.il/~yogo/eigen.tgz
tar zxvf eigen.tgz
cd eigen
```
Repeat step 4 and run:

```bash
cd $PATH_TO_DYNET/build
rm -rf *
```
Now, rebuild DyNet again.
7. Go back to the `stack-lstm-ner` root directory, create a `build` directory there, and inside it run `cmake .. -DEIGEN3_INCLUDE_DIR=/usr/local/include/eigen3` followed by `make`. This will build `lstm-parse` in `ner-system`.

Debugging build errors
========================
If you want to see the compile commands that are used, you can run

```bash
make VERBOSE=1
```
--------------------------------------------------------------------------------
/cmake/FindEigen3.cmake:
--------------------------------------------------------------------------------
# - Try to find Eigen3 lib
#
# This module supports requiring a minimum version, e.g. you can do
#   find_package(Eigen3 3.1.2)
# to require version 3.1.2 or newer of Eigen3.
#
# Once done this will define
#
#  EIGEN3_FOUND - system has eigen lib with correct version
#  EIGEN3_INCLUDE_DIR - the eigen include directory
#  EIGEN3_VERSION - eigen version

# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
# Copyright (c) 2008, 2009 Gael Guennebaud, <g.gael@free.fr>
# Copyright (c) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
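# In this repository the module is picked up via CMAKE_MODULE_PATH, which the
# top-level CMakeLists.txt points at the cmake/ directory; a minimal use,
# assuming that setup, looks like:
#   find_package(Eigen3 REQUIRED)
#   include_directories(${EIGEN3_INCLUDE_DIR})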

if(NOT Eigen3_FIND_VERSION)
  if(NOT Eigen3_FIND_VERSION_MAJOR)
    set(Eigen3_FIND_VERSION_MAJOR 2)
  endif(NOT Eigen3_FIND_VERSION_MAJOR)
  if(NOT Eigen3_FIND_VERSION_MINOR)
    set(Eigen3_FIND_VERSION_MINOR 91)
  endif(NOT Eigen3_FIND_VERSION_MINOR)
  if(NOT Eigen3_FIND_VERSION_PATCH)
    set(Eigen3_FIND_VERSION_PATCH 0)
  endif(NOT Eigen3_FIND_VERSION_PATCH)

  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
endif(NOT Eigen3_FIND_VERSION)

macro(_eigen3_check_version)
  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)

  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")

  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
    set(EIGEN3_VERSION_OK FALSE)
  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
    set(EIGEN3_VERSION_OK TRUE)
  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})

  if(NOT EIGEN3_VERSION_OK)

    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
                   "but at least version ${Eigen3_FIND_VERSION} is required")
  endif(NOT EIGEN3_VERSION_OK)
endmacro(_eigen3_check_version)

if (EIGEN3_INCLUDE_DIR)

  # in cache already
  _eigen3_check_version()
  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})

else (EIGEN3_INCLUDE_DIR)

  find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
      PATHS
      ${CMAKE_INSTALL_PREFIX}/include
      ${KDE4_INCLUDE_DIR}
      PATH_SUFFIXES eigen3 eigen
    )

  if(EIGEN3_INCLUDE_DIR)
    _eigen3_check_version()
  endif(EIGEN3_INCLUDE_DIR)

  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)

  mark_as_advanced(EIGEN3_INCLUDE_DIR)

endif(EIGEN3_INCLUDE_DIR)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Transition-based NER system

This system is part of a paper accepted at the NAACL-HLT 2016 conference.
See the paper here: http://arxiv.org/pdf/1603.01360v1.pdf

#### Desired labeling

    John Smith went to Pittsburgh .
    PER----- O O LOC O

Corresponding sequence of operations (generated by `convert-conll2trans.pl`)

    SHIFT
    SHIFT
    REDUCE(PER)
    OUT
    OUT
    SHIFT
    REDUCE(LOC)
    OUT

#### Data structures

* **buffer** - sequence of tokens, read from left to right
* **stack** - working memory
* **output buffer** - sequence of labeled segments constructed from left to right

#### Operations

* `SHIFT` - move word from the **buffer** to the top of the **stack**
* `REDUCE(X)` - all words on the **stack** are popped, combined into a single segment, labeled with `X`, and copied to the **output buffer**
* `OUT` - move one token from the **buffer** to the **output buffer**

A toy Python sketch of this transition scheme is included as an appendix near the end of this README.

#### Dataset & Preprocessing

We use the CoNLL 2002 and CoNLL 2003 datasets.

Convert the CoNLL format to NER transition actions (`convert-conll2trans.pl`), then convert those into the parser-friendly format (`conll2parser.py`):

```bash
perl convert-conll2trans.pl conll2003/train > conll2003/train.trans
python conll2parser.py -f conll2003/train.trans > conll2003/train.parser
```

If the words in the oracle carry the ' symbol, strip it from the training/test/dev datasets as follows:
```
[]['Peter-NNP', 'Blackburn-NNP'] --> [][Peter-NNP, Blackburn-NNP]
```

Link to the word vectors that we used in the NAACL 2016 paper for English: [sskip.100.vectors](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing).


#### Build the system

The first time you clone the repository, you need to sync the `dynet/` submodule:
```
git submodule init
git submodule update

mkdir build
cd build
cmake .. -DEIGEN3_INCLUDE_DIR=/path/to/eigen
make -j2
```

#### Training

    ./lstm-parse -T conll2003/train.parser -d conll2003/dev.parser --hidden_dim 100 --lstm_input_dim 100 -w sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -t -S -D 0.3 > logNERYesCharNoPosYesEmbeddingsD0.3.txt &

#### Decoding

    ./lstm-parse -T conll2003/train.parser -d conll2003/test.parser --hidden_dim 100 --lstm_input_dim 100 -w sskip.100.vectors --pretrained_dim 100 --rel_dim 20 --action_dim 20 --input_dim 100 -m latest_model -S > output.txt
    python attach_prediction.py -p output.txt -t conll2003/test -o evaloutput.txt

#### Evaluation

Attach your predictions to the test file:

```bash
python attach_prediction.py -p (prediction) -t /path/to/conll2003/test -o (output file)
./conlleval < (output file)
```
#### Citation

If you make use of this software, please cite the following:

    @inproceedings{2016naacl,
      author={Guillaume Lample and Miguel Ballesteros and Kazuya Kawakami and Sandeep Subramanian and Chris Dyer},
      title={Neural Architectures for Named Entity Recognition},
      booktitle={Proc. NAACL-HLT},
      year=2016,
    }

#### License

This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
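
#### Appendix: transition scheme sketch

The mapping from BIO tags to the action sequence described under Operations is deterministic. The sketch below is ours rather than part of the toolkit: `bio_to_actions` is a hypothetical helper that mirrors the per-sentence logic of `convert-conll2trans.pl`, ignoring the word/POS pairing and the ` ||| ` separator that the real script emits.

```python
def bio_to_actions(tags):
    """Map one sentence's BIO tags to SHIFT/REDUCE(X)/OUT actions."""
    actions, i = [], 0
    while i < len(tags):
        if tags[i] == 'O':
            actions.append('OUT')                # token goes straight to the output buffer
            i += 1
        else:
            etype = tags[i].split('-', 1)[1]     # 'I-PER' or 'B-PER' -> 'PER'
            j = i + 1
            while j < len(tags) and tags[j] == 'I-' + etype:
                j += 1                           # consume the rest of the entity span
            actions.extend(['SHIFT'] * (j - i))  # push each entity word onto the stack
            actions.append('REDUCE({})'.format(etype))  # pop, label, move to output
            i = j
    return actions

print(bio_to_actions(['I-PER', 'I-PER', 'O', 'O', 'I-LOC', 'O']))
# ['SHIFT', 'SHIFT', 'REDUCE(PER)', 'OUT', 'OUT', 'SHIFT', 'REDUCE(LOC)', 'OUT']
```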
101 | 102 | #### Contact 103 | 104 | For questions and usage issues, please contact miguel.ballesteros@upf.edu 105 | 106 | -------------------------------------------------------------------------------- /ner-system/c2.h: -------------------------------------------------------------------------------- 1 | #ifndef CPYPDICT_H_ 2 | #define CPYPDICT_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace cpyp { 18 | 19 | class Corpus { 20 | //typedef std::unordered_map > Map; 21 | // typedef std::unordered_map > ReverseMap; 22 | public: 23 | bool USE_SPELLING=false; 24 | 25 | std::map> correct_act_sent; 26 | std::map> sentences; 27 | std::map> sentencesPos; 28 | 29 | std::map> correct_act_sentDev; 30 | std::map> sentencesDev; 31 | std::map> sentencesPosDev; 32 | std::map> sentencesStrDev; 33 | unsigned nsentencesDev; 34 | 35 | unsigned nsentences; 36 | unsigned nwords; 37 | unsigned nactions; 38 | unsigned npos; 39 | 40 | unsigned nsentencestest; 41 | unsigned nsentencesdev; 42 | int max; 43 | int maxPos; 44 | 45 | std::map wordsToInt; 46 | std::map intToWords; 47 | std::vector actions; 48 | 49 | std::map posToInt; 50 | std::map intToPos; 51 | 52 | int maxChars; 53 | std::map charsToInt; 54 | std::map intToChars; 55 | 56 | // String literals 57 | static constexpr const char* UNK = "UNK"; 58 | static constexpr const char* BAD0 = ""; 59 | 60 | /* std::map* headsTraining; 61 | std::map* labelsTraining; 62 | 63 | std::map* headsParsing; 64 | std::map* labelsParsing;*/ 65 | 66 | 67 | 68 | public: 69 | Corpus() { 70 | max = 0; 71 | maxPos = 0; 72 | maxChars=0; //Miguel 73 | } 74 | 75 | 76 | inline unsigned UTF8Len(unsigned char x) { 77 | if (x < 0x80) return 1; 78 | else if ((x >> 5) == 0x06) return 2; 79 | else if ((x >> 4) == 0x0e) return 3; 80 | else if ((x >> 3) == 0x1e) return 4; 81 | else if ((x >> 2) == 0x3e) return 5; 82 | else if ((x >> 1) == 0x7e) return 6; 83 | else return 0; 84 | } 85 | 86 | 87 | 88 | 89 | inline void load_correct_actions(std::string file){ 90 | 91 | std::ifstream actionsFile(file); 92 | //correct_act_sent=new vector>(); 93 | std::string lineS; 94 | 95 | int count=-1; 96 | int sentence=-1; 97 | bool initial=false; 98 | bool first=true; 99 | wordsToInt[Corpus::BAD0] = 0; 100 | intToWords[0] = Corpus::BAD0; 101 | wordsToInt[Corpus::UNK] = 1; // unknown symbol 102 | intToWords[1] = Corpus::UNK; 103 | assert(max == 0); 104 | assert(maxPos == 0); 105 | max=2; 106 | maxPos=1; 107 | 108 | charsToInt[BAD0]=1; 109 | intToChars[1]="BAD0"; 110 | maxChars=1; 111 | 112 | std::vector current_sent; 113 | std::vector current_sent_pos; 114 | while (getline(actionsFile, lineS)){ 115 | //istringstream iss(line); 116 | //string lineS; 117 | //iss>>lineS; 118 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_"); 119 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_"); 120 | if (lineS.empty()) { 121 | count = 0; 122 | if (!first) { 123 | sentences[sentence] = current_sent; 124 | sentencesPos[sentence] = current_sent_pos; 125 | } 126 | 127 | sentence++; 128 | nsentences = sentence; 129 | 130 | initial = true; 131 | current_sent.clear(); 132 | current_sent_pos.clear(); 133 | } else if (count == 0) { 134 | first = false; 135 | //stack and buffer, for now, leave it like this. 
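// Oracle files alternate a state line (the stack/buffer dump) with an action
// line; having just read a state line, the next line is expected to be an action.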
136 | count = 1; 137 | if (initial) { 138 | // the initial line in each sentence may look like: 139 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT] 140 | // first, get rid of the square brackets. 141 | lineS = lineS.substr(3, lineS.size() - 4); 142 | // read the initial line, token by token "the-det," "cat-noun," ... 143 | std::istringstream iss(lineS); 144 | do { 145 | std::string word; 146 | iss >> word; 147 | if (word.size() == 0) { continue; } 148 | // remove the trailing comma if need be. 149 | if (word[word.size() - 1] == ',') { 150 | word = word.substr(0, word.size() - 1); 151 | } 152 | // split the string (at '-') into word and POS tag. 153 | size_t posIndex = word.rfind('-'); 154 | if (posIndex == std::string::npos) { 155 | std::cerr << "cant find the dash in '" << word << "'" << std::endl; 156 | } 157 | assert(posIndex != std::string::npos); 158 | std::string pos = word.substr(posIndex + 1); 159 | word = word.substr(0, posIndex); 160 | // new POS tag 161 | if (posToInt[pos] == 0) { 162 | posToInt[pos] = maxPos; 163 | intToPos[maxPos] = pos; 164 | npos = maxPos; 165 | maxPos++; 166 | } 167 | 168 | // new word 169 | if (wordsToInt[word] == 0) { 170 | wordsToInt[word] = max; 171 | intToWords[max] = word; 172 | nwords = max; 173 | max++; 174 | 175 | unsigned j = 0; 176 | while(j < word.length()) { 177 | std::string wj = ""; 178 | for (unsigned h = j; h < j + UTF8Len(word[j]); h++) { 179 | wj += word[h]; 180 | } 181 | if (charsToInt[wj] == 0) { 182 | charsToInt[wj] = maxChars; 183 | intToChars[maxChars] = wj; 184 | maxChars++; 185 | } 186 | j += UTF8Len(word[j]); 187 | } 188 | } 189 | 190 | current_sent.push_back(wordsToInt[word]); 191 | current_sent_pos.push_back(posToInt[pos]); 192 | } while(iss); 193 | } 194 | initial=false; 195 | } 196 | else if (count==1){ 197 | int i=0; 198 | bool found=false; 199 | for (auto a: actions) { 200 | if (a==lineS) { 201 | std::vector a=correct_act_sent[sentence]; 202 | a.push_back(i); 203 | correct_act_sent[sentence]=a; 204 | found=true; 205 | } 206 | i++; 207 | } 208 | if (!found) { 209 | actions.push_back(lineS); 210 | std::vector a=correct_act_sent[sentence]; 211 | a.push_back(actions.size()-1); 212 | correct_act_sent[sentence]=a; 213 | } 214 | count=0; 215 | } 216 | } 217 | 218 | // Add the last sentence. 219 | if (current_sent.size() > 0) { 220 | sentences[sentence] = current_sent; 221 | sentencesPos[sentence] = current_sent_pos; 222 | sentence++; 223 | nsentences = sentence; 224 | } 225 | 226 | actionsFile.close(); 227 | /* std::string oov="oov"; 228 | posToInt[oov]=maxPos; 229 | intToPos[maxPos]=oov; 230 | npos=maxPos; 231 | maxPos++; 232 | wordsToInt[oov]=max; 233 | intToWords[max]=oov; 234 | nwords=max; 235 | max++;*/ 236 | 237 | std::cerr<<"done"<<"\n"; 238 | for (auto a: actions) { 239 | std::cerr< 1); 267 | assert(max > 3); 268 | int count = -1; 269 | int sentence = -1; 270 | bool initial = false; 271 | bool first = true; 272 | std::vector current_sent; 273 | std::vector current_sent_pos; 274 | std::vector current_sent_str; 275 | while (getline(actionsFile, lineS)) { 276 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_"); 277 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_"); 278 | if (lineS.empty()) { 279 | // an empty line marks the end of a sentence. 
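// Resetting 'count' means the next non-empty line is read as the initial
// stack/buffer state of the following sentence.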
280 | count = 0; 281 | if (!first) { 282 | sentencesDev[sentence] = current_sent; 283 | sentencesPosDev[sentence] = current_sent_pos; 284 | sentencesStrDev[sentence] = current_sent_str; 285 | } 286 | 287 | sentence++; 288 | nsentencesDev = sentence; 289 | 290 | initial = true; 291 | current_sent.clear(); 292 | current_sent_pos.clear(); 293 | current_sent_str.clear(); 294 | } else if (count == 0) { 295 | first = false; 296 | //stack and buffer, for now, leave it like this. 297 | count = 1; 298 | if (initial) { 299 | // the initial line in each sentence may look like: 300 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT] 301 | // first, get rid of the square brackets. 302 | lineS = lineS.substr(3, lineS.size() - 4); 303 | // read the initial line, token by token "the-det," "cat-noun," ... 304 | std::istringstream iss(lineS); 305 | do { 306 | std::string word; 307 | iss >> word; 308 | if (word.size() == 0) { continue; } 309 | // remove the trailing comma if need be. 310 | if (word[word.size() - 1] == ',') { 311 | word = word.substr(0, word.size() - 1); 312 | } 313 | // split the string (at '-') into word and POS tag. 314 | size_t posIndex = word.rfind('-'); 315 | assert(posIndex != std::string::npos); 316 | std::string pos = word.substr(posIndex + 1); 317 | word = word.substr(0, posIndex); 318 | // new POS tag 319 | if (posToInt[pos] == 0) { 320 | posToInt[pos] = maxPos; 321 | intToPos[maxPos] = pos; 322 | npos = maxPos; 323 | maxPos++; 324 | } 325 | // add an empty string for any token except OOVs (it is easy to 326 | // recover the surface form of non-OOV using intToWords(id)). 327 | current_sent_str.push_back(""); 328 | // OOV word 329 | if (wordsToInt[word] == 0) { 330 | if (USE_SPELLING) { 331 | max = nwords + 1; 332 | //std::cerr<< "max:" << max << "\n"; 333 | wordsToInt[word] = max; 334 | intToWords[max] = word; 335 | nwords = max; 336 | } else { 337 | // save the surface form of this OOV before overwriting it. 338 | current_sent_str[current_sent_str.size()-1] = word; 339 | word = Corpus::UNK; 340 | } 341 | } 342 | current_sent.push_back(wordsToInt[word]); 343 | current_sent_pos.push_back(posToInt[pos]); 344 | } while(iss); 345 | } 346 | initial = false; 347 | } else if (count == 1) { 348 | auto actionIter = std::find(actions.begin(), actions.end(), lineS); 349 | if (actionIter != actions.end()) { 350 | unsigned actionIndex = std::distance(actions.begin(), actionIter); 351 | correct_act_sentDev[sentence].push_back(actionIndex); 352 | } else { 353 | // TODO: right now, new actions which haven't been observed in training 354 | // are not added to correct_act_sentDev. This may be a problem if the 355 | // training data is little. 356 | } 357 | count=0; 358 | } 359 | } 360 | 361 | // Add the last sentence. 
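// Guards against oracle files that lack a trailing blank line, which would
// otherwise drop their final sentence.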
362 | if (current_sent.size() > 0) { 363 | sentencesDev[sentence] = current_sent; 364 | sentencesPosDev[sentence] = current_sent_pos; 365 | sentencesStrDev[sentence] = current_sent_str; 366 | sentence++; 367 | nsentencesDev = sentence; 368 | } 369 | 370 | actionsFile.close(); 371 | } 372 | 373 | void ReplaceStringInPlace(std::string& subject, const std::string& search, 374 | const std::string& replace) { 375 | size_t pos = 0; 376 | while ((pos = subject.find(search, pos)) != std::string::npos) { 377 | subject.replace(pos, search.length(), replace); 378 | pos += replace.length(); 379 | } 380 | } 381 | 382 | 383 | /* inline unsigned max() const { return words_.size(); } 384 | inline unsigned size() const { return words_.size(); } 385 | inline unsigned count(const std::string& word) const { return d_.count(word); }*/ 386 | 387 | /* static bool is_ws(char x) { 388 | return (x == ' ' || x == '\t'); 389 | } 390 | 391 | inline void ConvertWhitespaceDelimitedLine(const std::string& line, std::vector* out) { 392 | size_t cur = 0; 393 | size_t last = 0; 394 | int state = 0; 395 | out->clear(); 396 | while(cur < line.size()) { 397 | if (is_ws(line[cur++])) { 398 | if (state == 0) continue; 399 | out->push_back(Convert(line.substr(last, cur - last - 1))); 400 | state = 0; 401 | } else { 402 | if (state == 1) continue; 403 | last = cur - 1; 404 | state = 1; 405 | } 406 | } 407 | if (state == 1) 408 | out->push_back(Convert(line.substr(last, cur - last))); 409 | } 410 | 411 | inline unsigned Convert(const std::string& word, bool frozen = false) { 412 | Map::iterator i = d_.find(word); 413 | if (i == d_.end()) { 414 | if (frozen) 415 | return 0; 416 | words_.push_back(word); 417 | d_[word] = words_.size(); 418 | return words_.size(); 419 | } else { 420 | return i->second; 421 | } 422 | } 423 | 424 | inline const std::string& Convert(const unsigned id) const { 425 | if (id == 0) return b0_; 426 | return words_[id-1]; 427 | } 428 | template void serialize(Archive& ar, const unsigned int version) { 429 | ar & b0_; 430 | ar & words_; 431 | ar & d_; 432 | } 433 | private: 434 | std::string b0_; 435 | std::vector words_; 436 | Map d_;*/ 437 | }; 438 | 439 | /*void ReadFromFile(const std::string& filename, 440 | Corpus* d, 441 | std::vector >* src, 442 | std::set* src_vocab) { 443 | std::cerr << "Reading from " << filename << std::endl; 444 | std::ifstream in(filename); 445 | assert(in); 446 | std::string line; 447 | int lc = 0; 448 | while(getline(in, line)) { 449 | ++lc; 450 | src->push_back(std::vector()); 451 | d->ConvertWhitespaceDelimitedLine(line, &src->back()); 452 | for (unsigned i = 0; i < src->back().size(); ++i) src_vocab->insert(src->back()[i]); 453 | } 454 | } 455 | 456 | void ReadParallelCorpusFromFile(const std::string& filename, 457 | Corpus* d, 458 | std::vector >* src, 459 | std::vector >* trg, 460 | std::set* src_vocab, 461 | std::set* trg_vocab) { 462 | src->clear(); 463 | trg->clear(); 464 | std::cerr << "Reading from " << filename << std::endl; 465 | std::ifstream in(filename); 466 | assert(in); 467 | std::string line; 468 | int lc = 0; 469 | std::vector v; 470 | const unsigned kDELIM = d->Convert("|||"); 471 | while(getline(in, line)) { 472 | ++lc; 473 | src->push_back(std::vector()); 474 | trg->push_back(std::vector()); 475 | d->ConvertWhitespaceDelimitedLine(line, &v); 476 | unsigned j = 0; 477 | while(j < v.size() && v[j] != kDELIM) { 478 | src->back().push_back(v[j]); 479 | src_vocab->insert(v[j]); 480 | ++j; 481 | } 482 | if (j >= v.size()) { 483 | std::cerr << "Malformed input in 
parallel corpus: " << filename << ":" << lc << std::endl; 484 | abort(); 485 | } 486 | ++j; 487 | while(j < v.size()) { 488 | trg->back().push_back(v[j]); 489 | trg_vocab->insert(v[j]); 490 | ++j; 491 | } 492 | } 493 | }*/ 494 | 495 | } // namespace 496 | 497 | #endif 498 | -------------------------------------------------------------------------------- /ner-system/lstm-parse.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | // #include 19 | // #include 20 | #include 21 | 22 | #include "dynet/training.h" 23 | #include "dynet/dynet.h" 24 | #include "dynet/expr.h" 25 | #include "dynet/nodes.h" 26 | #include "dynet/lstm.h" 27 | #include "dynet/rnn.h" 28 | #include "dynet/io.h" 29 | #include "c2.h" 30 | 31 | cpyp::Corpus corpus; 32 | volatile bool requested_stop = false; 33 | unsigned LAYERS = 2; 34 | unsigned INPUT_DIM = 40; 35 | unsigned HIDDEN_DIM = 60; 36 | unsigned ACTION_DIM = 36; 37 | unsigned PRETRAINED_DIM = 50; 38 | unsigned LSTM_INPUT_DIM = 60; 39 | unsigned POS_DIM = 10; 40 | unsigned REL_DIM = 8; 41 | 42 | 43 | unsigned LSTM_CHAR_OUTPUT_DIM = 100; //Miguel 44 | bool USE_SPELLING = false; 45 | 46 | float DROPOUT = 0.0f; 47 | 48 | bool USE_POS = false; 49 | 50 | constexpr const char* ROOT_SYMBOL = "ROOT"; 51 | unsigned kROOT_SYMBOL = 0; 52 | unsigned ACTION_SIZE = 0; 53 | unsigned VOCAB_SIZE = 0; 54 | unsigned POS_SIZE = 0; 55 | 56 | unsigned CHAR_SIZE = 255; //size of ascii chars... Miguel 57 | 58 | using namespace dynet; 59 | using namespace std; 60 | namespace po = boost::program_options; 61 | 62 | vector possible_actions; 63 | unordered_map> pretrained; 64 | 65 | void InitCommandLine(int argc, char** argv, po::variables_map* conf) { 66 | po::options_description opts("Configuration options"); 67 | opts.add_options() 68 | ("training_data,T", po::value(), "List of Transitions - Training corpus") 69 | ("dev_data,d", po::value(), "Development corpus") 70 | ("test_data,p", po::value(), "Test corpus") 71 | ("dropout,D", po::value(), "Dropout rate") 72 | ("unk_strategy,o", po::value()->default_value(1), "Unknown word strategy: 1 = singletons become UNK with probability unk_prob") 73 | ("unk_prob,u", po::value()->default_value(0.2), "Probably with which to replace singletons with UNK in training data") 74 | ("model,m", po::value(), "Load saved model from this file") 75 | ("use_pos_tags,P", "make POS tags visible to parser") 76 | ("beam_size,b", po::value()->default_value(1), "beam size") 77 | ("layers", po::value()->default_value(2), "number of LSTM layers") 78 | ("action_dim", po::value()->default_value(16), "action embedding size") 79 | ("input_dim", po::value()->default_value(32), "input embedding size") 80 | ("hidden_dim", po::value()->default_value(64), "hidden dimension") 81 | ("pretrained_dim", po::value()->default_value(50), "pretrained input dimension") 82 | ("pos_dim", po::value()->default_value(12), "POS dimension") 83 | ("rel_dim", po::value()->default_value(10), "relation dimension") 84 | ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") 85 | ("train,t", "Should training be run?") 86 | ("words,w", po::value(), "Pretrained word embeddings") 87 | ("use_spelling,S", "Use spelling model") //Miguel. 
Spelling model 88 | ("help,h", "Help"); 89 | po::options_description dcmdline_options; 90 | dcmdline_options.add(opts); 91 | po::store(parse_command_line(argc, argv, dcmdline_options), *conf); 92 | if (conf->count("help")) { 93 | cerr << dcmdline_options << endl; 94 | exit(1); 95 | } 96 | if (conf->count("training_data") == 0) { 97 | cerr << "Please specify --traing_data (-T): this is required to determine the vocabulary mapping, even if the parser is used in prediction mode.\n"; 98 | exit(1); 99 | } 100 | } 101 | 102 | struct ParserBuilder { 103 | 104 | LSTMBuilder stack_lstm; // (layers, input, hidden, trainer) 105 | LSTMBuilder output_lstm; // (layers, input, hidden, trainer) 106 | LSTMBuilder buffer_lstm; 107 | LSTMBuilder action_lstm; 108 | 109 | 110 | LSTMBuilder ent_lstm_fwd; 111 | LSTMBuilder ent_lstm_rev; 112 | 113 | LookupParameter p_w; // word embeddings 114 | LookupParameter p_t; // pretrained word embeddings (not updated) 115 | LookupParameter p_a; // input action embeddings 116 | LookupParameter p_r; // relation embeddings 117 | LookupParameter p_p; // pos tag embeddings 118 | Parameter p_pbias; // parser state bias 119 | Parameter p_A; // action lstm to parser state 120 | Parameter p_B; // buffer lstm to parser state 121 | Parameter p_O; // output lstm to parser state 122 | 123 | Parameter p_S; // stack lstm to parser state 124 | Parameter p_H; // head matrix for composition function 125 | Parameter p_D; // dependency matrix for composition function 126 | Parameter p_R; // relation matrix for composition function 127 | Parameter p_w2l; // word to LSTM input 128 | Parameter p_p2l; // POS to LSTM input 129 | Parameter p_t2l; // pretrained word embeddings to LSTM input 130 | Parameter p_ib; // LSTM input bias 131 | Parameter p_cbias; // composition function bias 132 | Parameter p_p2a; // parser state to action 133 | Parameter p_action_start; // action bias 134 | Parameter p_abias; // action bias 135 | Parameter p_buffer_guard; // end of buffer 136 | Parameter p_stack_guard; // end of stack 137 | Parameter p_output_guard; // end of output buffer 138 | 139 | Parameter p_start_of_word;//Miguel -->dummy symbol 140 | Parameter p_end_of_word; //Miguel --> dummy symbol 141 | LookupParameter char_emb; //Miguel-> mapping of characters to vectors 142 | 143 | 144 | LSTMBuilder fw_char_lstm; // Miguel 145 | LSTMBuilder bw_char_lstm; //Miguel 146 | 147 | Parameter p_cW; 148 | 149 | 150 | explicit ParserBuilder(ParameterCollection & model, const unordered_map>& pretrained) : 151 | stack_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 152 | output_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 153 | buffer_lstm(LAYERS, LSTM_INPUT_DIM, HIDDEN_DIM, model), 154 | action_lstm(LAYERS, ACTION_DIM, HIDDEN_DIM, model), 155 | ent_lstm_fwd(LAYERS, LSTM_INPUT_DIM, LSTM_INPUT_DIM, model), 156 | ent_lstm_rev(LAYERS, LSTM_INPUT_DIM, LSTM_INPUT_DIM, model), 157 | p_w(model.add_lookup_parameters(VOCAB_SIZE, {INPUT_DIM})), 158 | p_a(model.add_lookup_parameters(ACTION_SIZE, {ACTION_DIM})), 159 | p_r(model.add_lookup_parameters(ACTION_SIZE, {REL_DIM})), 160 | p_pbias(model.add_parameters({HIDDEN_DIM})), 161 | p_A(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 162 | p_B(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 163 | p_O(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 164 | p_S(model.add_parameters({HIDDEN_DIM, HIDDEN_DIM})), 165 | p_H(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM})), 166 | p_D(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM})), 167 | 
p_R(model.add_parameters({LSTM_INPUT_DIM, REL_DIM})), 168 | p_w2l(model.add_parameters({LSTM_INPUT_DIM, INPUT_DIM})), 169 | p_ib(model.add_parameters({LSTM_INPUT_DIM})), 170 | p_cbias(model.add_parameters({LSTM_INPUT_DIM})), 171 | p_p2a(model.add_parameters({ACTION_SIZE, HIDDEN_DIM})), 172 | p_action_start(model.add_parameters({ACTION_DIM})), 173 | p_abias(model.add_parameters({ACTION_SIZE})), 174 | 175 | p_buffer_guard(model.add_parameters({LSTM_INPUT_DIM})), 176 | p_stack_guard(model.add_parameters({LSTM_INPUT_DIM})), 177 | p_output_guard(model.add_parameters({LSTM_INPUT_DIM})), 178 | 179 | p_start_of_word(model.add_parameters({LSTM_INPUT_DIM})), //Miguel 180 | p_end_of_word(model.add_parameters({LSTM_INPUT_DIM})), //Miguel 181 | 182 | char_emb(model.add_lookup_parameters(CHAR_SIZE, {INPUT_DIM})),//Miguel 183 | 184 | // fw_char_lstm(LAYERS, LSTM_CHAR_OUTPUT_DIM, LSTM_INPUT_DIM, model), //Miguel 185 | // bw_char_lstm(LAYERS, LSTM_CHAR_OUTPUT_DIM, LSTM_INPUT_DIM, model), //Miguel 186 | 187 | fw_char_lstm(LAYERS, LSTM_INPUT_DIM, LSTM_CHAR_OUTPUT_DIM/2, model), //Miguel 188 | bw_char_lstm(LAYERS, LSTM_INPUT_DIM, LSTM_CHAR_OUTPUT_DIM/2, model), /*Miguel*/ 189 | p_cW(model.add_parameters({LSTM_INPUT_DIM, LSTM_INPUT_DIM * 2})) { //ner. { 190 | if (USE_POS) { 191 | p_p = model.add_lookup_parameters(POS_SIZE, {POS_DIM}); 192 | p_p2l = model.add_parameters({LSTM_INPUT_DIM, POS_DIM}); 193 | } 194 | if (pretrained.size() > 0) { 195 | p_t = model.add_lookup_parameters(VOCAB_SIZE, {PRETRAINED_DIM}); 196 | for (auto it : pretrained) 197 | p_t.initialize(it.first, it.second); 198 | p_t2l = model.add_parameters({LSTM_INPUT_DIM, PRETRAINED_DIM}); 199 | } else { 200 | p_t = nullptr; 201 | p_t2l = nullptr; 202 | } 203 | } 204 | 205 | static bool IsActionForbidden(const string& a, unsigned bsize, unsigned ssize, vector stacki) { 206 | 207 | bool is_shift = (a[0] == 'S'); //MIGUEL 208 | bool is_reduce = (a[0] == 'R'); 209 | bool is_output = (a[0] == 'O'); 210 | // std::cout<<"is red:"< 2 && // there is more than a single element on the stack 225 | is_shift) return true; 226 | // only attach left to ROOT 227 | if (bsize == 1 && ssize == 3 && a[0] == 'R') return true; 228 | return false; 229 | }*/ 230 | 231 | map compute_ents(unsigned sent_len, const vector& actions, const vector& setOfActions, map* pr = nullptr) { 232 | map r; 233 | map& rels = (pr ? 
*pr : r); 234 | for(unsigned i=0;i bufferi(sent_len + 1, 0), stacki(1, -999), outputi(1, -999); 236 | for (unsigned i = 0; i < sent_len; ++i) 237 | bufferi[sent_len - i] = i; 238 | bufferi[0] = -999; 239 | 240 | for (auto action: actions) { // loop over transitions for sentence 241 | const string& actionString=setOfActions[action]; 242 | // std::cout<<"int"< 1); // dummy symbol means > 1 (not >= 1) 246 | stacki.push_back(bufferi.back()); 247 | bufferi.pop_back(); 248 | } 249 | 250 | else if (ac=='R') { // REDUCE 251 | assert(stacki.size() > 1); // dummy symbol means > 2 (not >= 2) 252 | while(stacki.size()>1) { 253 | rels[stacki.back()] = actionString; 254 | outputi.push_back(stacki.back()); 255 | stacki.pop_back(); 256 | } 257 | } 258 | else if (ac =='O') { 259 | assert(bufferi.size() > 1); // dummy symbol means > 1 (not >= 1) 260 | outputi.push_back(bufferi.back()); 261 | rels[bufferi.back()] = "0"; 262 | bufferi.pop_back(); 263 | 264 | } 265 | } 266 | assert(bufferi.size() == 1); 267 | //assert(stacki.size() == 2); 268 | return rels; 269 | } 270 | 271 | 272 | // given the first character of a UTF8 block, find out how wide it is 273 | // see http://en.wikipedia.org/wiki/UTF-8 for more info 274 | inline unsigned int UTF8Len(unsigned char x) { 275 | if (x < 0x80) return 1; 276 | else if ((x >> 5) == 0x06) return 2; 277 | else if ((x >> 4) == 0x0e) return 3; 278 | else if ((x >> 3) == 0x1e) return 4; 279 | else if ((x >> 2) == 0x3e) return 5; 280 | else if ((x >> 1) == 0x7e) return 6; 281 | else return 0; 282 | } 283 | 284 | 285 | // *** if correct_actions is empty, this runs greedy decoding *** 286 | // returns parse actions for input sentence (in training just returns the reference) 287 | // OOV handling: raw_sent will have the actual words 288 | // sent will have words replaced by appropriate UNK tokens 289 | // this lets us use pretrained embeddings, when available, for words that were OOV in the 290 | // parser training data 291 | pair, Expression> log_prob_parser(ComputationGraph* hg, 292 | const vector& raw_sent, // raw sentence 293 | const vector& sent, // sent with oovs replaced 294 | const vector& sentPos, 295 | const vector& correct_actions, 296 | const vector& setOfActions, 297 | const map& intToWords, 298 | bool is_evaluation, 299 | double *right) { 300 | //for (unsigned i = 0; i < sent.size(); ++i) cerr << ' ' << intToWords.find(sent[i])->second; 301 | //cerr << endl; 302 | vector results; 303 | const bool build_training_graph = correct_actions.size() > 0; 304 | //std::cout<<"****************"<<"\n"; 305 | bool apply_dropout = (DROPOUT && !is_evaluation); 306 | 307 | stack_lstm.new_graph(*hg); 308 | buffer_lstm.new_graph(*hg); 309 | output_lstm.new_graph(*hg); 310 | action_lstm.new_graph(*hg); 311 | 312 | ent_lstm_fwd.new_graph(*hg); 313 | ent_lstm_rev.new_graph(*hg); 314 | 315 | 316 | 317 | if (apply_dropout) { 318 | stack_lstm.set_dropout(DROPOUT); 319 | action_lstm.set_dropout(DROPOUT); 320 | buffer_lstm.set_dropout(DROPOUT); 321 | ent_lstm_fwd.set_dropout(DROPOUT); 322 | ent_lstm_rev.set_dropout(DROPOUT); 323 | } else { 324 | stack_lstm.disable_dropout(); 325 | action_lstm.disable_dropout(); 326 | buffer_lstm.disable_dropout(); 327 | ent_lstm_fwd.disable_dropout(); 328 | ent_lstm_rev.disable_dropout(); 329 | } 330 | 331 | stack_lstm.start_new_sequence(); 332 | buffer_lstm.start_new_sequence(); 333 | output_lstm.start_new_sequence(); 334 | action_lstm.start_new_sequence(); 335 | // variables in the computation graph representing the parameters 336 | Expression pbias = 
parameter(*hg, p_pbias); 337 | // Expression H = parameter(*hg, p_H); 338 | // Expression D = parameter(*hg, p_D); 339 | Expression R = parameter(*hg, p_R); 340 | Expression cbias = parameter(*hg, p_cbias); 341 | Expression S = parameter(*hg, p_S); 342 | Expression B = parameter(*hg, p_B); 343 | Expression O = parameter(*hg, p_O); 344 | 345 | Expression A = parameter(*hg, p_A); 346 | Expression ib = parameter(*hg, p_ib); 347 | Expression w2l = parameter(*hg, p_w2l); 348 | Expression p2l; 349 | if (USE_POS) 350 | p2l = parameter(*hg, p_p2l); 351 | Expression t2l; 352 | if (p_t2l.p) 353 | t2l = parameter(*hg, p_t2l); 354 | Expression p2a = parameter(*hg, p_p2a); 355 | Expression abias = parameter(*hg, p_abias); 356 | Expression action_start = parameter(*hg, p_action_start); 357 | 358 | action_lstm.add_input(action_start); 359 | 360 | Expression cW = parameter(*hg, p_cW); 361 | 362 | vector buffer(sent.size() + 1); // variables representing word embeddings (possibly including POS info) 363 | vector bufferi(sent.size() + 1); // position of the words in the sentence 364 | // precompute buffer representation from left to right 365 | 366 | 367 | Expression word_end = parameter(*hg, p_end_of_word); //Miguel 368 | Expression word_start = parameter(*hg, p_start_of_word); //Miguel 369 | 370 | if (USE_SPELLING){ 371 | fw_char_lstm.new_graph(*hg); 372 | // fw_char_lstm.add_parameter_edges(hg); 373 | 374 | bw_char_lstm.new_graph(*hg); 375 | // bw_char_lstm.add_parameter_edges(hg); 376 | } 377 | 378 | 379 | 380 | for (unsigned i = 0; i < sent.size(); ++i) { 381 | assert(sent[i] < VOCAB_SIZE); 382 | //Expression w = lookup(*hg, p_w, sent[i]); 383 | 384 | unsigned wi=sent[i]; 385 | std::string ww=intToWords.at(wi); 386 | Expression w; 387 | /**********SPELLING MODEL*****************/ 388 | if (USE_SPELLING) { 389 | //std::cout<<"using spelling"<<"\n"; 390 | if (ww.length()==4 && ww[0]=='R' && ww[1]=='O' && ww[2]=='O' && ww[3]=='T'){ 391 | w=lookup(*hg, p_w, sent[i]); //we do not need a LSTM encoding for the root word, so we put it directly-. 392 | } 393 | else { 394 | 395 | fw_char_lstm.start_new_sequence(); 396 | //cerr<<"start_new_sequence done"<<"\n"; 397 | 398 | fw_char_lstm.add_input(word_start); 399 | //cerr<<"added start of word symbol"<<"\n"; 400 | /*for (unsigned j=0;jincremental_forward(); 408 | 409 | }*/ 410 | std::vector strevbuffer; 411 | for (unsigned j=0;jincremental_forward(); 425 | 426 | } 427 | fw_char_lstm.add_input(word_end); 428 | //cerr<<"added end of word symbol"<<"\n"; 429 | 430 | 431 | 432 | Expression fw_i=fw_char_lstm.back(); 433 | 434 | //cerr<<"fw_char_lstm.back() done"<<"\n"; 435 | 436 | bw_char_lstm.start_new_sequence(); 437 | //cerr<<"bw start new sequence done"<<"\n"; 438 | 439 | bw_char_lstm.add_input(word_end); 440 | //for (unsigned j=w.length()-1;j>=0;j--){ 441 | /*for (unsigned j=w.length();j-->0;){ 442 | //cerr<0;j=j-UTF8Len(w[j])) { 456 | 457 | //cerr<incremental_forward(); 467 | 468 | }*/ 469 | bw_char_lstm.add_input(word_start); 470 | //cerr<<"start symbol in bw seq"<<"\n"; 471 | 472 | Expression bw_i=bw_char_lstm.back(); 473 | 474 | vector tt = {fw_i, bw_i}; 475 | w=concatenate(tt); //and this goes into the buffer... 
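// i.e. the word representation is the concatenation of the final forward and
// backward character-LSTM states (each LSTM_CHAR_OUTPUT_DIM/2 wide).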
476 | //cerr<<"fw and bw done"<<"\n"; 477 | } 478 | 479 | } 480 | /**************************************************/ 481 | //cerr<<"concatenate?"<<"\n"; 482 | 483 | /***************NO SPELLING*************************************/ 484 | 485 | // Expression w = lookup(*hg, p_w, sent[i]); 486 | else { //NO SPELLING 487 | //Don't use SPELLING 488 | //std::cout<<"don't use spelling"<<"\n"; 489 | w=lookup(*hg, p_w, sent[i]); 490 | } 491 | 492 | Expression i_i; 493 | if (USE_POS) { 494 | Expression p = lookup(*hg, p_p, sentPos[i]); 495 | i_i = affine_transform({ib, w2l, w, p2l, p}); 496 | } else { 497 | i_i = affine_transform({ib, w2l, w}); 498 | } 499 | if (p_t.p && pretrained.count(raw_sent[i])) { 500 | Expression t = const_lookup(*hg, p_t, raw_sent[i]); 501 | i_i = affine_transform({i_i, t2l, t}); 502 | } 503 | buffer[sent.size() - i] = rectify(i_i); 504 | bufferi[sent.size() - i] = i; 505 | } 506 | // dummy symbol to represent the empty buffer 507 | buffer[0] = parameter(*hg, p_buffer_guard); 508 | bufferi[0] = -999; 509 | for (auto& b : buffer) 510 | buffer_lstm.add_input(b); 511 | 512 | vector stack; // variables representing subtree embeddings 513 | vector stacki; // position of words in the sentence of head of subtree 514 | stack.push_back(parameter(*hg, p_stack_guard)); 515 | stacki.push_back(-999); // not used for anything 516 | 517 | vector output; // variables representing subtree embeddings 518 | vector outputi; 519 | output.push_back(parameter(*hg, p_output_guard)); 520 | outputi.push_back(-999); // not used for anything 521 | // drive dummy symbol on stack through LSTM 522 | stack_lstm.add_input(stack.back()); 523 | output_lstm.add_input(output.back()); 524 | vector log_probs; 525 | string rootword; 526 | unsigned action_count = 0; // incremented at each prediction 527 | while(buffer.size() > 1 || stack.size()>1) { 528 | 529 | // get list of possible actions for the current parser state 530 | vector current_valid_actions; 531 | for (auto a: possible_actions) { 532 | if (IsActionForbidden(setOfActions[a], buffer.size(), stack.size(), stacki)) 533 | continue; 534 | current_valid_actions.push_back(a); 535 | } 536 | 537 | // p_t = pbias + S * slstm + B * blstm + A * almst 538 | Expression p_t = affine_transform({pbias, O, output_lstm.back(), S, stack_lstm.back(), B, buffer_lstm.back(), A, action_lstm.back()}); 539 | Expression nlp_t = rectify(p_t); 540 | // r_t = abias + p2a * nlp 541 | Expression r_t = affine_transform({abias, p2a, nlp_t}); 542 | 543 | // adist = log_softmax(r_t, current_valid_actions) 544 | Expression adiste = log_softmax(r_t, current_valid_actions); 545 | vector adist = as_vector(hg->incremental_forward(adiste)); 546 | double best_score = adist[current_valid_actions[0]]; 547 | unsigned best_a = current_valid_actions[0]; 548 | for (unsigned i = 1; i < current_valid_actions.size(); ++i) { 549 | if (adist[current_valid_actions[i]] > best_score) { 550 | best_score = adist[current_valid_actions[i]]; 551 | best_a = current_valid_actions[i]; 552 | } 553 | } 554 | unsigned action = best_a; 555 | if (build_training_graph) { // if we have reference actions (for training) use the reference action 556 | action = correct_actions[action_count]; 557 | if (best_a == action) { (*right)++; } 558 | } 559 | ++action_count; 560 | // action_log_prob = pick(adist, action) 561 | log_probs.push_back(pick(adiste, action)); 562 | results.push_back(action); 563 | //std::cout<<"action:"< 1); // dummy symbol means > 1 (not >= 1) 583 | stack.push_back(buffer.back()); 584 | 
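// mirror the SHIFT on the stack LSTM so its state tracks the new stack top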
stack_lstm.add_input(buffer.back()); 585 | stacki.push_back(bufferi.back()); 586 | buffer.pop_back(); 587 | buffer_lstm.rewind_one_step(); 588 | bufferi.pop_back(); 589 | } 590 | else if (ac=='R') { // REDUCE 591 | Expression previous; 592 | Expression comp; 593 | vector entities(stacki.size()); 594 | ent_lstm_fwd.start_new_sequence(); 595 | ent_lstm_rev.start_new_sequence(); 596 | for (unsigned i = 0; i < stacki.size(); ++i) { 597 | ent_lstm_fwd.add_input(stack[i]); 598 | ent_lstm_rev.add_input(stack[stacki.size() - i - 1]); 599 | } 600 | while(stacki.size()>1) { 601 | outputi.push_back(stacki.back()); 602 | stack_lstm.rewind_one_step(); 603 | stack.pop_back(); 604 | stacki.pop_back(); 605 | //COMPOSITION FUNCTION!! ?? 606 | //Expression composed = affine_transform({cbias, H, head, D, dep, R, relation}); 607 | //Expression nlcomposed = tanh(composed); 608 | } 609 | Expression efwd = ent_lstm_fwd.back(); 610 | Expression erev = ent_lstm_rev.back(); 611 | if (apply_dropout) { 612 | efwd = dropout(efwd, DROPOUT); 613 | erev = dropout(erev, DROPOUT); 614 | } 615 | Expression c = concatenate({efwd, erev}); 616 | //Expression c = concatenate({ent_lstm_fwd.back(), ent_lstm_rev.back()}); 617 | Expression composed = rectify(affine_transform({cbias, cW, c, R, relation})); 618 | output.push_back(composed); 619 | output_lstm.add_input(composed); 620 | 621 | } 622 | else if (ac =='O') { 623 | assert(bufferi.size() > 1); // dummy symbol means > 1 (not >= 1) 624 | outputi.push_back(bufferi.back()); 625 | output.push_back(buffer.back()); 626 | output_lstm.add_input(buffer.back()); 627 | buffer.pop_back(); 628 | bufferi.pop_back(); 629 | buffer_lstm.rewind_one_step(); 630 | 631 | } 632 | 633 | } 634 | assert(stack.size() == 1); // guard symbol, root 635 | assert(stacki.size() == 1); 636 | assert(buffer.size() == 1); // guard symbol 637 | assert(bufferi.size() == 1); 638 | Expression tot_neglogprob = -sum(log_probs); 639 | assert(tot_neglogprob.pg != nullptr); 640 | return std::make_pair(results, tot_neglogprob); 641 | } 642 | 643 | }; 644 | 645 | void signal_callback_handler(int /* signum */) { 646 | if (requested_stop) { 647 | cerr << "\nReceived SIGINT again, quitting.\n"; 648 | _exit(1); 649 | } 650 | cerr << "\nReceived SIGINT terminating optimization early...\n"; 651 | requested_stop = true; 652 | } 653 | 654 | unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { 655 | unsigned res = 0; 656 | for (unsigned i = 0; i < len; ++i) { 657 | auto ri = ref.find(i); 658 | auto hi = hyp.find(i); 659 | assert(ri != ref.end()); 660 | assert(hi != hyp.end()); 661 | if (ri->second.compare(hi->second)==0) ++res; 662 | } 663 | return res; 664 | } 665 | 666 | void output_conll(const vector& sentence, const vector& pos, 667 | const vector& sentenceUnkStrings, 668 | const map& intToWords, 669 | const map& intToPos, 670 | const map& rel_ref, 671 | const map& rel_hyp) { 672 | for (unsigned i = 0; i < (sentence.size()); ++i) { 673 | // auto index = i + 1; 674 | assert(i < sentenceUnkStrings.size() && 675 | ((sentence[i] == corpus.get_or_add_word(cpyp::Corpus::UNK) && 676 | sentenceUnkStrings[i].size() > 0) || 677 | (sentence[i] != corpus.get_or_add_word(cpyp::Corpus::UNK) && 678 | sentenceUnkStrings[i].size() == 0 && 679 | intToWords.find(sentence[i]) != intToWords.end()))); 680 | string wit = (sentenceUnkStrings[i].size() > 0)? 
681 | sentenceUnkStrings[i] : intToWords.find(sentence[i])->second; 682 | auto pit = intToPos.find(pos[i]); 683 | //assert(hyp.find(i) != hyp.end()); 684 | //auto hyp_head = hyp.find(i)->second + 1; 685 | //if (hyp_head == (int)sentence.size()) hyp_head = 0; 686 | auto hyp_rel_it = rel_hyp.find(i); 687 | auto ref_rel_it = rel_ref.find(i); 688 | assert(hyp_rel_it != rel_hyp.end()); 689 | auto hyp_rel = hyp_rel_it->second; 690 | auto ref_rel = ref_rel_it->second; 691 | size_t first_char_in_rel = hyp_rel.find('(') + 1; 692 | size_t last_char_in_rel = hyp_rel.rfind(')') - 1; 693 | size_t first_char_in_ref = ref_rel.find('(') + 1; 694 | size_t last_char_in_ref = ref_rel.rfind(')') - 1; 695 | 696 | hyp_rel = hyp_rel.substr(first_char_in_rel, last_char_in_rel - first_char_in_rel + 1); 697 | ref_rel = ref_rel.substr(first_char_in_ref, last_char_in_ref - first_char_in_ref + 1); 698 | if (hyp_rel.compare("0")!=0){ 699 | hyp_rel="I-"+hyp_rel; 700 | } 701 | else hyp_rel="O"; 702 | if (ref_rel.compare("0")!=0){ 703 | ref_rel="I-"+ref_rel; 704 | } 705 | else ref_rel="O"; 706 | 707 | 708 | //cout << index << '\t' // 1. ID 709 | cout << wit << ' ' // 2. FORM 710 | // << "_" << '\t' // 3. LEMMA 711 | // << "_" << '\t' // 4. CPOSTAG 712 | << pit->second << ' ' // 5. POSTAG 713 | << "_" << ' ' // 6. tree. _ empty? 714 | // << hyp_head << '\t' // 7. HEAD 715 | << ref_rel << ' ' // 8. DEPREL 716 | << hyp_rel << endl; // 8. DEPREL 717 | // << "_" << '\t' // 9. PHEAD 718 | // << "_" << endl; // 10. PDEPREL 719 | } 720 | cout << endl; 721 | } 722 | 723 | int main(int argc, char** argv) { 724 | dynet::initialize(argc, argv); 725 | 726 | cerr << "COMMAND:"; 727 | for (unsigned i = 0; i < static_cast(argc); ++i) cerr << ' ' << argv[i]; 728 | cerr << endl; 729 | unsigned status_every_i_iterations = 100; 730 | 731 | po::variables_map conf; 732 | InitCommandLine(argc, argv, &conf); 733 | USE_POS = conf.count("use_pos_tags"); 734 | if (conf.count("dropout")) 735 | DROPOUT = conf["dropout"].as(); 736 | 737 | USE_SPELLING=conf.count("use_spelling"); //Miguel 738 | corpus.USE_SPELLING=USE_SPELLING; 739 | 740 | LAYERS = conf["layers"].as(); 741 | INPUT_DIM = conf["input_dim"].as(); 742 | PRETRAINED_DIM = conf["pretrained_dim"].as(); 743 | HIDDEN_DIM = conf["hidden_dim"].as(); 744 | ACTION_DIM = conf["action_dim"].as(); 745 | LSTM_INPUT_DIM = conf["lstm_input_dim"].as(); 746 | POS_DIM = conf["pos_dim"].as(); 747 | REL_DIM = conf["rel_dim"].as(); 748 | // const unsigned beam_size = conf["beam_size"].as(); 749 | const unsigned unk_strategy = conf["unk_strategy"].as(); 750 | cerr << "Unknown word strategy: "; 751 | if (unk_strategy == 1) { 752 | cerr << "STOCHASTIC REPLACEMENT\n"; 753 | } else { 754 | abort(); 755 | } 756 | const double unk_prob = conf["unk_prob"].as(); 757 | assert(unk_prob >= 0.); assert(unk_prob <= 1.); 758 | ostringstream os; 759 | os << "parser_" << (USE_POS ? 
"pos" : "nopos") 760 | << '_' << LAYERS 761 | << '_' << INPUT_DIM 762 | << '_' << HIDDEN_DIM 763 | << '_' << ACTION_DIM 764 | << '_' << LSTM_INPUT_DIM 765 | << '_' << POS_DIM 766 | << '_' << REL_DIM 767 | << "-pid" << getpid() << ".params"; 768 | // int best_correct_heads = 0; 769 | double best_f1_score=-1.0; 770 | const string fname = os.str(); 771 | cerr << "Writing parameters to file: " << fname << endl; 772 | bool softlinkCreated = false; 773 | corpus.load_correct_actions(conf["training_data"].as()); 774 | const unsigned kUNK = corpus.get_or_add_word(cpyp::Corpus::UNK); 775 | kROOT_SYMBOL = corpus.get_or_add_word(ROOT_SYMBOL); 776 | 777 | if (conf.count("words")) { 778 | pretrained[kUNK] = vector(PRETRAINED_DIM, 0); 779 | cerr << "Loading from " << conf["words"].as() << " with" << PRETRAINED_DIM << " dimensions\n"; 780 | ifstream in(conf["words"].as().c_str()); 781 | string line; 782 | getline(in, line); 783 | vector v(PRETRAINED_DIM, 0); 784 | string word; 785 | while (getline(in, line)) { 786 | istringstream lin(line); 787 | lin >> word; 788 | for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; 789 | unsigned id = corpus.get_or_add_word(word); 790 | pretrained[id] = v; 791 | } 792 | } 793 | 794 | set training_vocab; // words available in the training corpus 795 | set singletons; 796 | { // compute the singletons in the parser's training data 797 | map counts; 798 | for (auto sent : corpus.sentences) 799 | for (auto word : sent.second) { training_vocab.insert(word); counts[word]++; } 800 | for (auto wc : counts) 801 | if (wc.second == 1) singletons.insert(wc.first); 802 | } 803 | 804 | // correct ner label set (not action) 805 | set label; 806 | map label2id; 807 | for (unsigned i = 0; i < corpus.actions.size(); ++i) { 808 | if (corpus.actions[i][0] == 'R') { 809 | label2id[corpus.actions[i]] = label.size(); 810 | 811 | cerr << corpus.actions[i] << " :: "<< label.size() << endl; 812 | label.insert(corpus.actions[i]); 813 | } 814 | } 815 | 816 | 817 | cerr << "Number of words: " << corpus.nwords << endl; 818 | VOCAB_SIZE = corpus.nwords + 1; 819 | 820 | cerr << "Number of UTF8 chars: " << corpus.maxChars << endl; 821 | if (corpus.maxChars>255) CHAR_SIZE=corpus.maxChars; 822 | 823 | ACTION_SIZE = corpus.nactions + 1; 824 | //POS_SIZE = corpus.npos + 1; 825 | POS_SIZE = corpus.npos + 10; 826 | possible_actions.resize(corpus.nactions); 827 | for (unsigned i = 0; i < corpus.nactions; ++i) 828 | possible_actions[i] = i; 829 | 830 | ParameterCollection model; 831 | ParserBuilder parser(model, pretrained); 832 | if (conf.count("model")) { 833 | TextFileLoader loader(conf["model"].as()); 834 | loader.populate(model); 835 | } 836 | 837 | // OOV words will be replaced by UNK tokens 838 | corpus.load_correct_actionsDev(conf["dev_data"].as()); 839 | if (USE_SPELLING) VOCAB_SIZE = corpus.nwords + 1; 840 | //TRAINING 841 | if (conf.count("train")) { 842 | signal(SIGINT, signal_callback_handler); 843 | SimpleSGDTrainer sgd(model); 844 | //MomentumSGDTrainer sgd(&model); 845 | float eta_decay = 0.08; 846 | //sgd.eta_decay = 0.05; 847 | cerr << "Training started."<<"\n"; 848 | vector order(corpus.nsentences); 849 | for (unsigned i = 0; i < corpus.nsentences; ++i) 850 | order[i] = i; 851 | double tot_seen = 0; 852 | status_every_i_iterations = min(status_every_i_iterations, corpus.nsentences); 853 | unsigned si = corpus.nsentences; 854 | cerr << "NUMBER OF TRAINING SENTENCES: " << corpus.nsentences << endl; 855 | unsigned trs = 0; 856 | double right = 0; 857 | double llh = 0; 858 | bool first = 
    bool first = true;
    int iter = -1;
    while (!requested_stop) {
      ++iter;
      for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) {
        if (si == corpus.nsentences) {
          si = 0;
          if (first) { first = false; } else { sgd.learning_rate *= 1 - eta_decay; }
          cerr << "**SHUFFLE\n";
          random_shuffle(order.begin(), order.end());
        }
        tot_seen += 1;
        const vector<unsigned>& sentence = corpus.sentences[order[si]];
        vector<unsigned> tsentence = sentence;
        if (unk_strategy == 1) {
          for (auto& w : tsentence)
            if (singletons.count(w) && dynet::rand01() < unk_prob) w = kUNK;
        }
        const vector<unsigned>& sentencePos = corpus.sentencesPos[order[si]];
        const vector<unsigned>& actions = corpus.correct_act_sent[order[si]];
        ComputationGraph hg;
        auto pred_loss = parser.log_prob_parser(&hg, sentence, tsentence, sentencePos, actions, corpus.actions, corpus.intToWords, false, &right);
        double lp = as_scalar(hg.incremental_forward(pred_loss.second));
        if (lp < 0) {
          cerr << "Log prob < 0 on sentence " << order[si] << ": lp=" << lp << endl;
          assert(lp >= 0.0);
        }
        hg.backward(pred_loss.second);
        sgd.update();
        llh += lp;
        ++si;
        trs += actions.size();
      }
      sgd.status();
      cerr << "update #" << iter << " (epoch " << (tot_seen / corpus.nsentences) << ")\tllh: " << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << endl;
      llh = trs = right = 0;

      static int logc = 0;
      ++logc;
      if (logc % 25 == 1) { // report on dev set
        unsigned dev_size = corpus.nsentencesDev;
        // dev_size = 100;
        double llh = 0;
        double trs = 0;
        double right = 0;
        double correct_heads = 0;
        double total_heads = 0;

        // TODO: make this robust to a variable number of labels
        int confusion[4][4] = {{0}};
        int class_tp[4] = {0};
        int class_fp[4] = {0};
        int class_fn[4] = {0};
        assert(label.size() <= 4); // guard: the fixed-size arrays above assume at most 4 labels

        auto t_start = std::chrono::high_resolution_clock::now();
        for (unsigned sii = 0; sii < dev_size; ++sii) {
          const vector<unsigned>& sentence = corpus.sentencesDev[sii];
          const vector<unsigned>& sentencePos = corpus.sentencesPosDev[sii];
          const vector<unsigned>& actions = corpus.correct_act_sentDev[sii];
          vector<unsigned> tsentence = sentence;
          if (!USE_SPELLING) {
            for (auto& w : tsentence)
              if (training_vocab.count(w) == 0) w = kUNK;
          }

          ComputationGraph hg;
          auto pred_loss = parser.log_prob_parser(&hg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
          vector<unsigned>& pred = pred_loss.first;
          double lp = 0;
          //vector<unsigned> pred = parser.log_prob_parser_beam(&hg, sentence, sentencePos, corpus.actions, beam_size, &lp);
          llh -= lp;
          trs += actions.size();
          map<int, string> ref = parser.compute_ents(sentence.size(), actions, corpus.actions);
          map<int, string> hyp = parser.compute_ents(sentence.size(), pred, corpus.actions);

          // update the confusion matrix (rows = predicted, columns = reference);
          // note: a predicted "0" (no entity) maps to id 0 here, conflating it with the first label
          for (unsigned i = 0; i < sentence.size() - 1; ++i) {
            auto ri = ref.find(i);
            auto hi = hyp.find(i);
            assert(ri != ref.end());
            assert(hi != hyp.end());

            if (ri->second != "0") {
              int pr = label2id[hi->second];
              int tr = label2id[ri->second];
              confusion[pr][tr]++;
            }
          }

          correct_heads += compute_correct(ref, hyp, sentence.size() - 1);
          total_heads += sentence.size() - 1;
        }

        // compute tp, fp and fn for each class
        cerr << "confusion matrix" << endl;
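        // NOTE: confusion[p][t] counts tokens predicted as class p whose true
        // class is t, so the diagonal holds the true positives, off-diagonal
        // row sums give false positives, and off-diagonal column sums give
        // false negatives, which is exactly how class_tp/fp/fn are filled in
        // the loop below.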
        for (unsigned i = 0; i < label.size(); ++i) {
          for (unsigned j = 0; j < label.size(); ++j) {
            cerr << i << " " << j << " " << confusion[i][j] << endl;
            if (i == j) {
              class_tp[i] = confusion[i][j];
            } else {
              class_fp[i] += confusion[i][j];
              class_fn[i] += confusion[j][i];
            }
          }
        }

        // compute precision, recall and F1, guarding against division by zero
        double global_f1_score = 0;
        for (unsigned i = 0; i < label.size(); ++i) {
          double precision = (class_tp[i] + class_fp[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fp[i]) : 0.0;
          double recall = (class_tp[i] + class_fn[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fn[i]) : 0.0;
          double f1_score = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
          global_f1_score += f1_score / label.size(); // macro-averaged F1

          cerr << i << " class Precision " << precision << " Recall " << recall << " F1 " << f1_score << endl;
        }
        cerr << "F1 >> " << global_f1_score << endl;

        auto t_end = std::chrono::high_resolution_clock::now();
        cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " f1: " << global_f1_score << "\t[" << dev_size << " sents in " << std::chrono::duration<double, std::milli>(t_end - t_start).count() << " ms]" << endl;
        if (global_f1_score > best_f1_score) {
          best_f1_score = global_f1_score;
          TextFileSaver saver(fname);
          saver.save(model);
          // Create a soft link to the most recent model in order to make it
          // easier to refer to it in a shell script.
          if (!softlinkCreated) {
            string softlink = " latest_model";
            if (system((string("rm -f ") + softlink).c_str()) == 0 &&
                system((string("ln -s ") + fname + softlink).c_str()) == 0) {
              cerr << "Created " << softlink << " as a soft link to " << fname
                   << " for convenience." << endl;
            }
            softlinkCreated = true;
          }
        }
      }
    }
  } // end of training
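  // NOTE (sketch): the per-class scores above follow the usual definitions,
  //   precision_i = tp_i / (tp_i + fp_i),  recall_i = tp_i / (tp_i + fn_i),
  //   F1_i = 2 * precision_i * recall_i / (precision_i + recall_i),
  // and the reported "F1 >>" figure is the macro average over the label set,
  //   F1 = (1 / |labels|) * sum_i F1_i.
  // The evaluation block below repeats the same computation while also
  // emitting CoNLL-format output.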

  if (true) { // do test evaluation
    double llh = 0;
    double trs = 0;
    double right = 0;
    double correct_heads = 0;
    double total_heads = 0;
    // double f1score = 0.0;

    // TODO: make this robust to a variable number of labels
    int confusion[4][4] = {{0}};
    int class_tp[4] = {0};
    int class_fp[4] = {0};
    int class_fn[4] = {0};

    auto t_start = std::chrono::high_resolution_clock::now();
    unsigned corpus_size = corpus.nsentencesDev;
    for (unsigned sii = 0; sii < corpus_size; ++sii) {
      const vector<unsigned>& sentence = corpus.sentencesDev[sii];
      const vector<unsigned>& sentencePos = corpus.sentencesPosDev[sii];
      const vector<string>& sentenceUnkStr = corpus.sentencesStrDev[sii];
      const vector<unsigned>& actions = corpus.correct_act_sentDev[sii];
      vector<unsigned> tsentence = sentence;
      if (!USE_SPELLING) {
        for (auto& w : tsentence)
          if (training_vocab.count(w) == 0) w = kUNK;
      }
      ComputationGraph cg;
      double lp = 0;
      auto pred_loss = parser.log_prob_parser(&cg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
      vector<unsigned>& pred = pred_loss.first;
      // if (beam_size == 1)
      //   pred = parser.log_prob_parser(&cg, sentence, tsentence, sentencePos, vector<unsigned>(), corpus.actions, corpus.intToWords, true, &right);
      // else
      //   pred = parser.log_prob_parser_beam(&cg, sentence, tsentence, sentencePos, corpus.actions, beam_size, &lp);
      llh -= lp;
      trs += actions.size();
      map<int, string> rel_ref, rel_hyp;
      map<int, string> ref = parser.compute_ents(sentence.size(), actions, corpus.actions, &rel_ref);
      map<int, string> hyp = parser.compute_ents(sentence.size(), pred, corpus.actions, &rel_hyp);
      output_conll(sentence, sentencePos, sentenceUnkStr, corpus.intToWords, corpus.intToPos, ref, hyp);

      // update the confusion matrix (rows = predicted, columns = reference)
      for (unsigned i = 0; i < sentence.size() - 1; ++i) {
        auto ri = ref.find(i);
        auto hi = hyp.find(i);
        assert(ri != ref.end());
        assert(hi != hyp.end());

        if (ri->second != "0") {
          int pr = label2id[hi->second];
          int tr = label2id[ri->second];
          confusion[pr][tr]++;
        }
      }

      correct_heads += compute_correct(ref, hyp, sentence.size() - 1);
      total_heads += sentence.size() - 1;
    }

    // compute tp, fp and fn for each class
    cerr << "confusion matrix" << endl;
    for (unsigned i = 0; i < label.size(); ++i) {
      for (unsigned j = 0; j < label.size(); ++j) {
        cerr << i << " " << j << " " << confusion[i][j] << endl;
        if (i == j) {
          class_tp[i] = confusion[i][j];
        } else {
          class_fp[i] += confusion[i][j];
          class_fn[i] += confusion[j][i];
        }
      }
    }

    // compute precision, recall and F1, guarding against division by zero
    double global_f1_score = 0;
    for (unsigned i = 0; i < label.size(); ++i) {
      double precision = (class_tp[i] + class_fp[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fp[i]) : 0.0;
      double recall = (class_tp[i] + class_fn[i] > 0) ? (double)class_tp[i] / (class_tp[i] + class_fn[i]) : 0.0;
      double f1_score = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
      global_f1_score += f1_score / label.size(); // macro-averaged F1

      cerr << i << " class Precision " << precision << " Recall " << recall << " F1 " << f1_score << endl;
    }
    cerr << "F1 >> " << global_f1_score << endl;

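    // NOTE: in this greedy-decoding path lp is never populated (only the
    // commented-out beam decoder sets it), so llh stays 0 and the llh/ppl
    // fields reported below are uninformative; the macro F1 is the metric
    // to watch here.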
    auto t_end = std::chrono::high_resolution_clock::now();
    cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " f1: " << global_f1_score << "\t[" << corpus_size << " sents in " << std::chrono::duration<double, std::milli>(t_end - t_start).count() << " ms]" << endl;
  }
  for (unsigned i = 0; i < corpus.actions.size(); ++i) {
    //cerr << corpus.actions[i] << '\t' << parser.p_r->values[i].transpose() << endl;
    //cerr << corpus.actions[i] << '\t' << parser.p_p2a->values.col(i).transpose() << endl;
  }
}
--------------------------------------------------------------------------------