├── .gitignore
├── LICENSE
├── README.md
├── experiments
    ├── configs
    │   ├── ner
    │   │   └── conll03.json
    │   ├── parsing
    │   │   ├── biaffine.json
    │   │   ├── neuromst.json
    │   │   └── stackptr.json
    │   └── pos
    │   │   └── wsj.json
    ├── eval
    │   ├── conll03eval.v2
    │   └── conll06eval.pl
    ├── ner.py
    ├── parsing.py
    ├── pos_tagging.py
    └── scripts
    │   ├── run_analyze.sh
    │   ├── run_deepbiaf.sh
    │   ├── run_ner_conll03.sh
    │   ├── run_neuromst.sh
    │   ├── run_pos_wsj.sh
    │   └── run_stackptr.sh
└── neuronlp2
    ├── __init__.py
    ├── io
        ├── __init__.py
        ├── alphabet.py
        ├── common.py
        ├── conll03_data.py
        ├── conllx_data.py
        ├── conllx_stacked_data.py
        ├── instance.py
        ├── logger.py
        ├── reader.py
        ├── utils.py
        └── writer.py
    ├── models
        ├── __init__.py
        ├── parsing.py
        └── sequence_labeling.py
    ├── nn
        ├── __init__.py
        ├── _functions
        │   ├── __init__.py
        │   ├── rnnFusedBackend.py
        │   ├── skipconnect_rnn.py
        │   └── variational_rnn.py
        ├── crf.py
        ├── init.py
        ├── modules.py
        ├── skip_rnn.py
        ├── utils.py
        └── variational_rnn.py
    ├── optim
        ├── __init__.py
        └── lr_scheduler.py
    ├── tasks
        ├── __init__.py
        └── parser.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # data dir
104 | /experiments/data/
105 | 
106 | # model dir
107 | /experiments/models/
108 | 
109 | # log dir
110 | /experiments/log/
111 | 
112 | # test dir
113 | /tests/
114 | 
115 | # IDE 
116 | /.idea/
117 | *.iml
118 | *.sublime-project
119 | *.sublime-workspace
120 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NeuroNLP2
 2 | Deep neural models for core NLP tasks based on Pytorch(version 2)
 3 | 
 4 | This is the code we used in the following papers
 5 | >[End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF](http://www.cs.cmu.edu/~xuezhem/publications/P16-1101.pdf)
 6 | 
 7 | >Xuezhe Ma, Eduard Hovy
 8 | 
 9 | >ACL 2016
10 | 
11 | >[Neural Probabilistic Model for Non-projective MST Parsing](http://www.cs.cmu.edu/~xuezhem/publications/IJCNLP2017.pdf)
12 | 
13 | >Xuezhe Ma, Eduard Hovy
14 | 
15 | >IJCNLP 2017
16 | 
17 | >[Stack-Pointer Networks for Dependency Parsing](https://arxiv.org/pdf/1805.01087.pdf)
18 | 
19 | >Xuezhe Ma, Zecong Hu, Jingzhou Liu, Nanyun Peng, Graham Neubig and Eduard Hovy
20 | 
21 | >ACL 2018
22 | 
23 | It also includes the re-implementation of the Stanford Deep BiAffine Parser:
24 | >[Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734)
25 | 
26 | >Timothy Dozat, Christopher D. Manning
27 | 
28 | >ICLR 2017
29 | 
30 | ## Updates
31 | 1. Upgraded the code to support PyTorch 1.3 and Python 3.6
32 | 2. Re-factored code to better organization
33 | 3. Implemented the batch version of Stack-Pointer Parser decoding algorithm, about 50 times faster!
34 | 
35 | ## Requirements
36 | 
37 | Python 3.6, PyTorch >=1.3.1, Gensim >= 0.12.0
38 | 
39 | ## Data format
40 | For the data format used in our implementation, please read this [issue](https://github.com/XuezheMax/NeuroNLP2/issues/9).
41 | 
42 | ## Running the experiments
43 | First to the experiments folder:
44 | 
45 |     cd experiments
46 | ### Sequence labeling
47 | To train a CRF POS tagger of PTB WSJ corpus, 
48 | 
49 |     ./scripts/run_pos_wsj.sh
50 | where the arguments for ```train/dev/test``` data, together with the pretrained word embedding should be setup.
51 | 
52 | To train a NER model on CoNLL-2003 English data set,
53 | 
54 |     ./scripts/run_ner_conll03.sh
55 | 
56 | ### Dependency Parsing
57 | To train a Stack-Pointer parser, simply run
58 | 
59 |     ./scripts/run_stackptr.sh
60 | Remeber to setup the paths for data and embeddings.
61 | 
62 | To train a Deep BiAffine parser, simply run
63 | 
64 |     ./scripts/run_deepbiaf.sh
65 | Again, remember to setup the paths for data and embeddings.
66 | 
67 | To train a Neural MST parser, 
68 | 
69 |     ./scripts/run_neuromst.sh
70 | 


--------------------------------------------------------------------------------
/experiments/configs/ner/conll03.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "crf": true,
 3 |   "bigram": true,
 4 |   "embedd_dim": 100,
 5 |   "char_dim": 30,
 6 |   "rnn_mode": "LSTM",
 7 |   "num_layers":1,
 8 |   "hidden_size": 256,
 9 |   "out_features": 128,
10 |   "dropout": "std",
11 |   "p_in": 0.33,
12 |   "p_out": 0.5,
13 |   "p_rnn": [0.33, 0.5],
14 |   "activation": "elu"
15 | }
16 | 


--------------------------------------------------------------------------------
/experiments/configs/parsing/biaffine.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "model": "DeepBiAffine",
 3 |   "word_dim": 100,
 4 |   "char_dim": 100,
 5 |   "pos": true,
 6 |   "pos_dim": 100,
 7 |   "rnn_mode": "FastLSTM",
 8 |   "num_layers":3,
 9 |   "hidden_size": 512,
10 |   "arc_space": 512,
11 |   "type_space": 128,
12 |   "p_in": 0.33,
13 |   "p_out": 0.33,
14 |   "p_rnn": [0.33, 0.33],
15 |   "activation": "elu"
16 | }
17 | 


--------------------------------------------------------------------------------
/experiments/configs/parsing/neuromst.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "model": "NeuroMST",
 3 |   "word_dim": 100,
 4 |   "char_dim": 100,
 5 |   "pos": true,
 6 |   "pos_dim": 100,
 7 |   "rnn_mode": "FastLSTM",
 8 |   "num_layers":3,
 9 |   "hidden_size": 512,
10 |   "arc_space": 512,
11 |   "type_space": 128,
12 |   "p_in": 0.33,
13 |   "p_out": 0.33,
14 |   "p_rnn": [0.33, 0.33],
15 |   "activation": "elu"
16 | }
17 | 


--------------------------------------------------------------------------------
/experiments/configs/parsing/stackptr.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "model": "StackPtr",
 3 |   "word_dim": 100,
 4 |   "char_dim": 100,
 5 |   "pos": true,
 6 |   "pos_dim": 100,
 7 |   "rnn_mode": "FastLSTM",
 8 |   "encoder_layers":3,
 9 |   "decoder_layers":1,
10 |   "hidden_size": 512,
11 |   "arc_space": 512,
12 |   "type_space": 128,
13 |   "p_in": 0.33,
14 |   "p_out": 0.33,
15 |   "p_rnn": [0.33, 0.33],
16 |   "prior_order": "inside_out",
17 |   "grandPar": true,
18 |   "sibling": true,
19 |   "activation": "elu"
20 | }
21 | 


--------------------------------------------------------------------------------
/experiments/configs/pos/wsj.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "crf": true,
 3 |   "bigram": true,
 4 |   "embedd_dim": 100,
 5 |   "char_dim": 30,
 6 |   "rnn_mode": "LSTM",
 7 |   "num_layers":1,
 8 |   "hidden_size": 256,
 9 |   "out_features": 256,
10 |   "dropout": "std",
11 |   "p_in": 0.33,
12 |   "p_out": 0.5,
13 |   "p_rnn": [0.33, 0.5],
14 |   "activation": "elu"
15 | }
16 | 


--------------------------------------------------------------------------------
/experiments/eval/conll03eval.v2:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
  3 | # usage:     conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
  4 | #            README: http://cnts.uia.ac.be/conll2000/chunking/output.html
  5 | # options:   l: generate LaTeX output for tables like in
  6 | #               http://cnts.uia.ac.be/conll2003/ner/example.tex
  7 | #            r: accept raw result tags (without B- and I- prefix;
  8 | #                                       assumes one word per chunk)
  9 | #            d: alternative delimiter tag (default is single space)
 10 | #            o: alternative outside tag (default is O)
 11 | # note:      the file should contain lines with items separated
 12 | #            by $delimiter characters (default space). The final
 13 | #            two items should contain the correct tag and the 
 14 | #            guessed tag in that order. Sentences should be
 15 | #            separated from each other by empty lines or lines
 16 | #            with $boundary fields (default -X-).
 17 | # url:       http://lcg-www.uia.ac.be/conll2000/chunking/
 18 | # started:   1998-09-25
 19 | # version:   2004-01-26
 20 | # author:    Erik Tjong Kim Sang <erikt@uia.ua.ac.be>
 21 | 
 22 | use strict;
 23 | 
 24 | my $false = 0;
 25 | my $true = 42;
 26 | 
 27 | my $boundary = "-X-";     # sentence boundary
 28 | my $correct;              # current corpus chunk tag (I,O,B)
 29 | my $correctChunk = 0;     # number of correctly identified chunks
 30 | my $correctTags = 0;      # number of correct chunk tags
 31 | my $correctType;          # type of current corpus chunk tag (NP,VP,etc.)
 32 | my $delimiter = " ";      # field delimiter
 33 | my $FB1 = 0.0;            # FB1 score (Van Rijsbergen 1979)
 34 | my $firstItem;            # first feature (for sentence boundary checks)
 35 | my $foundCorrect = 0;     # number of chunks in corpus
 36 | my $foundGuessed = 0;     # number of identified chunks
 37 | my $guessed;              # current guessed chunk tag
 38 | my $guessedType;          # type of current guessed chunk tag
 39 | my $i;                    # miscellaneous counter
 40 | my $inCorrect = $false;   # currently processed chunk is correct until now
 41 | my $lastCorrect = "O";    # previous chunk tag in corpus
 42 | my $latex = 0;            # generate LaTeX formatted output
 43 | my $lastCorrectType = ""; # type of previously identified chunk tag
 44 | my $lastGuessed = "O";    # previously identified chunk tag
 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus
 46 | my $lastType;             # temporary storage for detecting duplicates
 47 | my $line;                 # line
 48 | my $nbrOfFeatures = -1;   # number of features per line
 49 | my $precision = 0.0;      # precision score
 50 | my $oTag = "O";           # outside tag, default O
 51 | my $raw = 0;              # raw input: add B to every token
 52 | my $recall = 0.0;         # recall score
 53 | my $tokenCounter = 0;     # token counter (ignores sentence breaks)
 54 | 
 55 | my %correctChunk = ();    # number of correctly identified chunks per type
 56 | my %foundCorrect = ();    # number of chunks in corpus per type
 57 | my %foundGuessed = ();    # number of identified chunks per type
 58 | 
 59 | my @features;             # features on line
 60 | my @sortedTypes;          # sorted list of chunk type names
 61 | 
 62 | # sanity check
 63 | while (@ARGV and $ARGV[0] =~ /^-/) {
 64 |    if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }
 65 |    elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }
 66 |    elsif ($ARGV[0] eq "-d") { 
 67 |       shift(@ARGV); 
 68 |       if (not defined $ARGV[0]) { 
 69 |          die "conlleval: -d requires delimiter character"; 
 70 |       }
 71 |       $delimiter = shift(@ARGV);
 72 |    } elsif ($ARGV[0] eq "-o") {
 73 |       shift(@ARGV);
 74 |       if (not defined $ARGV[0]) {
 75 |          die "conlleval: -o requires delimiter character";
 76 |       }
 77 |       $oTag = shift(@ARGV);
 78 |    } else { die "conlleval: unknown argument $ARGV[0]\n"; }
 79 | }
 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; }
 81 | # process input
 82 | while (<STDIN>) {
 83 |    chomp($line = $_);
 84 |    @features = split(/$delimiter/,$line);
 85 |    if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; }
 86 |    elsif ($nbrOfFeatures != $#features and @features != 0) {
 87 |       printf STDERR "unexpected number of features: %d (%d)\n",
 88 |          $#features+1,$nbrOfFeatures+1;
 89 |       exit(1);
 90 |    }
 91 |    if (@features == 0 or 
 92 |        $features[0] eq $boundary) { @features = ($boundary,"O","O"); }
 93 |    if (@features < 2) { 
 94 |       die "conlleval: unexpected number of features in line $line\n"; 
 95 |    }
 96 |    if ($raw) {
 97 |       if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 
 98 |       if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 
 99 |       if ($features[$#features] ne "O") { 
100 |          $features[$#features] = "B-$features[$#features]";
101 |       }
102 |       if ($features[$#features-1] ne "O") { 
103 |          $features[$#features-1] = "B-$features[$#features-1]";
104 |       }
105 |    }
106 |    # 20040126 ET code which allows hyphens in the types
107 |    if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
108 |       $guessed = $1;
109 |       $guessedType = $2;
110 |    } else { 
111 |       $guessed = $features[$#features]; 
112 |       $guessedType = ""; 
113 |    }
114 |    pop(@features);
115 |    if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
116 |       $correct = $1;
117 |       $correctType = $2;
118 |    } else { 
119 |       $correct = $features[$#features]; 
120 |       $correctType = ""; 
121 |    }
122 |    pop(@features);
123 | #  ($guessed,$guessedType) = split(/-/,pop(@features));
124 | #  ($correct,$correctType) = split(/-/,pop(@features));
125 |    $guessedType = $guessedType ? $guessedType : "";
126 |    $correctType = $correctType ? $correctType : "";
127 |    $firstItem = shift(@features);
128 | 
129 |    # 1999-06-26 sentence breaks should always be counted as out of chunk
130 |    if ( $firstItem eq $boundary ) { $guessed = "O"; }
131 | 
132 |    if ($inCorrect) {
133 |       if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
134 |            &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
135 |            $lastGuessedType eq $lastCorrectType) {
136 |          $inCorrect=$false;
137 |          $correctChunk++;
138 |          $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
139 |              $correctChunk{$lastCorrectType}+1 : 1;
140 |       } elsif ( 
141 |            &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 
142 |            &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or
143 |            $guessedType ne $correctType ) {
144 |          $inCorrect=$false; 
145 |       }
146 |    }
147 | 
148 |    if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 
149 |         &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
150 |         $guessedType eq $correctType) { $inCorrect = $true; }
151 | 
152 |    if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) {
153 |       $foundCorrect++; 
154 |       $foundCorrect{$correctType} = $foundCorrect{$correctType} ?
155 |           $foundCorrect{$correctType}+1 : 1;
156 |    }
157 |    if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) {
158 |       $foundGuessed++; 
159 |       $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
160 |           $foundGuessed{$guessedType}+1 : 1;
161 |    }
162 |    if ( $firstItem ne $boundary ) { 
163 |       if ( $correct eq $guessed and $guessedType eq $correctType ) { 
164 |          $correctTags++; 
165 |       }
166 |       $tokenCounter++; 
167 |    }
168 | 
169 |    $lastGuessed = $guessed;
170 |    $lastCorrect = $correct;
171 |    $lastGuessedType = $guessedType;
172 |    $lastCorrectType = $correctType;
173 | }
174 | if ($inCorrect) { 
175 |    $correctChunk++;
176 |    $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
177 |        $correctChunk{$lastCorrectType}+1 : 1;
178 | }
179 | 
180 | if (not $latex) {
181 |    # compute overall precision, recall and FB1 (default values are 0.0)
182 |    $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
183 |    $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
184 |    $FB1 = 2*$precision*$recall/($precision+$recall)
185 |       if ($precision+$recall > 0);
186 |    
187 |    # print overall performance
188 |    printf "processed $tokenCounter tokens with $foundCorrect phrases; ";
189 |    printf "found: $foundGuessed phrases; correct: $correctChunk.\n";
190 |    if ($tokenCounter>0) {
191 |       printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter;
192 |       printf "precision: %6.2f%%; ",$precision;
193 |       printf "recall: %6.2f%%; ",$recall;
194 |       printf "FB1: %6.2f\n",$FB1;
195 |    }
196 | }
197 | 
198 | # sort chunk type names
199 | undef($lastType);
200 | @sortedTypes = ();
201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) {
202 |    if (not($lastType) or $lastType ne $i) { 
203 |       push(@sortedTypes,($i));
204 |    }
205 |    $lastType = $i;
206 | }
207 | # print performance per chunk type
208 | if (not $latex) {
209 |    for $i (@sortedTypes) {
210 |       $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
211 |       if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; }
212 |       else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
213 |       if (not($foundCorrect{$i})) { $recall = 0.0; }
214 |       else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
215 |       if ($precision+$recall == 0.0) { $FB1 = 0.0; }
216 |       else { $FB1 = 2*$precision*$recall/($precision+$recall); }
217 |       printf "%17s: ",$i;
218 |       printf "precision: %6.2f%%; ",$precision;
219 |       printf "recall: %6.2f%%; ",$recall;
220 |       printf "FB1: %6.2f  %d\n",$FB1,$foundGuessed{$i};
221 |    }
222 | } else {
223 |    print "        & Precision &  Recall  & F\$_{\\beta=1} \\\\\\hline";
224 |    for $i (@sortedTypes) {
225 |       $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
226 |       if (not($foundGuessed{$i})) { $precision = 0.0; }
227 |       else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
228 |       if (not($foundCorrect{$i})) { $recall = 0.0; }
229 |       else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
230 |       if ($precision+$recall == 0.0) { $FB1 = 0.0; }
231 |       else { $FB1 = 2*$precision*$recall/($precision+$recall); }
232 |       printf "\n%-7s &  %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
233 |              $i,$precision,$recall,$FB1;
234 |    }
235 |    print "\\hline\n";
236 |    $precision = 0.0;
237 |    $recall = 0;
238 |    $FB1 = 0.0;
239 |    $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
240 |    $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
241 |    $FB1 = 2*$precision*$recall/($precision+$recall)
242 |       if ($precision+$recall > 0);
243 |    printf "Overall &  %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
244 |           $precision,$recall,$FB1;
245 | }
246 | 
247 | exit 0;
248 | 
249 | # endOfChunk: checks if a chunk ended between the previous and current word
250 | # arguments:  previous and current chunk tags, previous and current types
251 | # note:       this code is capable of handling other chunk representations
252 | #             than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
253 | #             Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
254 | 
255 | sub endOfChunk {
256 |    my $prevTag = shift(@_);
257 |    my $tag = shift(@_);
258 |    my $prevType = shift(@_);
259 |    my $type = shift(@_);
260 |    my $chunkEnd = $false;
261 | 
262 |    if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; }
263 |    if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; }
264 |    if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; }
265 | 
266 |    if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; }
267 |    if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; }
268 |    if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
269 | 
270 |    if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; }
271 |    if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; }
272 |    if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; }
273 |    if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; }
274 |    if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; }
275 | 
276 |    if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; }
277 |    if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; }
278 |    if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; }
279 |    if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; }
280 |    if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; }
281 |    
282 | 
283 |    if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) { 
284 |       $chunkEnd = $true; 
285 |    }
286 | 
287 |    # corrected 1998-12-22: these chunks are assumed to have length 1
288 |    if ( $prevTag eq "]" ) { $chunkEnd = $true; }
289 |    if ( $prevTag eq "[" ) { $chunkEnd = $true; }
290 | 
291 |    return($chunkEnd);   
292 | }
293 | 
294 | # startOfChunk: checks if a chunk started between the previous and current word
295 | # arguments:    previous and current chunk tags, previous and current types
296 | # note:         this code is capable of handling other chunk representations
297 | #               than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
298 | #               Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
299 | 
300 | sub startOfChunk {
301 |    my $prevTag = shift(@_);
302 |    my $tag = shift(@_);
303 |    my $prevType = shift(@_);
304 |    my $type = shift(@_);
305 |    my $chunkStart = $false;
306 | 
307 |    if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; }
308 |    if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; }
309 |    if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; }
310 |    if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; }
311 |    if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; }
312 |    
313 |    if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; }
314 |    if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; }
315 |    if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; }
316 |    if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; }
317 |    if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; }
318 | 
319 |    if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
320 |    if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; }
321 |    if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; }
322 | 
323 |    if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; }
324 |    if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; }
325 |    if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; }
326 | 
327 |    if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 
328 |       $chunkStart = $true; 
329 |    }
330 | 
331 |    # corrected 1998-12-22: these chunks are assumed to have length 1
332 |    if ( $tag eq "[" ) { $chunkStart = $true; }
333 |    if ( $tag eq "]" ) { $chunkStart = $true; }
334 | 
335 |    return($chunkStart);   
336 | }
337 | 


--------------------------------------------------------------------------------
/experiments/ner.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of Bi-directional LSTM-CNNs-CRF model for NER.
  3 | """
  4 | 
  5 | import os
  6 | import sys
  7 | import gc
  8 | import json
  9 | 
 10 | current_path = os.path.dirname(os.path.realpath(__file__))
 11 | root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 12 | sys.path.append(root_path)
 13 | 
 14 | import time
 15 | import argparse
 16 | 
 17 | import numpy as np
 18 | import torch
 19 | from torch.optim.adamw import AdamW
 20 | from torch.optim import SGD
 21 | from torch.nn.utils import clip_grad_norm_
 22 | from neuronlp2.io import get_logger, conll03_data, CoNLL03Writer, iterate_data
 23 | from neuronlp2.models import BiRecurrentConv, BiVarRecurrentConv, BiRecurrentConvCRF, BiVarRecurrentConvCRF
 24 | from neuronlp2.optim import ExponentialScheduler
 25 | from neuronlp2 import utils
 26 | 
 27 | 
 28 | def evaluate(output_file, scorefile):
 29 |     script = os.path.join(current_path, 'eval/conll03eval.v2')
 30 |     os.system("perl %s < %s > %s" % (script, output_file, scorefile))
 31 |     with open(scorefile, 'r') as fin:
 32 |         fin.readline()
 33 |         line = fin.readline()
 34 |         fields = line.split(";")
 35 |         acc = float(fields[0].split(":")[1].strip()[:-1])
 36 |         precision = float(fields[1].split(":")[1].strip()[:-1])
 37 |         recall = float(fields[2].split(":")[1].strip()[:-1])
 38 |         f1 = float(fields[3].split(":")[1].strip())
 39 |     return acc, precision, recall, f1
 40 | 
 41 | 
 42 | def get_optimizer(parameters, optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps):
 43 |     if optim == 'sgd':
 44 |         optimizer = SGD(parameters, lr=learning_rate, momentum=0.9, weight_decay=weight_decay, nesterov=True)
 45 |     else:
 46 |         optimizer = AdamW(parameters, lr=learning_rate, betas=(0.9, 0.999), eps=1e-8, amsgrad=amsgrad, weight_decay=weight_decay)
 47 |     init_lr = 1e-7
 48 |     scheduler = ExponentialScheduler(optimizer, lr_decay, warmup_steps, init_lr)
 49 |     return optimizer, scheduler
 50 | 
 51 | 
 52 | def eval(data, network, writer, outfile, scorefile, device):
 53 |     network.eval()
 54 |     writer.start(outfile)
 55 |     for data in iterate_data(data, 256):
 56 |         words = data['WORD'].to(device)
 57 |         chars = data['CHAR'].to(device)
 58 |         labels = data['NER'].numpy()
 59 |         masks = data['MASK'].to(device)
 60 |         postags = data['POS'].numpy()
 61 |         chunks = data['CHUNK'].numpy()
 62 |         lengths = data['LENGTH'].numpy()
 63 |         preds = network.decode(words, chars, mask=masks, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
 64 |         writer.write(words.cpu().numpy(), postags, chunks, preds.cpu().numpy(), labels, lengths)
 65 |     writer.close()
 66 |     acc, precision, recall, f1 = evaluate(outfile, scorefile)
 67 |     return acc, precision, recall, f1
 68 | 
 69 | 
 70 | def main():
 71 |     parser = argparse.ArgumentParser(description='NER with bi-directional RNN-CNN')
 72 |     parser.add_argument('--config', type=str, help='config file', required=True)
 73 |     parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
 74 |     parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
 75 |     parser.add_argument('--loss_type', choices=['sentence', 'token'], default='sentence', help='loss type (default: sentence)')
 76 |     parser.add_argument('--optim', choices=['sgd', 'adam'], help='type of optimizer', required=True)
 77 |     parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
 78 |     parser.add_argument('--lr_decay', type=float, default=0.999995, help='Decay rate of learning rate')
 79 |     parser.add_argument('--amsgrad', action='store_true', help='AMS Grad')
 80 |     parser.add_argument('--grad_clip', type=float, default=0, help='max norm for gradient clip (default 0: no clip')
 81 |     parser.add_argument('--warmup_steps', type=int, default=0, metavar='N', help='number of steps to warm up (default: 0)')
 82 |     parser.add_argument('--weight_decay', type=float, default=0.0, help='weight for l2 norm decay')
 83 |     parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
 84 |     parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True)
 85 |     parser.add_argument('--embedding_dict', help='path for embedding dict')
 86 |     parser.add_argument('--train', help='path for training file.', required=True)
 87 |     parser.add_argument('--dev', help='path for dev file.', required=True)
 88 |     parser.add_argument('--test', help='path for test file.', required=True)
 89 |     parser.add_argument('--model_path', help='path for saving model file.', required=True)
 90 | 
 91 |     args = parser.parse_args()
 92 | 
 93 |     logger = get_logger("NER")
 94 | 
 95 |     args.cuda = torch.cuda.is_available()
 96 |     device = torch.device('cuda', 0) if args.cuda else torch.device('cpu')
 97 |     train_path = args.train
 98 |     dev_path = args.dev
 99 |     test_path = args.test
100 | 
101 |     num_epochs = args.num_epochs
102 |     batch_size = args.batch_size
103 |     optim = args.optim
104 |     learning_rate = args.learning_rate
105 |     lr_decay = args.lr_decay
106 |     amsgrad = args.amsgrad
107 |     warmup_steps = args.warmup_steps
108 |     weight_decay = args.weight_decay
109 |     grad_clip = args.grad_clip
110 | 
111 |     loss_ty_token = args.loss_type == 'token'
112 |     unk_replace = args.unk_replace
113 | 
114 |     model_path = args.model_path
115 |     model_name = os.path.join(model_path, 'model.pt')
116 |     embedding = args.embedding
117 |     embedding_path = args.embedding_dict
118 | 
119 |     print(args)
120 | 
121 |     embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)
122 | 
123 |     logger.info("Creating Alphabets")
124 |     alphabet_path = os.path.join(model_path, 'alphabets')
125 |     word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = conll03_data.create_alphabets(alphabet_path, train_path,
126 |                                                                                                              data_paths=[dev_path, test_path],
127 |                                                                                                              embedd_dict=embedd_dict, max_vocabulary_size=50000)
128 | 
129 |     logger.info("Word Alphabet Size: %d" % word_alphabet.size())
130 |     logger.info("Character Alphabet Size: %d" % char_alphabet.size())
131 |     logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
132 |     logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
133 |     logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
134 | 
135 |     logger.info("Reading Data")
136 | 
137 |     data_train = conll03_data.read_bucketed_data(train_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
138 |     num_data = sum(data_train[1])
139 |     num_labels = ner_alphabet.size()
140 | 
141 |     data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
142 |     data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
143 | 
144 |     writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
145 | 
146 |     def construct_word_embedding_table():
147 |         scale = np.sqrt(3.0 / embedd_dim)
148 |         table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
149 |         table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
150 |         oov = 0
151 |         for word, index in word_alphabet.items():
152 |             if word in embedd_dict:
153 |                 embedding = embedd_dict[word]
154 |             elif word.lower() in embedd_dict:
155 |                 embedding = embedd_dict[word.lower()]
156 |             else:
157 |                 embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
158 |                 oov += 1
159 |             table[index, :] = embedding
160 |         print('oov: %d' % oov)
161 |         return torch.from_numpy(table)
162 | 
163 |     word_table = construct_word_embedding_table()
164 | 
165 |     logger.info("constructing network...")
166 | 
167 |     hyps = json.load(open(args.config, 'r'))
168 |     json.dump(hyps, open(os.path.join(model_path, 'config.json'), 'w'), indent=2)
169 |     dropout = hyps['dropout']
170 |     crf = hyps['crf']
171 |     bigram = hyps['bigram']
172 |     assert embedd_dim == hyps['embedd_dim']
173 |     char_dim = hyps['char_dim']
174 |     mode = hyps['rnn_mode']
175 |     hidden_size = hyps['hidden_size']
176 |     out_features = hyps['out_features']
177 |     num_layers = hyps['num_layers']
178 |     p_in = hyps['p_in']
179 |     p_out = hyps['p_out']
180 |     p_rnn = hyps['p_rnn']
181 |     activation = hyps['activation']
182 | 
183 |     if dropout == 'std':
184 |         if crf:
185 |             network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
186 |                                          num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation)
187 |         else:
188 |             network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
189 |                                       num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
190 |     elif dropout == 'variational':
191 |         if crf:
192 |             network = BiVarRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
193 |                                             num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation)
194 |         else:
195 |             network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
196 |                                          num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
197 |     else:
198 |         raise ValueError('Unkown dropout type: {}'.format(dropout))
199 | 
200 |     network = network.to(device)
201 | 
202 |     optimizer, scheduler = get_optimizer(network.parameters(), optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps)
203 |     model = "{}-CNN{}".format(mode, "-CRF" if crf else "")
204 |     logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" % (model, num_layers, hidden_size, activation))
205 |     logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (weight_decay, num_data, batch_size, unk_replace))
206 |     logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" % (dropout, p_in, p_out, p_rnn))
207 |     print('# of Parameters: %d' % (sum([param.numel() for param in network.parameters()])))
208 | 
209 |     best_f1 = 0.0
210 |     best_acc = 0.0
211 |     best_precision = 0.0
212 |     best_recall = 0.0
213 |     test_f1 = 0.0
214 |     test_acc = 0.0
215 |     test_precision = 0.0
216 |     test_recall = 0.0
217 |     best_epoch = 0
218 |     patient = 0
219 |     num_batches = num_data // batch_size + 1
220 |     result_path = os.path.join(model_path, 'tmp')
221 |     if not os.path.exists(result_path):
222 |         os.makedirs(result_path)
223 |     for epoch in range(1, num_epochs + 1):
224 |         start_time = time.time()
225 |         train_loss = 0.
226 |         num_insts = 0
227 |         num_words = 0
228 |         num_back = 0
229 |         network.train()
230 |         lr = scheduler.get_lr()[0]
231 |         print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' % (epoch, optim, lr, lr_decay, amsgrad, weight_decay))
232 |         if args.cuda:
233 |             torch.cuda.empty_cache()
234 |         gc.collect()
235 |         for step, data in enumerate(iterate_data(data_train, batch_size, bucketed=True, unk_replace=unk_replace, shuffle=True)):
236 |             optimizer.zero_grad()
237 |             words = data['WORD'].to(device)
238 |             chars = data['CHAR'].to(device)
239 |             labels = data['NER'].to(device)
240 |             masks = data['MASK'].to(device)
241 | 
242 |             nbatch = words.size(0)
243 |             nwords = masks.sum().item()
244 | 
245 |             loss_total = network.loss(words, chars, labels, mask=masks).sum()
246 |             if loss_ty_token:
247 |                 loss = loss_total.div(nwords)
248 |             else:
249 |                 loss = loss_total.div(nbatch)
250 |             loss.backward()
251 |             if grad_clip > 0:
252 |                 clip_grad_norm_(network.parameters(), grad_clip)
253 |             optimizer.step()
254 |             scheduler.step()
255 | 
256 |             with torch.no_grad():
257 |                 num_insts += nbatch
258 |                 num_words += nwords
259 |                 train_loss += loss_total.item()
260 | 
261 |             # update log
262 |             if step % 100 == 0:
263 |                 torch.cuda.empty_cache()
264 |                 sys.stdout.write("\b" * num_back)
265 |                 sys.stdout.write(" " * num_back)
266 |                 sys.stdout.write("\b" * num_back)
267 |                 curr_lr = scheduler.get_lr()[0]
268 |                 log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (step, num_batches, 100. * step / num_batches,
269 |                                                                            curr_lr, train_loss / num_insts, train_loss / num_words)
270 |                 sys.stdout.write(log_info)
271 |                 sys.stdout.flush()
272 |                 num_back = len(log_info)
273 | 
274 |         sys.stdout.write("\b" * num_back)
275 |         sys.stdout.write(" " * num_back)
276 |         sys.stdout.write("\b" * num_back)
277 |         print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' % (num_insts, num_words, train_loss / num_insts,
278 |                                                                   train_loss / num_words, time.time() - start_time))
279 |         print('-' * 100)
280 | 
281 |         # evaluate performance on dev data
282 |         with torch.no_grad():
283 |             outfile = os.path.join(result_path, 'pred_dev%d' % epoch)
284 |             scorefile = os.path.join(result_path, "score_dev%d" % epoch)
285 |             acc, precision, recall, f1 = eval(data_dev, network, writer, outfile, scorefile, device)
286 |             print('Dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1))
287 |             if best_f1 < f1:
288 |                 torch.save(network.state_dict(), model_name)
289 |                 best_f1 = f1
290 |                 best_acc = acc
291 |                 best_precision = precision
292 |                 best_recall = recall
293 |                 best_epoch = epoch
294 | 
295 |                 # evaluate on test data when better performance detected
296 |                 outfile = os.path.join(result_path, 'pred_test%d' % epoch)
297 |                 scorefile = os.path.join(result_path, "score_test%d" % epoch)
298 |                 test_acc, test_precision, test_recall, test_f1 = eval(data_test, network, writer, outfile, scorefile, device)
299 |                 print('test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (test_acc, test_precision, test_recall, test_f1))
300 |                 patient = 0
301 |             else:
302 |                 patient += 1
303 |             print('-' * 100)
304 | 
305 |             print("Best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (best_acc, best_precision, best_recall, best_f1, best_epoch, patient))
306 |             print("Best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (test_acc, test_precision, test_recall, test_f1, best_epoch, patient))
307 |             print('=' * 100)
308 | 
309 |         if patient > 4:
310 |             logger.info('reset optimizer momentums')
311 |             scheduler.reset_state()
312 |             patient = 0
313 | 
314 | 
315 | if __name__ == '__main__':
316 |     main()
317 | 


--------------------------------------------------------------------------------
/experiments/pos_tagging.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of Bi-directional LSTM-CNNs-CRF model for POS Tagging.
  3 | """
  4 | 
  5 | import os
  6 | import sys
  7 | import gc
  8 | import json
  9 | 
 10 | current_path = os.path.dirname(os.path.realpath(__file__))
 11 | root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 12 | sys.path.append(root_path)
 13 | 
 14 | import time
 15 | import argparse
 16 | 
 17 | import numpy as np
 18 | import torch
 19 | from torch.optim.adamw import AdamW
 20 | from torch.optim import SGD
 21 | from torch.nn.utils import clip_grad_norm_
 22 | from neuronlp2.io import get_logger, conllx_data, iterate_data, POSWriter
 23 | from neuronlp2.models import BiRecurrentConv, BiVarRecurrentConv, BiRecurrentConvCRF, BiVarRecurrentConvCRF
 24 | from neuronlp2.optim import ExponentialScheduler
 25 | from neuronlp2 import utils
 26 | 
 27 | 
 28 | def get_optimizer(parameters, optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps):
 29 |     if optim == 'sgd':
 30 |         optimizer = SGD(parameters, lr=learning_rate, momentum=0.9, weight_decay=weight_decay, nesterov=True)
 31 |     else:
 32 |         optimizer = AdamW(parameters, lr=learning_rate, betas=(0.9, 0.999), eps=1e-8, amsgrad=amsgrad, weight_decay=weight_decay)
 33 |     init_lr = 1e-7
 34 |     scheduler = ExponentialScheduler(optimizer, lr_decay, warmup_steps, init_lr)
 35 |     return optimizer, scheduler
 36 | 
 37 | 
 38 | def eval(data, network, writer, outfile, device):
 39 |     network.eval()
 40 |     corr = 0
 41 |     total = 0
 42 |     writer.start(outfile)
 43 |     for data in iterate_data(data, 256):
 44 |         words = data['WORD'].to(device)
 45 |         chars = data['CHAR'].to(device)
 46 |         masks = data['MASK'].to(device)
 47 |         postags = data['POS'].to(device)
 48 |         lengths = data['LENGTH'].numpy()
 49 |         preds = network.decode(words, chars, mask=masks, leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
 50 |         corr += torch.eq(preds, postags).float().mul(masks).sum().item()
 51 |         total += masks.sum().item()
 52 |         writer.write(words.cpu().numpy(), preds.cpu().numpy(), postags.cpu().numpy(), lengths)
 53 |     writer.close()
 54 |     return corr, total
 55 | 
 56 | 
 57 | def main():
 58 |     parser = argparse.ArgumentParser(description='NER with bi-directional RNN-CNN')
 59 |     parser.add_argument('--config', type=str, help='config file', required=True)
 60 |     parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
 61 |     parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
 62 |     parser.add_argument('--loss_type', choices=['sentence', 'token'], default='sentence', help='loss type (default: sentence)')
 63 |     parser.add_argument('--optim', choices=['sgd', 'adam'], help='type of optimizer', required=True)
 64 |     parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
 65 |     parser.add_argument('--lr_decay', type=float, default=0.999995, help='Decay rate of learning rate')
 66 |     parser.add_argument('--amsgrad', action='store_true', help='AMS Grad')
 67 |     parser.add_argument('--grad_clip', type=float, default=0, help='max norm for gradient clip (default 0: no clip')
 68 |     parser.add_argument('--warmup_steps', type=int, default=0, metavar='N', help='number of steps to warm up (default: 0)')
 69 |     parser.add_argument('--weight_decay', type=float, default=0.0, help='weight for l2 norm decay')
 70 |     parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
 71 |     parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True)
 72 |     parser.add_argument('--embedding_dict', help='path for embedding dict')
 73 |     parser.add_argument('--train', help='path for training file.', required=True)
 74 |     parser.add_argument('--dev', help='path for dev file.', required=True)
 75 |     parser.add_argument('--test', help='path for test file.', required=True)
 76 |     parser.add_argument('--model_path', help='path for saving model file.', required=True)
 77 | 
 78 |     args = parser.parse_args()
 79 | 
 80 |     logger = get_logger("POS")
 81 | 
 82 |     args.cuda = torch.cuda.is_available()
 83 |     device = torch.device('cuda', 0) if args.cuda else torch.device('cpu')
 84 |     train_path = args.train
 85 |     dev_path = args.dev
 86 |     test_path = args.test
 87 | 
 88 |     num_epochs = args.num_epochs
 89 |     batch_size = args.batch_size
 90 |     optim = args.optim
 91 |     learning_rate = args.learning_rate
 92 |     lr_decay = args.lr_decay
 93 |     amsgrad = args.amsgrad
 94 |     warmup_steps = args.warmup_steps
 95 |     weight_decay = args.weight_decay
 96 |     grad_clip = args.grad_clip
 97 | 
 98 |     loss_ty_token = args.loss_type == 'token'
 99 |     unk_replace = args.unk_replace
100 | 
101 |     model_path = args.model_path
102 |     model_name = os.path.join(model_path, 'model.pt')
103 |     embedding = args.embedding
104 |     embedding_path = args.embedding_dict
105 | 
106 |     print(args)
107 | 
108 |     embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)
109 | 
110 |     logger.info("Creating Alphabets")
111 |     alphabet_path = os.path.join(model_path, 'alphabets')
112 |     word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_data.create_alphabets(alphabet_path, train_path,
113 |                                                                                              data_paths=[dev_path, test_path],
114 |                                                                                              embedd_dict=embedd_dict, max_vocabulary_size=50000)
115 | 
116 |     logger.info("Word Alphabet Size: %d" % word_alphabet.size())
117 |     logger.info("Character Alphabet Size: %d" % char_alphabet.size())
118 |     logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
119 | 
120 |     logger.info("Reading Data")
121 | 
122 |     data_train = conllx_data.read_bucketed_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
123 |     num_data = sum(data_train[1])
124 |     num_labels = pos_alphabet.size()
125 | 
126 |     data_dev = conllx_data.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
127 |     data_test = conllx_data.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
128 | 
129 |     writer = POSWriter(word_alphabet, char_alphabet, pos_alphabet)
130 | 
131 |     def construct_word_embedding_table():
132 |         scale = np.sqrt(3.0 / embedd_dim)
133 |         table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
134 |         table[conllx_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
135 |         oov = 0
136 |         for word, index in word_alphabet.items():
137 |             if word in embedd_dict:
138 |                 embedding = embedd_dict[word]
139 |             elif word.lower() in embedd_dict:
140 |                 embedding = embedd_dict[word.lower()]
141 |             else:
142 |                 embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
143 |                 oov += 1
144 |             table[index, :] = embedding
145 |         print('oov: %d' % oov)
146 |         return torch.from_numpy(table)
147 | 
148 |     word_table = construct_word_embedding_table()
149 | 
150 |     logger.info("constructing network...")
151 | 
152 |     hyps = json.load(open(args.config, 'r'))
153 |     json.dump(hyps, open(os.path.join(model_path, 'config.json'), 'w'), indent=2)
154 |     dropout = hyps['dropout']
155 |     crf = hyps['crf']
156 |     bigram = hyps['bigram']
157 |     assert embedd_dim == hyps['embedd_dim']
158 |     char_dim = hyps['char_dim']
159 |     mode = hyps['rnn_mode']
160 |     hidden_size = hyps['hidden_size']
161 |     out_features = hyps['out_features']
162 |     num_layers = hyps['num_layers']
163 |     p_in = hyps['p_in']
164 |     p_out = hyps['p_out']
165 |     p_rnn = hyps['p_rnn']
166 |     activation = hyps['activation']
167 | 
168 |     if dropout == 'std':
169 |         if crf:
170 |             network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
171 |                                          num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation)
172 |         else:
173 |             network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
174 |                                       num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
175 |     elif dropout == 'variational':
176 |         if crf:
177 |             network = BiVarRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
178 |                                             num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation)
179 |         else:
180 |             network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers,
181 |                                          num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
182 |     else:
183 |         raise ValueError('Unkown dropout type: {}'.format(dropout))
184 | 
185 |     network = network.to(device)
186 | 
187 |     optimizer, scheduler = get_optimizer(network.parameters(), optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps)
188 |     model = "{}-CNN{}".format(mode, "-CRF" if crf else "")
189 |     logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" % (model, num_layers, hidden_size, activation))
190 |     logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (weight_decay, num_data, batch_size, unk_replace))
191 |     logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" % (dropout, p_in, p_out, p_rnn))
192 |     print('# of Parameters: %d' % (sum([param.numel() for param in network.parameters()])))
193 | 
194 |     best_corr = 0.0
195 |     best_total = 0.0
196 |     test_corr = 0.0
197 |     test_total = 0.0
198 |     best_epoch = 0
199 |     patient = 0
200 |     num_batches = num_data // batch_size + 1
201 |     result_path = os.path.join(model_path, 'tmp')
202 |     if not os.path.exists(result_path):
203 |         os.makedirs(result_path)
204 |     for epoch in range(1, num_epochs + 1):
205 |         start_time = time.time()
206 |         train_loss = 0.
207 |         num_insts = 0
208 |         num_words = 0
209 |         num_back = 0
210 |         network.train()
211 |         lr = scheduler.get_lr()[0]
212 |         print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' % (epoch, optim, lr, lr_decay, amsgrad, weight_decay))
213 |         if args.cuda:
214 |             torch.cuda.empty_cache()
215 |         gc.collect()
216 |         for step, data in enumerate(iterate_data(data_train, batch_size, bucketed=True, unk_replace=unk_replace, shuffle=True)):
217 |             optimizer.zero_grad()
218 |             words = data['WORD'].to(device)
219 |             chars = data['CHAR'].to(device)
220 |             labels = data['POS'].to(device)
221 |             masks = data['MASK'].to(device)
222 | 
223 |             nbatch = words.size(0)
224 |             nwords = masks.sum().item()
225 | 
226 |             loss_total = network.loss(words, chars, labels, mask=masks).sum()
227 |             if loss_ty_token:
228 |                 loss = loss_total.div(nwords)
229 |             else:
230 |                 loss = loss_total.div(nbatch)
231 |             loss.backward()
232 |             if grad_clip > 0:
233 |                 clip_grad_norm_(network.parameters(), grad_clip)
234 |             optimizer.step()
235 |             scheduler.step()
236 | 
237 |             with torch.no_grad():
238 |                 num_insts += nbatch
239 |                 num_words += nwords
240 |                 train_loss += loss_total.item()
241 | 
242 |             # update log
243 |             if step % 100 == 0:
244 |                 torch.cuda.empty_cache()
245 |                 sys.stdout.write("\b" * num_back)
246 |                 sys.stdout.write(" " * num_back)
247 |                 sys.stdout.write("\b" * num_back)
248 |                 curr_lr = scheduler.get_lr()[0]
249 |                 log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (step, num_batches, 100. * step / num_batches,
250 |                                                                            curr_lr, train_loss / num_insts, train_loss / num_words)
251 |                 sys.stdout.write(log_info)
252 |                 sys.stdout.flush()
253 |                 num_back = len(log_info)
254 | 
255 |         sys.stdout.write("\b" * num_back)
256 |         sys.stdout.write(" " * num_back)
257 |         sys.stdout.write("\b" * num_back)
258 |         print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' % (num_insts, num_words, train_loss / num_insts,
259 |                                                                   train_loss / num_words, time.time() - start_time))
260 |         print('-' * 100)
261 | 
262 |         # evaluate performance on dev data
263 |         with torch.no_grad():
264 |             outfile = os.path.join(result_path, 'pred_dev%d' % epoch)
265 |             dev_corr, dev_total = eval(data_dev, network, writer, outfile, device)
266 |             print('Dev  corr: %d, total: %d, acc: %.2f%%' % (dev_corr, dev_total, dev_corr * 100 / dev_total))
267 |             if best_corr < dev_corr:
268 |                 torch.save(network.state_dict(), model_name)
269 |                 best_corr = dev_corr
270 |                 best_total = dev_total
271 |                 best_epoch = epoch
272 | 
273 |                 # evaluate on test data when better performance detected
274 |                 outfile = os.path.join(result_path, 'pred_test%d' % epoch)
275 |                 test_corr, test_total = eval(data_test, network, writer, outfile, device)
276 |                 print('test corr: %d, total: %d, acc: %.2f%%' % (test_corr, test_total, test_corr * 100 / test_total))
277 |                 patient = 0
278 |             else:
279 |                 patient += 1
280 |             print('-' * 100)
281 | 
282 |             print("Best dev  corr: %d, total: %d, acc: %.2f%% (epoch: %d (%d))" % (best_corr, best_total, best_corr * 100 / best_total, best_epoch, patient))
283 |             print("Best test corr: %d, total: %d, acc: %.2f%% (epoch: %d (%d))" % (test_corr, test_total, test_corr * 100 / test_total, best_epoch, patient))
284 |             print('=' * 100)
285 | 
286 |         if patient > 4:
287 |             logger.info('reset optimizer momentums')
288 |             scheduler.reset_state()
289 |             patient = 0
290 | 
291 | 
292 | if __name__ == '__main__':
293 |     main()
294 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_analyze.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | CUDA_VISIBLE_DEVICES=1 python examples/analyze.py --parser stackptr --beam 5 --ordered --gpu \
3 |  --punctuation '.' '``' "''" ':' ',' \
4 |  --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \
5 |  --model_path "models/parsing/stack_ptr/" --model_name 'network.pt'
6 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_deepbiaf.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/biaffine.json --num_epochs 400 --batch_size 32 \
 3 |  --opt adam --learning_rate 0.001 --lr_decay 0.999995 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \
 4 |  --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 \
 5 |  --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \
 6 |  --punctuation '.' '``' "''" ':' ',' \
 7 |  --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \
 8 |  --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \
 9 |  --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \
10 |  --model_path "models/parsing/deepbiaf/"
11 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_ner_conll03.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python ner.py --config configs/ner/conll03.json --num_epochs 400 --batch_size 16 \
3 |  --loss_type sentence --optim sgd --learning_rate 0.01 --lr_decay 0.99999 --grad_clip 0.0 --warmup_steps 100 --weight_decay 0.0 --unk_replace 0.0 \
4 |  --embedding glove --embedding_dict "data/glove/glove.6B/glove.6B.100d.gz" --model_path "models/ner/conll03" \
5 |  --train "data/conll2003/english/eng.train.bioes.conll" --dev "data/conll2003/english/eng.dev.bioes.conll" --test "data/conll2003/english/eng.test.bioes.conll"
6 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_neuromst.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/neuromst.json --num_epochs 400 --batch_size 32 \
 3 |  --opt adam --learning_rate 0.001 --lr_decay 0.999995 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \
 4 |  --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 \
 5 |  --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \
 6 |  --punctuation '.' '``' "''" ':' ',' \
 7 |  --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \
 8 |  --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \
 9 |  --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \
10 |  --model_path "models/parsing/neuromst/"
11 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_pos_wsj.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python pos_tagging.py --config configs/pos/wsj.json --num_epochs 400 --batch_size 32 \
3 |  --loss_type sentence --optim sgd --learning_rate 0.01 --lr_decay 0.99999 --grad_clip 0.0 --warmup_steps 10 --weight_decay 0.0 --unk_replace 0.0 \
4 |  --embedding glove --embedding_dict "data/glove/glove.6B/glove.6B.100d.gz" --model_path "models/pos/wsj" \
5 |  --train "data/POS-penn/wsj/split1/wsj1.train.original" --dev "data/POS-penn/wsj/split1/wsj1.dev.original" --test "data/POS-penn/wsj/split1/wsj1.test.original" 
6 | 


--------------------------------------------------------------------------------
/experiments/scripts/run_stackptr.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/stackptr.json --num_epochs 600 --batch_size 32 \
 3 |  --opt adam --learning_rate 0.001 --lr_decay 0.999997 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \
 4 |  --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 --beam 10 \
 5 |  --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \
 6 |  --punctuation '.' '``' "''" ':' ',' \
 7 |  --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \
 8 |  --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \
 9 |  --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \
10 |  --model_path "models/parsing/stackptr/"
11 | 


--------------------------------------------------------------------------------
/neuronlp2/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | __version__ = "0.2.dev1"
4 | 


--------------------------------------------------------------------------------
/neuronlp2/io/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.io.alphabet import Alphabet
4 | from neuronlp2.io.instance import *
5 | from neuronlp2.io.logger import get_logger
6 | from neuronlp2.io.writer import CoNLL03Writer, CoNLLXWriter, POSWriter
7 | from neuronlp2.io.utils import get_batch, get_bucketed_batch, iterate_data
8 | from neuronlp2.io import conllx_data, conll03_data, conllx_stacked_data
9 | 


--------------------------------------------------------------------------------
/neuronlp2/io/alphabet.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | """
  4 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects.
  5 | """
  6 | import json
  7 | import os
  8 | from neuronlp2.io.logger import get_logger
  9 | 
 10 | class Alphabet(object):
 11 |     def __init__(self, name, defualt_value=False, keep_growing=True, singleton=False):
 12 |         self.__name = name
 13 | 
 14 |         self.instance2index = {}
 15 |         self.instances = []
 16 |         self.default_value = defualt_value
 17 |         self.offset = 1 if self.default_value else 0
 18 |         self.keep_growing = keep_growing
 19 |         self.singletons = set() if singleton else None
 20 | 
 21 |         # Index 0 is occupied by default, all else following.
 22 |         self.default_index = 0 if self.default_value else None
 23 | 
 24 |         self.next_index = self.offset
 25 | 
 26 |         self.logger = get_logger('Alphabet')
 27 | 
 28 |     def add(self, instance):
 29 |         if instance not in self.instance2index:
 30 |             self.instances.append(instance)
 31 |             self.instance2index[instance] = self.next_index
 32 |             self.next_index += 1
 33 | 
 34 |     def add_singleton(self, id):
 35 |         if self.singletons is None:
 36 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 37 |         else:
 38 |             self.singletons.add(id)
 39 | 
 40 |     def add_singletons(self, ids):
 41 |         if self.singletons is None:
 42 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 43 |         else:
 44 |             self.singletons.update(ids)
 45 | 
 46 |     def is_singleton(self, id):
 47 |         if self.singletons is None:
 48 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 49 |         else:
 50 |             return id in self.singletons
 51 | 
 52 |     def get_index(self, instance):
 53 |         try:
 54 |             return self.instance2index[instance]
 55 |         except KeyError:
 56 |             if self.keep_growing:
 57 |                 index = self.next_index
 58 |                 self.add(instance)
 59 |                 return index
 60 |             else:
 61 |                 if self.default_value:
 62 |                     return self.default_index
 63 |                 else:
 64 |                     raise KeyError("instance not found: %s" % instance)
 65 | 
 66 |     def get_instance(self, index):
 67 |         if self.default_value and index == self.default_index:
 68 |             # First index is occupied by the wildcard element.
 69 |             return '<_UNK>'
 70 |         else:
 71 |             try:
 72 |                 return self.instances[index - self.offset]
 73 |             except IndexError:
 74 |                 raise IndexError('unknown index: %d' % index)
 75 | 
 76 |     def size(self):
 77 |         return len(self.instances) + self.offset
 78 | 
 79 |     def singleton_size(self):
 80 |         return len(self.singletons)
 81 | 
 82 |     def items(self):
 83 |         return self.instance2index.items()
 84 | 
 85 |     def enumerate_items(self, start):
 86 |         if start < self.offset or start >= self.size():
 87 |             raise IndexError("Enumerate is allowed between [%d : size of the alphabet)" % self.offset)
 88 |         return zip(range(start, len(self.instances) + self.offset), self.instances[start - self.offset:])
 89 | 
 90 |     def close(self):
 91 |         self.keep_growing = False
 92 | 
 93 |     def open(self):
 94 |         self.keep_growing = True
 95 | 
 96 |     def get_content(self):
 97 |         if self.singletons is None:
 98 |             return {'instance2index': self.instance2index, 'instances': self.instances}
 99 |         else:
100 |             return {'instance2index': self.instance2index, 'instances': self.instances,
101 |                     'singletions': list(self.singletons)}
102 | 
103 |     def __from_json(self, data):
104 |         self.instances = data["instances"]
105 |         self.instance2index = data["instance2index"]
106 |         if 'singletions' in data:
107 |             self.singletons = set(data['singletions'])
108 |         else:
109 |             self.singletons = None
110 | 
111 |     def save(self, output_directory, name=None):
112 |         """
113 |         Save both alhpabet records to the given directory.
114 |         :param output_directory: Directory to save model and weights.
115 |         :param name: The alphabet saving name, optional.
116 |         :return:
117 |         """
118 |         saving_name = name if name else self.__name
119 |         try:
120 |             if not os.path.exists(output_directory):
121 |                 os.makedirs(output_directory)
122 | 
123 |             json.dump(self.get_content(),
124 |                       open(os.path.join(output_directory, saving_name + ".json"), 'w'), indent=4)
125 |         except Exception as e:
126 |             self.logger.warn("Alphabet is not saved: %s" % repr(e))
127 | 
128 |     def load(self, input_directory, name=None):
129 |         """
130 |         Load model architecture and weights from the give directory. This allow we use old models even the structure
131 |         changes.
132 |         :param input_directory: Directory to save model and weights
133 |         :return:
134 |         """
135 |         loading_name = name if name else self.__name
136 |         self.__from_json(json.load(open(os.path.join(input_directory, loading_name + ".json"))))
137 |         self.next_index = len(self.instances) + self.offset
138 |         self.keep_growing = False
139 | 


--------------------------------------------------------------------------------
/neuronlp2/io/common.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | import re
 4 | 
 5 | MAX_CHAR_LENGTH = 45
 6 | 
 7 | # Regular expressions used to normalize digits.
 8 | DIGIT_RE = re.compile(r"\d")
 9 | 
10 | PAD = "_PAD"
11 | PAD_POS = "_PAD_POS"
12 | PAD_TYPE = "_<PAD>"
13 | PAD_CHAR = "_PAD_CHAR"
14 | PAD_CHUNK = "_PAD_CHUNK"
15 | PAD_NER = "_PAD_NER"
16 | 
17 | ROOT = "_ROOT"
18 | ROOT_POS = "_ROOT_POS"
19 | ROOT_TYPE = "_<ROOT>"
20 | ROOT_CHAR = "_ROOT_CHAR"
21 | 
22 | END = "_END"
23 | END_POS = "_END_POS"
24 | END_TYPE = "_<END>"
25 | END_CHAR = "_END_CHAR"
26 | 
27 | UNK_ID = 0
28 | PAD_ID_WORD = 1
29 | PAD_ID_CHAR = 1
30 | PAD_ID_TAG = 0
31 | 


--------------------------------------------------------------------------------
/neuronlp2/io/conll03_data.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import os.path
  4 | import numpy as np
  5 | from collections import defaultdict, OrderedDict
  6 | import torch
  7 | 
  8 | from neuronlp2.io.reader import CoNLL03Reader
  9 | from neuronlp2.io.alphabet import Alphabet
 10 | from neuronlp2.io.logger import get_logger
 11 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID
 12 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_CHUNK, PAD_POS, PAD_NER, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD
 13 | 
 14 | # Special vocabulary symbols - we always put them at the start.
 15 | _START_VOCAB = [PAD,]
 16 | NUM_SYMBOLIC_TAGS = 1
 17 | 
 18 | _buckets = [5, 10, 15, 20, 25, 30, 40, 50, 140]
 19 | 
 20 | 
 21 | def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
 22 |                      min_occurrence=1, normalize_digits=True):
 23 | 
 24 |     def expand_vocab():
 25 |         vocab_set = set(vocab_list)
 26 |         for data_path in data_paths:
 27 |             # logger.info("Processing data: %s" % data_path)
 28 |             with open(data_path, 'r') as file:
 29 |                 for line in file:
 30 |                     line = line.strip()
 31 |                     if len(line) == 0:
 32 |                         continue
 33 | 
 34 |                     tokens = line.split(' ')
 35 |                     word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
 36 |                     pos = tokens[2]
 37 |                     chunk = tokens[3]
 38 |                     ner = tokens[4]
 39 | 
 40 |                     pos_alphabet.add(pos)
 41 |                     chunk_alphabet.add(chunk)
 42 |                     ner_alphabet.add(ner)
 43 | 
 44 |                     if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
 45 |                         vocab_set.add(word)
 46 |                         vocab_list.append(word)
 47 | 
 48 |     logger = get_logger("Create Alphabets")
 49 |     word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
 50 |     char_alphabet = Alphabet('character', defualt_value=True)
 51 |     pos_alphabet = Alphabet('pos')
 52 |     chunk_alphabet = Alphabet('chunk')
 53 |     ner_alphabet = Alphabet('ner')
 54 | 
 55 |     if not os.path.isdir(alphabet_directory):
 56 |         logger.info("Creating Alphabets: %s" % alphabet_directory)
 57 | 
 58 |         char_alphabet.add(PAD_CHAR)
 59 |         pos_alphabet.add(PAD_POS)
 60 |         chunk_alphabet.add(PAD_CHUNK)
 61 |         ner_alphabet.add(PAD_NER)
 62 | 
 63 |         vocab = defaultdict(int)
 64 |         with open(train_path, 'r') as file:
 65 |             for line in file:
 66 |                 line = line.strip()
 67 |                 if len(line) == 0:
 68 |                     continue
 69 | 
 70 |                 tokens = line.split(' ')
 71 |                 for char in tokens[1]:
 72 |                     char_alphabet.add(char)
 73 | 
 74 |                 word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
 75 |                 vocab[word] += 1
 76 | 
 77 |                 pos = tokens[2]
 78 |                 pos_alphabet.add(pos)
 79 | 
 80 |                 chunk = tokens[3]
 81 |                 chunk_alphabet.add(chunk)
 82 | 
 83 |                 ner = tokens[4]
 84 |                 ner_alphabet.add(ner)
 85 | 
 86 |         # collect singletons
 87 |         singletons = set([word for word, count in vocab.items() if count <= min_occurrence])
 88 | 
 89 |         # if a singleton is in pretrained embedding dict, set the count to min_occur + c
 90 |         if embedd_dict is not None:
 91 |             assert isinstance(embedd_dict, OrderedDict)
 92 |             for word in vocab.keys():
 93 |                 if word in embedd_dict or word.lower() in embedd_dict:
 94 |                     vocab[word] += min_occurrence
 95 | 
 96 |         vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
 97 |         logger.info("Total Vocabulary Size: %d" % len(vocab_list))
 98 |         logger.info("Total Singleton Size:  %d" % len(singletons))
 99 |         vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
100 |         logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))
101 | 
102 |         if len(vocab_list) > max_vocabulary_size:
103 |             vocab_list = vocab_list[:max_vocabulary_size]
104 | 
105 |         if data_paths is not None and embedd_dict is not None:
106 |             expand_vocab()
107 | 
108 |         for word in vocab_list:
109 |             word_alphabet.add(word)
110 |             if word in singletons:
111 |                 word_alphabet.add_singleton(word_alphabet.get_index(word))
112 | 
113 |         word_alphabet.save(alphabet_directory)
114 |         char_alphabet.save(alphabet_directory)
115 |         pos_alphabet.save(alphabet_directory)
116 |         chunk_alphabet.save(alphabet_directory)
117 |         ner_alphabet.save(alphabet_directory)
118 |     else:
119 |         word_alphabet.load(alphabet_directory)
120 |         char_alphabet.load(alphabet_directory)
121 |         pos_alphabet.load(alphabet_directory)
122 |         chunk_alphabet.load(alphabet_directory)
123 |         ner_alphabet.load(alphabet_directory)
124 | 
125 |     word_alphabet.close()
126 |     char_alphabet.close()
127 |     pos_alphabet.close()
128 |     chunk_alphabet.close()
129 |     ner_alphabet.close()
130 |     logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
131 |     logger.info("Character Alphabet Size: %d" % char_alphabet.size())
132 |     logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
133 |     logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
134 |     logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
135 |     return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet
136 | 
137 | 
138 | def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet,
139 |               pos_alphabet: Alphabet, chunk_alphabet: Alphabet, ner_alphabet: Alphabet,
140 |               max_size=None, normalize_digits=True):
141 |     data = []
142 |     max_length = 0
143 |     max_char_length = 0
144 |     print('Reading data from %s' % source_path)
145 |     counter = 0
146 |     reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
147 |     inst = reader.getNext(normalize_digits)
148 |     while inst is not None and (not max_size or counter < max_size):
149 |         counter += 1
150 |         if counter % 10000 == 0:
151 |             print("reading data: %d" % counter)
152 | 
153 |         sent = inst.sentence
154 |         data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids])
155 |         max_len = max([len(char_seq) for char_seq in sent.char_seqs])
156 |         if max_char_length < max_len:
157 |             max_char_length = max_len
158 |         if max_length < inst.length():
159 |             max_length = inst.length()
160 |         inst = reader.getNext(normalize_digits)
161 |     reader.close()
162 |     print("Total number of data: %d" % counter)
163 | 
164 |     data_size = len(data)
165 |     char_length = min(MAX_CHAR_LENGTH, max_char_length)
166 |     wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
167 |     cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64)
168 |     pid_inputs = np.empty([data_size, max_length], dtype=np.int64)
169 |     chid_inputs = np.empty([data_size, max_length], dtype=np.int64)
170 |     nid_inputs = np.empty([data_size, max_length], dtype=np.int64)
171 | 
172 |     masks = np.zeros([data_size, max_length], dtype=np.float32)
173 |     single = np.zeros([data_size, max_length], dtype=np.int64)
174 |     lengths = np.empty(data_size, dtype=np.int64)
175 | 
176 |     for i, inst in enumerate(data):
177 |         wids, cid_seqs, pids, chids, nids = inst
178 |         inst_size = len(wids)
179 |         lengths[i] = inst_size
180 |         # word ids
181 |         wid_inputs[i, :inst_size] = wids
182 |         wid_inputs[i, inst_size:] = PAD_ID_WORD
183 |         for c, cids in enumerate(cid_seqs):
184 |             cid_inputs[i, c, :len(cids)] = cids
185 |             cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
186 |         cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
187 |         # pos ids
188 |         pid_inputs[i, :inst_size] = pids
189 |         pid_inputs[i, inst_size:] = PAD_ID_TAG
190 |         # chunk ids
191 |         chid_inputs[i, :inst_size] = chids
192 |         chid_inputs[i, inst_size:] = PAD_ID_TAG
193 |         # ner ids
194 |         nid_inputs[i, :inst_size] = nids
195 |         nid_inputs[i, inst_size:] = PAD_ID_TAG
196 |         # masks
197 |         masks[i, :inst_size] = 1.0
198 |         for j, wid in enumerate(wids):
199 |             if word_alphabet.is_singleton(wid):
200 |                 single[i, j] = 1
201 | 
202 |     words = torch.from_numpy(wid_inputs)
203 |     chars = torch.from_numpy(cid_inputs)
204 |     pos = torch.from_numpy(pid_inputs)
205 |     chunks = torch.from_numpy(chid_inputs)
206 |     ners = torch.from_numpy(nid_inputs)
207 |     masks = torch.from_numpy(masks)
208 |     single = torch.from_numpy(single)
209 |     lengths = torch.from_numpy(lengths)
210 | 
211 |     data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'CHUNK': chunks,
212 |                    'NER': ners, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
213 |     return data_tensor, data_size
214 | 
215 | 
216 | def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet,
217 |                        pos_alphabet: Alphabet, chunk_alphabet: Alphabet, ner_alphabet: Alphabet,
218 |                        max_size=None, normalize_digits=True):
219 |     data = [[] for _ in _buckets]
220 |     max_char_length = [0 for _ in _buckets]
221 |     print('Reading data from %s' % source_path)
222 |     counter = 0
223 |     reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
224 |     inst = reader.getNext(normalize_digits)
225 |     while inst is not None and (not max_size or counter < max_size):
226 |         counter += 1
227 |         if counter % 10000 == 0:
228 |             print("reading data: %d" % counter)
229 | 
230 |         inst_size = inst.length()
231 |         sent = inst.sentence
232 |         for bucket_id, bucket_size in enumerate(_buckets):
233 |             if inst_size < bucket_size:
234 |                 data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids])
235 |                 max_len = max([len(char_seq) for char_seq in sent.char_seqs])
236 |                 if max_char_length[bucket_id] < max_len:
237 |                     max_char_length[bucket_id] = max_len
238 |                 break
239 | 
240 |         inst = reader.getNext(normalize_digits)
241 |     reader.close()
242 |     print("Total number of data: %d" % counter)
243 | 
244 |     bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
245 |     data_tensors = []
246 |     for bucket_id in range(len(_buckets)):
247 |         bucket_size = bucket_sizes[bucket_id]
248 |         if bucket_size == 0:
249 |             data_tensors.append((1, 1))
250 |             continue
251 | 
252 |         bucket_length = _buckets[bucket_id]
253 |         char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id])
254 |         wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
255 |         cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
256 |         pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
257 |         chid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
258 |         nid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
259 | 
260 |         masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)
261 |         single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
262 |         lengths = np.empty(bucket_size, dtype=np.int64)
263 | 
264 |         for i, inst in enumerate(data[bucket_id]):
265 |             wids, cid_seqs, pids, chids, nids = inst
266 |             inst_size = len(wids)
267 |             lengths[i] = inst_size
268 |             # word ids
269 |             wid_inputs[i, :inst_size] = wids
270 |             wid_inputs[i, inst_size:] = PAD_ID_WORD
271 |             for c, cids in enumerate(cid_seqs):
272 |                 cid_inputs[i, c, :len(cids)] = cids
273 |                 cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
274 |             cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
275 |             # pos ids
276 |             pid_inputs[i, :inst_size] = pids
277 |             pid_inputs[i, inst_size:] = PAD_ID_TAG
278 |             # chunk ids
279 |             chid_inputs[i, :inst_size] = chids
280 |             chid_inputs[i, inst_size:] = PAD_ID_TAG
281 |             # ner ids
282 |             nid_inputs[i, :inst_size] = nids
283 |             nid_inputs[i, inst_size:] = PAD_ID_TAG
284 |             # masks
285 |             masks[i, :inst_size] = 1.0
286 |             for j, wid in enumerate(wids):
287 |                 if word_alphabet.is_singleton(wid):
288 |                     single[i, j] = 1
289 | 
290 |         words = torch.from_numpy(wid_inputs)
291 |         chars = torch.from_numpy(cid_inputs)
292 |         pos = torch.from_numpy(pid_inputs)
293 |         chunks = torch.from_numpy(chid_inputs)
294 |         ners = torch.from_numpy(nid_inputs)
295 |         masks = torch.from_numpy(masks)
296 |         single = torch.from_numpy(single)
297 |         lengths = torch.from_numpy(lengths)
298 | 
299 |         data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'CHUNK': chunks,
300 |                        'NER': ners, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
301 |         data_tensors.append(data_tensor)
302 |     return data_tensors, bucket_sizes
303 | 


--------------------------------------------------------------------------------
/neuronlp2/io/conllx_data.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import os.path
  4 | import numpy as np
  5 | from collections import defaultdict, OrderedDict
  6 | import torch
  7 | 
  8 | from neuronlp2.io.reader import CoNLLXReader
  9 | from neuronlp2.io.alphabet import Alphabet
 10 | from neuronlp2.io.logger import get_logger
 11 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID
 12 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_POS, PAD_TYPE, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD
 13 | from neuronlp2.io.common import ROOT, END, ROOT_CHAR, ROOT_POS, ROOT_TYPE, END_CHAR, END_POS, END_TYPE
 14 | 
 15 | # Special vocabulary symbols - we always put them at the start.
 16 | _START_VOCAB = [PAD, ROOT, END]
 17 | NUM_SYMBOLIC_TAGS = 3
 18 | 
 19 | _buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 140]
 20 | 
 21 | 
 22 | def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
 23 |                      min_occurrence=1, normalize_digits=True):
 24 | 
 25 |     def expand_vocab():
 26 |         vocab_set = set(vocab_list)
 27 |         for data_path in data_paths:
 28 |             # logger.info("Processing data: %s" % data_path)
 29 |             with open(data_path, 'r') as file:
 30 |                 for line in file:
 31 |                     line = line.strip()
 32 |                     if len(line) == 0:
 33 |                         continue
 34 | 
 35 |                     tokens = line.split('\t')
 36 |                     for char in tokens[1]:
 37 |                         char_alphabet.add(char)
 38 | 
 39 |                     word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
 40 |                     pos = tokens[4]
 41 |                     type = tokens[7]
 42 | 
 43 |                     pos_alphabet.add(pos)
 44 |                     type_alphabet.add(type)
 45 | 
 46 |                     if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
 47 |                         vocab_set.add(word)
 48 |                         vocab_list.append(word)
 49 | 
 50 |     logger = get_logger("Create Alphabets")
 51 |     word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
 52 |     char_alphabet = Alphabet('character', defualt_value=True)
 53 |     pos_alphabet = Alphabet('pos')
 54 |     type_alphabet = Alphabet('type')
 55 |     if not os.path.isdir(alphabet_directory):
 56 |         logger.info("Creating Alphabets: %s" % alphabet_directory)
 57 | 
 58 |         char_alphabet.add(PAD_CHAR)
 59 |         pos_alphabet.add(PAD_POS)
 60 |         type_alphabet.add(PAD_TYPE)
 61 | 
 62 |         char_alphabet.add(ROOT_CHAR)
 63 |         pos_alphabet.add(ROOT_POS)
 64 |         type_alphabet.add(ROOT_TYPE)
 65 | 
 66 |         char_alphabet.add(END_CHAR)
 67 |         pos_alphabet.add(END_POS)
 68 |         type_alphabet.add(END_TYPE)
 69 | 
 70 |         vocab = defaultdict(int)
 71 |         with open(train_path, 'r') as file:
 72 |             for line in file:
 73 |                 line = line.strip()
 74 |                 if len(line) == 0:
 75 |                     continue
 76 | 
 77 |                 tokens = line.split('\t')
 78 |                 for char in tokens[1]:
 79 |                     char_alphabet.add(char)
 80 | 
 81 |                 word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
 82 |                 vocab[word] += 1
 83 | 
 84 |                 pos = tokens[4]
 85 |                 pos_alphabet.add(pos)
 86 | 
 87 |                 type = tokens[7]
 88 |                 type_alphabet.add(type)
 89 | 
 90 |         # collect singletons
 91 |         singletons = set([word for word, count in vocab.items() if count <= min_occurrence])
 92 | 
 93 |         # if a singleton is in pretrained embedding dict, set the count to min_occur + c
 94 |         if embedd_dict is not None:
 95 |             assert isinstance(embedd_dict, OrderedDict)
 96 |             for word in vocab.keys():
 97 |                 if word in embedd_dict or word.lower() in embedd_dict:
 98 |                     vocab[word] += min_occurrence
 99 | 
100 |         vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
101 |         logger.info("Total Vocabulary Size: %d" % len(vocab_list))
102 |         logger.info("Total Singleton Size:  %d" % len(singletons))
103 |         vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
104 |         logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))
105 | 
106 |         if len(vocab_list) > max_vocabulary_size:
107 |             vocab_list = vocab_list[:max_vocabulary_size]
108 | 
109 |         if data_paths is not None and embedd_dict is not None:
110 |             expand_vocab()
111 | 
112 |         for word in vocab_list:
113 |             word_alphabet.add(word)
114 |             if word in singletons:
115 |                 word_alphabet.add_singleton(word_alphabet.get_index(word))
116 | 
117 |         word_alphabet.save(alphabet_directory)
118 |         char_alphabet.save(alphabet_directory)
119 |         pos_alphabet.save(alphabet_directory)
120 |         type_alphabet.save(alphabet_directory)
121 |     else:
122 |         word_alphabet.load(alphabet_directory)
123 |         char_alphabet.load(alphabet_directory)
124 |         pos_alphabet.load(alphabet_directory)
125 |         type_alphabet.load(alphabet_directory)
126 | 
127 |     word_alphabet.close()
128 |     char_alphabet.close()
129 |     pos_alphabet.close()
130 |     type_alphabet.close()
131 |     logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
132 |     logger.info("Character Alphabet Size: %d" % char_alphabet.size())
133 |     logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
134 |     logger.info("Type Alphabet Size: %d" % type_alphabet.size())
135 |     return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
136 | 
137 | 
138 | def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet, type_alphabet: Alphabet,
139 |               max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False):
140 |     data = []
141 |     max_length = 0
142 |     max_char_length = 0
143 |     print('Reading data from %s' % source_path)
144 |     counter = 0
145 |     reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
146 |     inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
147 |     while inst is not None and (not max_size or counter < max_size):
148 |         counter += 1
149 |         if counter % 10000 == 0:
150 |             print("reading data: %d" % counter)
151 | 
152 |         sent = inst.sentence
153 |         data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids])
154 |         max_len = max([len(char_seq) for char_seq in sent.char_seqs])
155 |         if max_char_length < max_len:
156 |             max_char_length = max_len
157 |         if max_length < inst.length():
158 |             max_length = inst.length()
159 |         inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
160 |     reader.close()
161 |     print("Total number of data: %d" % counter)
162 | 
163 |     data_size = len(data)
164 |     char_length = min(MAX_CHAR_LENGTH, max_char_length)
165 |     wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
166 |     cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64)
167 |     pid_inputs = np.empty([data_size, max_length], dtype=np.int64)
168 |     hid_inputs = np.empty([data_size, max_length], dtype=np.int64)
169 |     tid_inputs = np.empty([data_size, max_length], dtype=np.int64)
170 | 
171 |     masks = np.zeros([data_size, max_length], dtype=np.float32)
172 |     single = np.zeros([data_size, max_length], dtype=np.int64)
173 |     lengths = np.empty(data_size, dtype=np.int64)
174 | 
175 |     for i, inst in enumerate(data):
176 |         wids, cid_seqs, pids, hids, tids = inst
177 |         inst_size = len(wids)
178 |         lengths[i] = inst_size
179 |         # word ids
180 |         wid_inputs[i, :inst_size] = wids
181 |         wid_inputs[i, inst_size:] = PAD_ID_WORD
182 |         for c, cids in enumerate(cid_seqs):
183 |             cid_inputs[i, c, :len(cids)] = cids
184 |             cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
185 |         cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
186 |         # pos ids
187 |         pid_inputs[i, :inst_size] = pids
188 |         pid_inputs[i, inst_size:] = PAD_ID_TAG
189 |         # type ids
190 |         tid_inputs[i, :inst_size] = tids
191 |         tid_inputs[i, inst_size:] = PAD_ID_TAG
192 |         # heads
193 |         hid_inputs[i, :inst_size] = hids
194 |         hid_inputs[i, inst_size:] = PAD_ID_TAG
195 |         # masks
196 |         masks[i, :inst_size] = 1.0
197 |         for j, wid in enumerate(wids):
198 |             if word_alphabet.is_singleton(wid):
199 |                 single[i, j] = 1
200 | 
201 |     words = torch.from_numpy(wid_inputs)
202 |     chars = torch.from_numpy(cid_inputs)
203 |     pos = torch.from_numpy(pid_inputs)
204 |     heads = torch.from_numpy(hid_inputs)
205 |     types = torch.from_numpy(tid_inputs)
206 |     masks = torch.from_numpy(masks)
207 |     single = torch.from_numpy(single)
208 |     lengths = torch.from_numpy(lengths)
209 | 
210 |     data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types,
211 |                    'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
212 |     return data_tensor, data_size
213 | 
214 | 
215 | def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet, type_alphabet: Alphabet,
216 |                        max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False):
217 |     data = [[] for _ in _buckets]
218 |     max_char_length = [0 for _ in _buckets]
219 |     print('Reading data from %s' % source_path)
220 |     counter = 0
221 |     reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
222 |     inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
223 |     while inst is not None and (not max_size or counter < max_size):
224 |         counter += 1
225 |         if counter % 10000 == 0:
226 |             print("reading data: %d" % counter)
227 | 
228 |         inst_size = inst.length()
229 |         sent = inst.sentence
230 |         for bucket_id, bucket_size in enumerate(_buckets):
231 |             if inst_size < bucket_size:
232 |                 data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids])
233 |                 max_len = max([len(char_seq) for char_seq in sent.char_seqs])
234 |                 if max_char_length[bucket_id] < max_len:
235 |                     max_char_length[bucket_id] = max_len
236 |                 break
237 | 
238 |         inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
239 |     reader.close()
240 |     print("Total number of data: %d" % counter)
241 | 
242 |     bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
243 |     data_tensors = []
244 |     for bucket_id in range(len(_buckets)):
245 |         bucket_size = bucket_sizes[bucket_id]
246 |         if bucket_size == 0:
247 |             data_tensors.append((1, 1))
248 |             continue
249 | 
250 |         bucket_length = _buckets[bucket_id]
251 |         char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id])
252 |         wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
253 |         cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
254 |         pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
255 |         hid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
256 |         tid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
257 | 
258 |         masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)
259 |         single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
260 |         lengths = np.empty(bucket_size, dtype=np.int64)
261 | 
262 |         for i, inst in enumerate(data[bucket_id]):
263 |             wids, cid_seqs, pids, hids, tids = inst
264 |             inst_size = len(wids)
265 |             lengths[i] = inst_size
266 |             # word ids
267 |             wid_inputs[i, :inst_size] = wids
268 |             wid_inputs[i, inst_size:] = PAD_ID_WORD
269 |             for c, cids in enumerate(cid_seqs):
270 |                 cid_inputs[i, c, :len(cids)] = cids
271 |                 cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
272 |             cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
273 |             # pos ids
274 |             pid_inputs[i, :inst_size] = pids
275 |             pid_inputs[i, inst_size:] = PAD_ID_TAG
276 |             # type ids
277 |             tid_inputs[i, :inst_size] = tids
278 |             tid_inputs[i, inst_size:] = PAD_ID_TAG
279 |             # heads
280 |             hid_inputs[i, :inst_size] = hids
281 |             hid_inputs[i, inst_size:] = PAD_ID_TAG
282 |             # masks
283 |             masks[i, :inst_size] = 1.0
284 |             for j, wid in enumerate(wids):
285 |                 if word_alphabet.is_singleton(wid):
286 |                     single[i, j] = 1
287 | 
288 |         words = torch.from_numpy(wid_inputs)
289 |         chars = torch.from_numpy(cid_inputs)
290 |         pos = torch.from_numpy(pid_inputs)
291 |         heads = torch.from_numpy(hid_inputs)
292 |         types = torch.from_numpy(tid_inputs)
293 |         masks = torch.from_numpy(masks)
294 |         single = torch.from_numpy(single)
295 |         lengths = torch.from_numpy(lengths)
296 | 
297 |         data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types,
298 |                        'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
299 |         data_tensors.append(data_tensor)
300 |     return data_tensors, bucket_sizes
301 | 


--------------------------------------------------------------------------------
/neuronlp2/io/conllx_stacked_data.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | from neuronlp2.io.reader import CoNLLXReader
  6 | from neuronlp2.io.conllx_data import _buckets, NUM_SYMBOLIC_TAGS, create_alphabets
  7 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID
  8 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_POS, PAD_TYPE, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD
  9 | from neuronlp2.io.common import ROOT, END, ROOT_CHAR, ROOT_POS, ROOT_TYPE, END_CHAR, END_POS, END_TYPE
 10 | 
 11 | 
 12 | def _obtain_child_index_for_left2right(heads):
 13 |     child_ids = [[] for _ in range(len(heads))]
 14 |     # skip the symbolic root.
 15 |     for child in range(1, len(heads)):
 16 |         head = heads[child]
 17 |         child_ids[head].append(child)
 18 |     return child_ids
 19 | 
 20 | 
 21 | def _obtain_child_index_for_inside_out(heads):
 22 |     child_ids = [[] for _ in range(len(heads))]
 23 |     for head in range(len(heads)):
 24 |         # first find left children inside-out
 25 |         for child in reversed(range(1, head)):
 26 |             if heads[child] == head:
 27 |                 child_ids[head].append(child)
 28 |         # second find right children inside-out
 29 |         for child in range(head + 1, len(heads)):
 30 |             if heads[child] == head:
 31 |                 child_ids[head].append(child)
 32 |     return child_ids
 33 | 
 34 | 
 35 | def _obtain_child_index_for_depth(heads, reverse):
 36 |     def calc_depth(head):
 37 |         children = child_ids[head]
 38 |         max_depth = 0
 39 |         for child in children:
 40 |             depth = calc_depth(child)
 41 |             child_with_depth[head].append((child, depth))
 42 |             max_depth = max(max_depth, depth + 1)
 43 |         child_with_depth[head] = sorted(child_with_depth[head], key=lambda x: x[1], reverse=reverse)
 44 |         return max_depth
 45 | 
 46 |     child_ids = _obtain_child_index_for_left2right(heads)
 47 |     child_with_depth = [[] for _ in range(len(heads))]
 48 |     calc_depth(0)
 49 |     return [[child for child, depth in child_with_depth[head]] for head in range(len(heads))]
 50 | 
 51 | 
 52 | def _generate_stack_inputs(heads, types, prior_order):
 53 |     if prior_order == 'deep_first':
 54 |         child_ids = _obtain_child_index_for_depth(heads, True)
 55 |     elif prior_order == 'shallow_first':
 56 |         child_ids = _obtain_child_index_for_depth(heads, False)
 57 |     elif prior_order == 'left2right':
 58 |         child_ids = _obtain_child_index_for_left2right(heads)
 59 |     elif prior_order == 'inside_out':
 60 |         child_ids = _obtain_child_index_for_inside_out(heads)
 61 |     else:
 62 |         raise ValueError('Unknown prior order: %s' % prior_order)
 63 | 
 64 |     stacked_heads = []
 65 |     children = []
 66 |     siblings = []
 67 |     stacked_types = []
 68 |     skip_connect = []
 69 |     prev = [0 for _ in range(len(heads))]
 70 |     sibs = [0 for _ in range(len(heads))]
 71 |     stack = [0]
 72 |     position = 1
 73 |     while len(stack) > 0:
 74 |         head = stack[-1]
 75 |         stacked_heads.append(head)
 76 |         siblings.append(sibs[head])
 77 |         child_id = child_ids[head]
 78 |         skip_connect.append(prev[head])
 79 |         prev[head] = position
 80 |         if len(child_id) == 0:
 81 |             children.append(head)
 82 |             sibs[head] = 0
 83 |             stacked_types.append(PAD_ID_TAG)
 84 |             stack.pop()
 85 |         else:
 86 |             child = child_id.pop(0)
 87 |             children.append(child)
 88 |             sibs[head] = child
 89 |             stack.append(child)
 90 |             stacked_types.append(types[child])
 91 |         position += 1
 92 | 
 93 |     return stacked_heads, children, siblings, stacked_types, skip_connect
 94 | 
 95 | 
 96 | def read_data(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
 97 |               max_size=None, normalize_digits=True, prior_order='inside_out'):
 98 |     data = []
 99 |     max_length = 0
100 |     max_char_length = 0
101 |     print('Reading data from %s' % source_path)
102 |     counter = 0
103 |     reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
104 |     inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False)
105 |     while inst is not None and (not max_size or counter < max_size):
106 |         counter += 1
107 |         if counter % 10000 == 0:
108 |             print("reading data: %d" % counter)
109 | 
110 |         sent = inst.sentence
111 |         stacked_heads, children, siblings, stacked_types, skip_connect = _generate_stack_inputs(inst.heads, inst.type_ids, prior_order)
112 |         data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids, stacked_heads, children, siblings, stacked_types, skip_connect])
113 |         max_len = max([len(char_seq) for char_seq in sent.char_seqs])
114 |         if max_char_length < max_len:
115 |             max_char_length = max_len
116 |         if max_length < inst.length():
117 |             max_length = inst.length()
118 |         inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False)
119 |     reader.close()
120 |     print("Total number of data: %d" % counter)
121 | 
122 |     data_size = len(data)
123 |     char_length = min(MAX_CHAR_LENGTH, max_char_length)
124 |     wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
125 |     cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64)
126 |     pid_inputs = np.empty([data_size, max_length], dtype=np.int64)
127 |     hid_inputs = np.empty([data_size, max_length], dtype=np.int64)
128 |     tid_inputs = np.empty([data_size, max_length], dtype=np.int64)
129 | 
130 |     masks_e = np.zeros([data_size, max_length], dtype=np.float32)
131 |     single = np.zeros([data_size, max_length], dtype=np.int64)
132 |     lengths = np.empty(data_size, dtype=np.int64)
133 | 
134 |     stack_hid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64)
135 |     chid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64)
136 |     ssid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64)
137 |     stack_tid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64)
138 |     skip_connect_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64)
139 | 
140 |     masks_d = np.zeros([data_size, 2 * max_length - 1], dtype=np.float32)
141 | 
142 |     for i, inst in enumerate(data):
143 |         wids, cid_seqs, pids, hids, tids, stack_hids, chids, ssids, stack_tids, skip_ids = inst
144 |         inst_size = len(wids)
145 |         lengths[i] = inst_size
146 |         # word ids
147 |         wid_inputs[i, :inst_size] = wids
148 |         wid_inputs[i, inst_size:] = PAD_ID_WORD
149 |         for c, cids in enumerate(cid_seqs):
150 |             cid_inputs[i, c, :len(cids)] = cids
151 |             cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
152 |         cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
153 |         # pos ids
154 |         pid_inputs[i, :inst_size] = pids
155 |         pid_inputs[i, inst_size:] = PAD_ID_TAG
156 |         # type ids
157 |         tid_inputs[i, :inst_size] = tids
158 |         tid_inputs[i, inst_size:] = PAD_ID_TAG
159 |         # heads
160 |         hid_inputs[i, :inst_size] = hids
161 |         hid_inputs[i, inst_size:] = PAD_ID_TAG
162 |         # masks_e
163 |         masks_e[i, :inst_size] = 1.0
164 |         for j, wid in enumerate(wids):
165 |             if word_alphabet.is_singleton(wid):
166 |                 single[i, j] = 1
167 | 
168 |         inst_size_decoder = 2 * inst_size - 1
169 |         # stacked heads
170 |         stack_hid_inputs[i, :inst_size_decoder] = stack_hids
171 |         stack_hid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
172 |         # children
173 |         chid_inputs[i, :inst_size_decoder] = chids
174 |         chid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
175 |         # siblings
176 |         ssid_inputs[i, :inst_size_decoder] = ssids
177 |         ssid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
178 |         # stacked types
179 |         stack_tid_inputs[i, :inst_size_decoder] = stack_tids
180 |         stack_tid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
181 |         # skip connects
182 |         skip_connect_inputs[i, :inst_size_decoder] = skip_ids
183 |         skip_connect_inputs[i, inst_size_decoder:] = PAD_ID_TAG
184 |         # masks_d
185 |         masks_d[i, :inst_size_decoder] = 1.0
186 | 
187 |     words = torch.from_numpy(wid_inputs)
188 |     chars = torch.from_numpy(cid_inputs)
189 |     pos = torch.from_numpy(pid_inputs)
190 |     heads = torch.from_numpy(hid_inputs)
191 |     types = torch.from_numpy(tid_inputs)
192 |     masks_e = torch.from_numpy(masks_e)
193 |     single = torch.from_numpy(single)
194 |     lengths = torch.from_numpy(lengths)
195 | 
196 |     stacked_heads = torch.from_numpy(stack_hid_inputs)
197 |     children = torch.from_numpy(chid_inputs)
198 |     siblings = torch.from_numpy(ssid_inputs)
199 |     stacked_types = torch.from_numpy(stack_tid_inputs)
200 |     skip_connect = torch.from_numpy(skip_connect_inputs)
201 |     masks_d = torch.from_numpy(masks_d)
202 | 
203 |     data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 'MASK_ENC': masks_e,
204 |                    'SINGLE': single, 'LENGTH': lengths, 'STACK_HEAD': stacked_heads, 'CHILD': children,
205 |                    'SIBLING': siblings, 'STACK_TYPE': stacked_types, 'SKIP_CONNECT': skip_connect, 'MASK_DEC': masks_d}
206 |     return data_tensor, data_size
207 | 
208 | 
209 | def read_bucketed_data(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet,
210 |                        max_size=None, normalize_digits=True, prior_order='inside_out'):
211 |     data = [[] for _ in _buckets]
212 |     max_char_length = [0 for _ in _buckets]
213 |     print('Reading data from %s' % source_path)
214 |     counter = 0
215 |     reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
216 |     inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False)
217 |     while inst is not None and (not max_size or counter < max_size):
218 |         counter += 1
219 |         if counter % 10000 == 0:
220 |             print("reading data: %d" % counter)
221 | 
222 |         inst_size = inst.length()
223 |         sent = inst.sentence
224 |         for bucket_id, bucket_size in enumerate(_buckets):
225 |             if inst_size < bucket_size:
226 |                 stacked_heads, children, siblings, stacked_types, skip_connect = _generate_stack_inputs(inst.heads, inst.type_ids, prior_order)
227 |                 data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids, stacked_heads, children, siblings, stacked_types, skip_connect])
228 |                 max_len = max([len(char_seq) for char_seq in sent.char_seqs])
229 |                 if max_char_length[bucket_id] < max_len:
230 |                     max_char_length[bucket_id] = max_len
231 |                 break
232 | 
233 |         inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False)
234 |     reader.close()
235 |     print("Total number of data: %d" % counter)
236 | 
237 |     bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
238 |     data_tensors = []
239 |     for bucket_id in range(len(_buckets)):
240 |         bucket_size = bucket_sizes[bucket_id]
241 |         if bucket_size == 0:
242 |             data_tensors.append((1, 1))
243 |             continue
244 | 
245 |         bucket_length = _buckets[bucket_id]
246 |         char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id])
247 |         wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
248 |         cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
249 |         pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
250 |         hid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
251 |         tid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
252 | 
253 |         masks_e = np.zeros([bucket_size, bucket_length], dtype=np.float32)
254 |         single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
255 |         lengths = np.empty(bucket_size, dtype=np.int64)
256 | 
257 |         stack_hid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64)
258 |         chid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64)
259 |         ssid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64)
260 |         stack_tid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64)
261 |         skip_connect_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64)
262 | 
263 |         masks_d = np.zeros([bucket_size, 2 * bucket_length - 1], dtype=np.float32)
264 | 
265 |         for i, inst in enumerate(data[bucket_id]):
266 |             wids, cid_seqs, pids, hids, tids, stack_hids, chids, ssids, stack_tids, skip_ids = inst
267 |             inst_size = len(wids)
268 |             lengths[i] = inst_size
269 |             # word ids
270 |             wid_inputs[i, :inst_size] = wids
271 |             wid_inputs[i, inst_size:] = PAD_ID_WORD
272 |             for c, cids in enumerate(cid_seqs):
273 |                 cid_inputs[i, c, :len(cids)] = cids
274 |                 cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
275 |             cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
276 |             # pos ids
277 |             pid_inputs[i, :inst_size] = pids
278 |             pid_inputs[i, inst_size:] = PAD_ID_TAG
279 |             # type ids
280 |             tid_inputs[i, :inst_size] = tids
281 |             tid_inputs[i, inst_size:] = PAD_ID_TAG
282 |             # heads
283 |             hid_inputs[i, :inst_size] = hids
284 |             hid_inputs[i, inst_size:] = PAD_ID_TAG
285 |             # masks_e
286 |             masks_e[i, :inst_size] = 1.0
287 |             for j, wid in enumerate(wids):
288 |                 if word_alphabet.is_singleton(wid):
289 |                     single[i, j] = 1
290 | 
291 |             inst_size_decoder = 2 * inst_size - 1
292 |             # stacked heads
293 |             stack_hid_inputs[i, :inst_size_decoder] = stack_hids
294 |             stack_hid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
295 |             # children
296 |             chid_inputs[i, :inst_size_decoder] = chids
297 |             chid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
298 |             # siblings
299 |             ssid_inputs[i, :inst_size_decoder] = ssids
300 |             ssid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
301 |             # stacked types
302 |             stack_tid_inputs[i, :inst_size_decoder] = stack_tids
303 |             stack_tid_inputs[i, inst_size_decoder:] = PAD_ID_TAG
304 |             # skip connects
305 |             skip_connect_inputs[i, :inst_size_decoder] = skip_ids
306 |             skip_connect_inputs[i, inst_size_decoder:] = PAD_ID_TAG
307 |             # masks_d
308 |             masks_d[i, :inst_size_decoder] = 1.0
309 | 
310 |         words = torch.from_numpy(wid_inputs)
311 |         chars = torch.from_numpy(cid_inputs)
312 |         pos = torch.from_numpy(pid_inputs)
313 |         heads = torch.from_numpy(hid_inputs)
314 |         types = torch.from_numpy(tid_inputs)
315 |         masks_e = torch.from_numpy(masks_e)
316 |         single = torch.from_numpy(single)
317 |         lengths = torch.from_numpy(lengths)
318 | 
319 |         stacked_heads = torch.from_numpy(stack_hid_inputs)
320 |         children = torch.from_numpy(chid_inputs)
321 |         siblings = torch.from_numpy(ssid_inputs)
322 |         stacked_types = torch.from_numpy(stack_tid_inputs)
323 |         skip_connect = torch.from_numpy(skip_connect_inputs)
324 |         masks_d = torch.from_numpy(masks_d)
325 | 
326 |         data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 'MASK_ENC': masks_e,
327 |                        'SINGLE': single, 'LENGTH': lengths, 'STACK_HEAD': stacked_heads, 'CHILD': children,
328 |                        'SIBLING': siblings, 'STACK_TYPE': stacked_types, 'SKIP_CONNECT': skip_connect, 'MASK_DEC': masks_d}
329 |         data_tensors.append(data_tensor)
330 | 
331 |     return data_tensors, bucket_sizes
332 | 


--------------------------------------------------------------------------------
/neuronlp2/io/instance.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | __all__ = ['Sentence', 'DependencyInstance', 'NERInstance']
 4 | 
 5 | 
 6 | class Sentence(object):
 7 |     def __init__(self, words, word_ids, char_seqs, char_id_seqs):
 8 |         self.words = words
 9 |         self.word_ids = word_ids
10 |         self.char_seqs = char_seqs
11 |         self.char_id_seqs = char_id_seqs
12 | 
13 |     def length(self):
14 |         return len(self.words)
15 | 
16 | 
17 | class DependencyInstance(object):
18 |     def __init__(self, sentence, postags, pos_ids, heads, types, type_ids):
19 |         self.sentence = sentence
20 |         self.postags = postags
21 |         self.pos_ids = pos_ids
22 |         self.heads = heads
23 |         self.types = types
24 |         self.type_ids = type_ids
25 | 
26 |     def length(self):
27 |         return self.sentence.length()
28 | 
29 | 
30 | class NERInstance(object):
31 |     def __init__(self, sentence, postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids):
32 |         self.sentence = sentence
33 |         self.postags = postags
34 |         self.pos_ids = pos_ids
35 |         self.chunk_tags = chunk_tags
36 |         self.chunk_ids = chunk_ids
37 |         self.ner_tags = ner_tags
38 |         self.ner_ids = ner_ids
39 | 
40 |     def length(self):
41 |         return self.sentence.length()


--------------------------------------------------------------------------------
/neuronlp2/io/logger.py:
--------------------------------------------------------------------------------
 1 | _author__ = 'max'
 2 | 
 3 | import logging
 4 | import sys
 5 | 
 6 | 
 7 | def get_logger(name, level=logging.INFO, handler=sys.stdout,
 8 |                formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'):
 9 |     logger = logging.getLogger(name)
10 |     logger.setLevel(logging.INFO)
11 |     formatter = logging.Formatter(formatter)
12 |     stream_handler = logging.StreamHandler(handler)
13 |     stream_handler.setLevel(level)
14 |     stream_handler.setFormatter(formatter)
15 |     logger.addHandler(stream_handler)
16 | 
17 |     return logger
18 | 


--------------------------------------------------------------------------------
/neuronlp2/io/reader.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from neuronlp2.io.instance import DependencyInstance, NERInstance
  4 | from neuronlp2.io.instance import Sentence
  5 | from neuronlp2.io.common import ROOT, ROOT_POS, ROOT_CHAR, ROOT_TYPE, END, END_POS, END_CHAR, END_TYPE
  6 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH
  7 | 
  8 | 
  9 | class CoNLLXReader(object):
 10 |     def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
 11 |         self.__source_file = open(file_path, 'r')
 12 |         self.__word_alphabet = word_alphabet
 13 |         self.__char_alphabet = char_alphabet
 14 |         self.__pos_alphabet = pos_alphabet
 15 |         self.__type_alphabet = type_alphabet
 16 | 
 17 |     def close(self):
 18 |         self.__source_file.close()
 19 | 
 20 |     def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
 21 |         line = self.__source_file.readline()
 22 |         # skip multiple blank lines.
 23 |         while len(line) > 0 and len(line.strip()) == 0:
 24 |             line = self.__source_file.readline()
 25 |         if len(line) == 0:
 26 |             return None
 27 | 
 28 |         lines = []
 29 |         while len(line.strip()) > 0:
 30 |             line = line.strip()
 31 |             lines.append(line.split('\t'))
 32 |             line = self.__source_file.readline()
 33 | 
 34 |         length = len(lines)
 35 |         if length == 0:
 36 |             return None
 37 | 
 38 |         words = []
 39 |         word_ids = []
 40 |         char_seqs = []
 41 |         char_id_seqs = []
 42 |         postags = []
 43 |         pos_ids = []
 44 |         types = []
 45 |         type_ids = []
 46 |         heads = []
 47 | 
 48 |         if symbolic_root:
 49 |             words.append(ROOT)
 50 |             word_ids.append(self.__word_alphabet.get_index(ROOT))
 51 |             char_seqs.append([ROOT_CHAR, ])
 52 |             char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
 53 |             postags.append(ROOT_POS)
 54 |             pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
 55 |             types.append(ROOT_TYPE)
 56 |             type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
 57 |             heads.append(0)
 58 | 
 59 |         for tokens in lines:
 60 |             chars = []
 61 |             char_ids = []
 62 |             for char in tokens[1]:
 63 |                 chars.append(char)
 64 |                 char_ids.append(self.__char_alphabet.get_index(char))
 65 |             if len(chars) > MAX_CHAR_LENGTH:
 66 |                 chars = chars[:MAX_CHAR_LENGTH]
 67 |                 char_ids = char_ids[:MAX_CHAR_LENGTH]
 68 |             char_seqs.append(chars)
 69 |             char_id_seqs.append(char_ids)
 70 | 
 71 |             word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
 72 |             pos = tokens[4]
 73 |             head = int(tokens[6])
 74 |             type = tokens[7]
 75 | 
 76 |             words.append(word)
 77 |             word_ids.append(self.__word_alphabet.get_index(word))
 78 | 
 79 |             postags.append(pos)
 80 |             pos_ids.append(self.__pos_alphabet.get_index(pos))
 81 | 
 82 |             types.append(type)
 83 |             type_ids.append(self.__type_alphabet.get_index(type))
 84 | 
 85 |             heads.append(head)
 86 | 
 87 |         if symbolic_end:
 88 |             words.append(END)
 89 |             word_ids.append(self.__word_alphabet.get_index(END))
 90 |             char_seqs.append([END_CHAR, ])
 91 |             char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
 92 |             postags.append(END_POS)
 93 |             pos_ids.append(self.__pos_alphabet.get_index(END_POS))
 94 |             types.append(END_TYPE)
 95 |             type_ids.append(self.__type_alphabet.get_index(END_TYPE))
 96 |             heads.append(0)
 97 | 
 98 |         return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, heads, types, type_ids)
 99 | 
100 | 
101 | class CoNLL03Reader(object):
102 |     def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
103 |         self.__source_file = open(file_path, 'r')
104 |         self.__word_alphabet = word_alphabet
105 |         self.__char_alphabet = char_alphabet
106 |         self.__pos_alphabet = pos_alphabet
107 |         self.__chunk_alphabet = chunk_alphabet
108 |         self.__ner_alphabet = ner_alphabet
109 | 
110 |     def close(self):
111 |         self.__source_file.close()
112 | 
113 |     def getNext(self, normalize_digits=True):
114 |         line = self.__source_file.readline()
115 |         # skip multiple blank lines.
116 |         while len(line) > 0 and len(line.strip()) == 0:
117 |             line = self.__source_file.readline()
118 |         if len(line) == 0:
119 |             return None
120 | 
121 |         lines = []
122 |         while len(line.strip()) > 0:
123 |             line = line.strip()
124 |             lines.append(line.split(' '))
125 |             line = self.__source_file.readline()
126 | 
127 |         length = len(lines)
128 |         if length == 0:
129 |             return None
130 | 
131 |         words = []
132 |         word_ids = []
133 |         char_seqs = []
134 |         char_id_seqs = []
135 |         postags = []
136 |         pos_ids = []
137 |         chunk_tags = []
138 |         chunk_ids = []
139 |         ner_tags = []
140 |         ner_ids = []
141 | 
142 |         for tokens in lines:
143 |             chars = []
144 |             char_ids = []
145 |             for char in tokens[1]:
146 |                 chars.append(char)
147 |                 char_ids.append(self.__char_alphabet.get_index(char))
148 |             if len(chars) > MAX_CHAR_LENGTH:
149 |                 chars = chars[:MAX_CHAR_LENGTH]
150 |                 char_ids = char_ids[:MAX_CHAR_LENGTH]
151 |             char_seqs.append(chars)
152 |             char_id_seqs.append(char_ids)
153 | 
154 |             word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
155 |             pos = tokens[2]
156 |             chunk = tokens[3]
157 |             ner = tokens[4]
158 | 
159 |             words.append(word)
160 |             word_ids.append(self.__word_alphabet.get_index(word))
161 | 
162 |             postags.append(pos)
163 |             pos_ids.append(self.__pos_alphabet.get_index(pos))
164 | 
165 |             chunk_tags.append(chunk)
166 |             chunk_ids.append(self.__chunk_alphabet.get_index(chunk))
167 | 
168 |             ner_tags.append(ner)
169 |             ner_ids.append(self.__ner_alphabet.get_index(ner))
170 | 
171 |         return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs),
172 |                            postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids)
173 | 


--------------------------------------------------------------------------------
/neuronlp2/io/utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | 
  6 | 
  7 | def get_batch(data, batch_size, unk_replace=0.):
  8 |     data, data_size = data
  9 |     batch_size = min(data_size, batch_size)
 10 |     index = torch.randperm(data_size).long()[:batch_size]
 11 | 
 12 |     lengths = data['LENGTH'][index]
 13 |     max_length = lengths.max().item()
 14 |     words = data['WORD']
 15 |     single = data['SINGLE']
 16 |     words = words[index, :max_length]
 17 |     single = single[index, :max_length]
 18 |     if unk_replace:
 19 |         ones = single.new_ones(batch_size, max_length)
 20 |         noise = single.new_empty(batch_size, max_length).bernoulli_(unk_replace).long()
 21 |         words = words * (ones - single * noise)
 22 | 
 23 |     stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC']
 24 |     exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys)
 25 |     stack_keys = set(stack_keys)
 26 |     batch = {'WORD': words, 'LENGTH': lengths}
 27 |     batch.update({key: field[index, :max_length] for key, field in data.items() if key not in exclude_keys})
 28 |     batch.update({key: field[index, :2 * max_length - 1] for key, field in data.items() if key in stack_keys})
 29 |     return batch
 30 | 
 31 | 
 32 | def get_bucketed_batch(data, batch_size, unk_replace=0.):
 33 |     data_buckets, bucket_sizes = data
 34 |     total_size = float(sum(bucket_sizes))
 35 |     # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
 36 |     # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
 37 |     # the size if i-th training bucket, as used later.
 38 |     buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes))]
 39 | 
 40 |     # Choose a bucket according to data distribution. We pick a random number
 41 |     # in [0, 1] and use the corresponding interval in train_buckets_scale.
 42 |     random_number = np.random.random_sample()
 43 |     bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number])
 44 | 
 45 |     data = data_buckets[bucket_id]
 46 |     bucket_size = bucket_sizes[bucket_id]
 47 |     batch_size = min(bucket_size, batch_size)
 48 |     index = torch.randperm(bucket_size).long()[:batch_size]
 49 | 
 50 |     lengths = data['LENGTH'][index]
 51 |     max_length = lengths.max().item()
 52 |     words = data['WORD']
 53 |     single = data['SINGLE']
 54 |     words = words[index, :max_length]
 55 |     single = single[index, :max_length]
 56 |     if unk_replace:
 57 |         ones = single.new_ones(batch_size, max_length)
 58 |         noise = single.new_empty(batch_size, max_length).bernoulli_(unk_replace).long()
 59 |         words = words * (ones - single * noise)
 60 | 
 61 |     stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC']
 62 |     exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys)
 63 |     stack_keys = set(stack_keys)
 64 |     batch = {'WORD': words, 'LENGTH': lengths}
 65 |     batch.update({key: field[index, :max_length] for key, field in data.items() if key not in exclude_keys})
 66 |     batch.update({key: field[index, :2 * max_length - 1] for key, field in data.items() if key in stack_keys})
 67 |     return batch
 68 | 
 69 | 
 70 | def iterate_batch(data, batch_size, unk_replace=0., shuffle=False):
 71 |     data, data_size = data
 72 | 
 73 |     words = data['WORD']
 74 |     single = data['SINGLE']
 75 |     max_length = words.size(1)
 76 |     if unk_replace:
 77 |         ones = single.new_ones(data_size, max_length)
 78 |         noise = single.new_empty(data_size, max_length).bernoulli_(unk_replace).long()
 79 |         words = words * (ones - single * noise)
 80 | 
 81 |     indices = None
 82 |     if shuffle:
 83 |         indices = torch.randperm(data_size).long()
 84 |         indices = indices.to(words.device)
 85 | 
 86 |     stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC']
 87 |     exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys)
 88 |     stack_keys = set(stack_keys)
 89 |     for start_idx in range(0, data_size, batch_size):
 90 |         if shuffle:
 91 |             excerpt = indices[start_idx:start_idx + batch_size]
 92 |         else:
 93 |             excerpt = slice(start_idx, start_idx + batch_size)
 94 | 
 95 |         lengths = data['LENGTH'][excerpt]
 96 |         batch_length = lengths.max().item()
 97 |         batch = {'WORD': words[excerpt, :batch_length], 'LENGTH': lengths}
 98 |         batch.update({key: field[excerpt, :batch_length] for key, field in data.items() if key not in exclude_keys})
 99 |         batch.update({key: field[excerpt, :2 * batch_length - 1] for key, field in data.items() if key in stack_keys})
100 |         yield batch
101 | 
102 | 
103 | def iterate_bucketed_batch(data, batch_size, unk_replace=0., shuffle=False):
104 |     data_tensor, bucket_sizes = data
105 | 
106 |     bucket_indices = np.arange(len(bucket_sizes))
107 |     if shuffle:
108 |         np.random.shuffle((bucket_indices))
109 | 
110 |     stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC']
111 |     exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys)
112 |     stack_keys = set(stack_keys)
113 |     for bucket_id in bucket_indices:
114 |         data = data_tensor[bucket_id]
115 |         bucket_size = bucket_sizes[bucket_id]
116 |         if bucket_size == 0:
117 |             continue
118 | 
119 |         words = data['WORD']
120 |         single = data['SINGLE']
121 |         bucket_length = words.size(1)
122 |         if unk_replace:
123 |             ones = single.new_ones(bucket_size, bucket_length)
124 |             noise = single.new_empty(bucket_size, bucket_length).bernoulli_(unk_replace).long()
125 |             words = words * (ones - single * noise)
126 | 
127 |         indices = None
128 |         if shuffle:
129 |             indices = torch.randperm(bucket_size).long()
130 |             indices = indices.to(words.device)
131 |         for start_idx in range(0, bucket_size, batch_size):
132 |             if shuffle:
133 |                 excerpt = indices[start_idx:start_idx + batch_size]
134 |             else:
135 |                 excerpt = slice(start_idx, start_idx + batch_size)
136 | 
137 |             lengths = data['LENGTH'][excerpt]
138 |             batch_length = lengths.max().item()
139 |             batch = {'WORD': words[excerpt, :batch_length], 'LENGTH': lengths}
140 |             batch.update({key: field[excerpt, :batch_length] for key, field in data.items() if key not in exclude_keys})
141 |             batch.update({key: field[excerpt, :2 * batch_length - 1] for key, field in data.items() if key in stack_keys})
142 |             yield batch
143 | 
144 | 
145 | def iterate_data(data, batch_size, bucketed=False, unk_replace=0., shuffle=False):
146 |     if bucketed:
147 |         return iterate_bucketed_batch(data, batch_size, unk_replace==unk_replace, shuffle=shuffle)
148 |     else:
149 |         return iterate_batch(data, batch_size, unk_replace==unk_replace, shuffle=shuffle)
150 | 


--------------------------------------------------------------------------------
/neuronlp2/io/writer.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | 
 4 | class CoNLL03Writer(object):
 5 |     def __init__(self, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
 6 |         self.__source_file = None
 7 |         self.__word_alphabet = word_alphabet
 8 |         self.__char_alphabet = char_alphabet
 9 |         self.__pos_alphabet = pos_alphabet
10 |         self.__chunk_alphabet = chunk_alphabet
11 |         self.__ner_alphabet = ner_alphabet
12 | 
13 |     def start(self, file_path):
14 |         self.__source_file = open(file_path, 'w')
15 | 
16 |     def close(self):
17 |         self.__source_file.close()
18 | 
19 |     def write(self, word, pos, chunk, predictions, targets, lengths):
20 |         batch_size, _ = word.shape
21 |         for i in range(batch_size):
22 |             for j in range(lengths[i]):
23 |                 w = self.__word_alphabet.get_instance(word[i, j])
24 |                 p = self.__pos_alphabet.get_instance(pos[i, j])
25 |                 ch = self.__chunk_alphabet.get_instance(chunk[i, j])
26 |                 tgt = self.__ner_alphabet.get_instance(targets[i, j])
27 |                 pred = self.__ner_alphabet.get_instance(predictions[i, j])
28 |                 self.__source_file.write('%d %s %s %s %s %s\n' % (j + 1, w, p, ch, tgt, pred))
29 |             self.__source_file.write('\n')
30 | 
31 | 
32 | class POSWriter(object):
33 |     def __init__(self, word_alphabet, char_alphabet, pos_alphabet):
34 |         self.__source_file = None
35 |         self.__word_alphabet = word_alphabet
36 |         self.__char_alphabet = char_alphabet
37 |         self.__pos_alphabet = pos_alphabet
38 | 
39 |     def start(self, file_path):
40 |         self.__source_file = open(file_path, 'w')
41 | 
42 |     def close(self):
43 |         self.__source_file.close()
44 | 
45 |     def write(self, word, predictions, targets, lengths, symbolic_root=False, symbolic_end=False):
46 |         batch_size, _ = word.shape
47 |         start = 1 if symbolic_root else 0
48 |         end = 1 if symbolic_end else 0
49 |         for i in range(batch_size):
50 |             for j in range(start, lengths[i] - end):
51 |                 w = self.__word_alphabet.get_instance(word[i, j])
52 |                 pred = self.__pos_alphabet.get_instance(predictions[i, j])
53 |                 tgt = self.__pos_alphabet.get_instance(targets[i, j])
54 |                 self.__source_file.write('%d\t%s\t_\t%s\t%s\n' % (j, w, tgt, pred))
55 |             self.__source_file.write('\n')
56 | 
57 | 
58 | class CoNLLXWriter(object):
59 |     def __init__(self, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
60 |         self.__source_file = None
61 |         self.__word_alphabet = word_alphabet
62 |         self.__char_alphabet = char_alphabet
63 |         self.__pos_alphabet = pos_alphabet
64 |         self.__type_alphabet = type_alphabet
65 | 
66 |     def start(self, file_path):
67 |         self.__source_file = open(file_path, 'w')
68 | 
69 |     def close(self):
70 |         self.__source_file.close()
71 | 
72 |     def write(self, word, pos, head, type, lengths, symbolic_root=False, symbolic_end=False):
73 |         batch_size, _ = word.shape
74 |         start = 1 if symbolic_root else 0
75 |         end = 1 if symbolic_end else 0
76 |         for i in range(batch_size):
77 |             for j in range(start, lengths[i] - end):
78 |                 w = self.__word_alphabet.get_instance(word[i, j])
79 |                 p = self.__pos_alphabet.get_instance(pos[i, j])
80 |                 t = self.__type_alphabet.get_instance(type[i, j])
81 |                 h = head[i, j]
82 |                 self.__source_file.write('%d\t%s\t_\t_\t%s\t_\t%d\t%s\n' % (j, w, p, h, t))
83 |             self.__source_file.write('\n')
84 | 


--------------------------------------------------------------------------------
/neuronlp2/models/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.models.sequence_labeling import *
4 | from .parsing import DeepBiAffine, NeuroMST, StackPtrNet
5 | 


--------------------------------------------------------------------------------
/neuronlp2/models/sequence_labeling.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from overrides import overrides
  4 | import torch
  5 | import torch.nn as nn
  6 | from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
  7 | from neuronlp2.nn import ChainCRF, VarGRU, VarRNN, VarLSTM, VarFastLSTM, CharCNN
  8 | 
  9 | 
 10 | class BiRecurrentConv(nn.Module):
 11 |     def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
 12 |                  num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.5, p_rnn=(0.5, 0.5), activation='elu'):
 13 |         super(BiRecurrentConv, self).__init__()
 14 | 
 15 |         self.word_embed = nn.Embedding(num_words, word_dim, _weight=embedd_word, padding_idx=1)
 16 |         self.char_embed = nn.Embedding(num_chars, char_dim, _weight=embedd_char, padding_idx=1)
 17 |         self.char_cnn = CharCNN(2, char_dim, char_dim, hidden_channels=4 * char_dim, activation=activation)
 18 |         # dropout word
 19 |         self.dropout_in = nn.Dropout2d(p=p_in)
 20 |         # standard dropout
 21 |         self.dropout_rnn_in = nn.Dropout(p=p_rnn[0])
 22 |         self.dropout_out = nn.Dropout(p_out)
 23 | 
 24 |         if rnn_mode == 'RNN':
 25 |             RNN = nn.RNN
 26 |         elif rnn_mode == 'LSTM' or rnn_mode == 'FastLSTM':
 27 |             RNN = nn.LSTM
 28 |         elif rnn_mode == 'GRU':
 29 |             RNN = nn.GRU
 30 |         else:
 31 |             raise ValueError('Unknown RNN mode: %s' % rnn_mode)
 32 | 
 33 |         self.rnn = RNN(word_dim + char_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=p_rnn[1])
 34 | 
 35 |         self.fc = nn.Linear(hidden_size * 2, out_features)
 36 |         assert activation in ['elu', 'tanh']
 37 |         if activation == 'elu':
 38 |             self.activation = nn.ELU()
 39 |         else:
 40 |             self.activation = nn.Tanh()
 41 |         self.readout = nn.Linear(out_features, num_labels)
 42 |         self.criterion = nn.CrossEntropyLoss(reduction='none')
 43 | 
 44 |         self.reset_parameters(embedd_word, embedd_char)
 45 | 
 46 |     def reset_parameters(self, embedd_word, embedd_char):
 47 |         if embedd_word is None:
 48 |             nn.init.uniform_(self.word_embed.weight, -0.1, 0.1)
 49 |         if embedd_char is None:
 50 |             nn.init.uniform_(self.char_embed.weight, -0.1, 0.1)
 51 |         with torch.no_grad():
 52 |             self.word_embed.weight[self.word_embed.padding_idx].fill_(0)
 53 |             self.char_embed.weight[self.char_embed.padding_idx].fill_(0)
 54 | 
 55 |         for param in self.rnn.parameters():
 56 |             if param.dim() == 1:
 57 |                 nn.init.constant_(param, 0)
 58 |             else:
 59 |                 nn.init.xavier_uniform_(param)
 60 | 
 61 |         nn.init.xavier_uniform_(self.fc.weight)
 62 |         nn.init.constant_(self.fc.bias, 0.)
 63 | 
 64 |         nn.init.uniform_(self.readout.weight, -0.1, 0.1)
 65 |         nn.init.constant_(self.readout.bias, 0.)
 66 | 
 67 |     def _get_rnn_output(self, input_word, input_char, mask=None):
 68 |         # [batch, length, word_dim]
 69 |         word = self.word_embed(input_word)
 70 | 
 71 |         # [batch, length, char_length, char_dim]
 72 |         char = self.char_cnn(self.char_embed(input_char))
 73 | 
 74 |         # apply dropout word on input
 75 |         word = self.dropout_in(word)
 76 |         char = self.dropout_in(char)
 77 | 
 78 |         # concatenate word and char [batch, length, word_dim+char_filter]
 79 |         enc = torch.cat([word, char], dim=2)
 80 |         # apply dropout rnn input
 81 |         enc = self.dropout_rnn_in(enc)
 82 | 
 83 |         # output from rnn [batch, length, hidden_size * 2]
 84 |         if mask is not None:
 85 |             # prepare packed_sequence
 86 |             length = mask.sum(dim=1).long()
 87 |             packed_enc = pack_padded_sequence(enc, length, batch_first=True, enforce_sorted=False)
 88 |             packed_out, _ = self.rnn(packed_enc)
 89 |             output, _ = pad_packed_sequence(packed_out, batch_first=True)
 90 |         else:
 91 |             output, _ = self.rnn(enc)
 92 | 
 93 |         output = self.dropout_out(output)
 94 |         # [batch, length, out_features]
 95 |         output = self.dropout_out(self.activation(self.fc(output)))
 96 |         return output
 97 | 
 98 |     def forward(self, input_word, input_char, mask=None):
 99 |         # output from rnn [batch, length, out_features]
100 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
101 |         return output
102 | 
103 |     def loss(self, input_word, input_char, target, mask=None):
104 |         # [batch, length, out_features]
105 |         output = self(input_word, input_char, mask=mask)
106 |         # [batch, length, num_labels] -> [batch, num_labels, length]
107 |         logits = self.readout(output).transpose(1, 2)
108 | 
109 |         # [batch, length]
110 |         loss = self.criterion(logits, target)
111 |         if mask is not None:
112 |             loss = loss * mask
113 |         # [batch]
114 |         loss = loss.sum(dim=1)
115 |         return loss
116 | 
117 |     def decode(self, input_word, input_char, mask=None, leading_symbolic=0):
118 |         output = self(input_word, input_char, mask=mask)
119 |         # [batch, length, num_labels] -> [batch, num_labels, length]
120 |         logits = self.readout(output).transpose(1, 2)
121 |         # [batch, length]
122 |         _, preds = torch.max(logits[:, leading_symbolic:], dim=1)
123 |         preds += leading_symbolic
124 |         if mask is not None:
125 |             preds = preds * mask.long()
126 |         return preds
127 | 
128 | 
129 | class BiVarRecurrentConv(BiRecurrentConv):
130 |     def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
131 |                  num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.33, p_rnn=(0.33, 0.33), activation='elu'):
132 |         super(BiVarRecurrentConv, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
133 |                                                  num_labels, embedd_word=embedd_word, embedd_char=embedd_char,
134 |                                                  p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
135 | 
136 |         self.dropout_rnn_in = None
137 |         self.dropout_out = nn.Dropout2d(p_out)
138 | 
139 |         if rnn_mode == 'RNN':
140 |             RNN = VarRNN
141 |         elif rnn_mode == 'LSTM':
142 |             RNN = VarLSTM
143 |         elif rnn_mode == 'FastLSTM':
144 |             RNN = VarFastLSTM
145 |         elif rnn_mode == 'GRU':
146 |             RNN = VarGRU
147 |         else:
148 |             raise ValueError('Unknown RNN mode: %s' % rnn_mode)
149 | 
150 |         self.rnn = RNN(word_dim + char_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=p_rnn)
151 | 
152 |     @overrides
153 |     def _get_rnn_output(self, input_word, input_char, mask=None):
154 |         # [batch, length, word_dim]
155 |         word = self.word_embed(input_word)
156 | 
157 |         # [batch, length, char_length, char_dim]
158 |         char = self.char_cnn(self.char_embed(input_char))
159 | 
160 |         # apply dropout word on input
161 |         word = self.dropout_in(word)
162 |         char = self.dropout_in(char)
163 | 
164 |         # concatenate word and char [batch, length, word_dim+char_filter]
165 |         enc = torch.cat([word, char], dim=2)
166 |         # output from rnn [batch, length, 2 * hidden_size]
167 |         output, _ = self.rnn(enc, mask)
168 | 
169 |         # apply dropout for the output of rnn
170 |         # [batch, length, 2 * hidden_size] --> [batch, 2 * hidden_size, length] --> [batch, length, 2 * hidden_size]
171 |         output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2)
172 |         # [batch, length, out_features]
173 |         output = self.activation(self.fc(output))
174 |         # [batch, length, out_features] --> [batch, out_features, length] --> [batch, length, out_features]
175 |         output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2)
176 |         return output
177 | 
178 | 
179 | class BiRecurrentConvCRF(BiRecurrentConv):
180 |     def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
181 |                  num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.5, p_rnn=(0.5, 0.5), bigram=False, activation='elu'):
182 |         super(BiRecurrentConvCRF, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
183 |                                                  num_labels, embedd_word=embedd_word, embedd_char=embedd_char,
184 |                                                  p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
185 | 
186 |         self.crf = ChainCRF(out_features, num_labels, bigram=bigram)
187 |         self.readout = None
188 |         self.criterion = None
189 | 
190 |     def forward(self, input_word, input_char, mask=None):
191 |         # output from rnn [batch, length, hidden_size]
192 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
193 |         # [batch, length, num_label, num_label]
194 |         return self.crf(output, mask=mask)
195 | 
196 |     @overrides
197 |     def loss(self, input_word, input_char, target, mask=None):
198 |         # output from rnn [batch, length, hidden_size]
199 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
200 |         # [batch]
201 |         return self.crf.loss(output, target, mask=mask)
202 | 
203 |     @overrides
204 |     def decode(self, input_word, input_char, mask=None, leading_symbolic=0):
205 |         # output from rnn [batch, length, hidden_size]
206 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
207 |         # [batch, length]
208 |         preds = self.crf.decode(output, mask=mask, leading_symbolic=leading_symbolic)
209 |         if mask is not None:
210 |             preds = preds * mask.long()
211 |         return preds
212 | 
213 | 
214 | class BiVarRecurrentConvCRF(BiVarRecurrentConv):
215 |     def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
216 |                  num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.33, p_rnn=(0.33, 0.33), bigram=False, activation='elu'):
217 |         super(BiVarRecurrentConvCRF, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers,
218 |                                                     num_labels, embedd_word=embedd_word, embedd_char=embedd_char,
219 |                                                     p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation)
220 | 
221 |         self.crf = ChainCRF(out_features, num_labels, bigram=bigram)
222 |         self.readout = None
223 |         self.criterion = None
224 | 
225 |     def forward(self, input_word, input_char, mask=None):
226 |         # output from rnn [batch, length, hidden_size]
227 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
228 |         # [batch, length, num_label,  num_label]
229 |         return self.crf(output, mask=mask)
230 | 
231 |     @overrides
232 |     def loss(self, input_word, input_char, target, mask=None):
233 |         # output from rnn [batch, length, hidden_size]
234 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
235 |         # [batch]
236 |         return self.crf.loss(output, target, mask=mask)
237 | 
238 |     @overrides
239 |     def decode(self, input_word, input_char, mask=None, leading_symbolic=0):
240 |         # output from rnn [batch, length, hidden_size]
241 |         output = self._get_rnn_output(input_word, input_char, mask=mask)
242 |         preds = self.crf.decode(output, mask=mask, leading_symbolic=leading_symbolic)
243 |         if mask is not None:
244 |             preds = preds * mask.long()
245 |         return preds
246 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.nn import init
4 | from neuronlp2.nn.crf import ChainCRF, TreeCRF
5 | from neuronlp2.nn.modules import BiLinear, BiAffine, CharCNN
6 | from neuronlp2.nn.variational_rnn import *
7 | from neuronlp2.nn.skip_rnn import *
8 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.nn._functions import variational_rnn
4 | from neuronlp2.nn._functions import skipconnect_rnn
5 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/rnnFusedBackend.py:
--------------------------------------------------------------------------------
 1 | from torch.autograd.function import Function, once_differentiable
 2 | from torch._thnn import type2backend
 3 | 
 4 | 
 5 | class GRUFused(Function):
 6 |     @staticmethod
 7 |     def forward(ctx, input_gate, hidden_gate, hx, ibias=None, hbias=None):
 8 |         ctx.backend = type2backend[input_gate.type()]
 9 | 
10 |         hy = input_gate.new()
11 |         workspace = input_gate.new(hx.numel() * 5)
12 | 
13 |         ctx.has_bias = False
14 |         if ibias is not None:
15 |             ctx.has_bias = True
16 |             if ibias.dim() == 1:
17 |                 ibias = ibias.unsqueeze(0)
18 |             if hbias.dim() == 1:
19 |                 hbias = hbias.unsqueeze(0)
20 | 
21 |         ctx.backend.GRUFused_updateOutput(
22 |             ctx.backend.library_state,
23 |             input_gate, hidden_gate, ibias, hbias, hx, hy, workspace)
24 | 
25 |         ctx.workspace = workspace
26 |         ctx.igate_size = input_gate.size()
27 |         ctx.hgate_size = hidden_gate.size()
28 | 
29 |         return hy
30 | 
31 |     @staticmethod
32 |     @once_differentiable
33 |     def backward(ctx, gradOutput):
34 |         ctx.backend = type2backend[gradOutput.type()]
35 | 
36 |         gradInputHx = gradOutput.new()
37 |         gradInInput = gradOutput.new(*ctx.igate_size)
38 |         gradInHidden = gradOutput.new(*ctx.hgate_size)
39 | 
40 |         ctx.backend.GRUFused_updateGradInput(
41 |             ctx.backend.library_state,
42 |             gradInInput, gradInHidden, gradOutput, gradInputHx, ctx.workspace)
43 | 
44 |         gb1 = gb2 = None
45 |         if ctx.has_bias:
46 |             gb1 = gradInInput.sum(0, keepdim=False)
47 |             gb2 = gradInHidden.sum(0, keepdim=False)
48 |         return gradInInput, gradInHidden, gradInputHx, gb1, gb2
49 | 
50 | 
51 | class LSTMFused(Function):
52 |     @staticmethod
53 |     def forward(ctx, input_gate, hidden_gate, cx, ibias=None, hbias=None):
54 |         ctx.backend = type2backend[input_gate.type()]
55 |         hy = input_gate.new()
56 |         cy = input_gate.new()
57 | 
58 |         ctx.has_bias = False
59 |         if ibias is not None:
60 |             ctx.has_bias = True
61 |             if ibias.dim() == 1:
62 |                 ibias = ibias.unsqueeze(0)
63 |             if hbias.dim() == 1:
64 |                 hbias = hbias.unsqueeze(0)
65 | 
66 |         # input_gate gets overwritten with some intermediate values to use in backwards
67 |         ctx.backend.LSTMFused_updateOutput(
68 |             ctx.backend.library_state,
69 |             input_gate, hidden_gate,
70 |             ibias, hbias,
71 |             cx, hy, cy)
72 | 
73 |         ctx.hgate_size = hidden_gate.size()
74 |         ctx.save_for_backward(input_gate, cx, cy)
75 | 
76 |         return hy, cy
77 | 
78 |     @staticmethod
79 |     @once_differentiable
80 |     def backward(ctx, *gradOutput):
81 |         ctx.backend = type2backend[gradOutput[0].type()]
82 |         gradInputCx = gradOutput[0].new()
83 |         gradInGates = gradOutput[0].new(*ctx.hgate_size)
84 | 
85 |         saved_tens, cx, cy = ctx.saved_tensors
86 |         ctx.backend.LSTMFused_updateGradInput(
87 |             ctx.backend.library_state,
88 |             saved_tens, gradInGates, cx, cy,
89 |             gradOutput[0], gradOutput[1], gradInputCx)
90 | 
91 |         gb1 = gb2 = None
92 |         if ctx.has_bias:
93 |             gb1 = gradInGates.sum(0, keepdim=False)
94 |             gb2 = gradInGates.sum(0, keepdim=False)
95 | 
96 |         return gradInGates, gradInGates, gradInputCx, gb1, gb2
97 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/skipconnect_rnn.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | from torch.nn import functional as F
  5 | 
  6 | 
  7 | def SkipConnectRNNReLUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None, noise_skip=None):
  8 |     if noise_in is not None:
  9 |         input = input * noise_in
 10 | 
 11 |     hidden = torch.cat([hidden, hidden_skip], dim=1)
 12 |     if noise_hidden is not None:
 13 |         hidden = hidden * noise_hidden
 14 | 
 15 |     hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 16 |     return hy
 17 | 
 18 | 
 19 | def SkipConnectRNNTanhCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 20 |     if noise_in is not None:
 21 |         input = input * noise_in
 22 | 
 23 |     hidden = torch.cat([hidden, hidden_skip], dim=1)
 24 |     if noise_hidden is not None:
 25 |         hidden = hidden * noise_hidden
 26 | 
 27 |     hy = torch.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 28 |     return hy
 29 | 
 30 | 
 31 | def SkipConnectLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 32 |     input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 33 | 
 34 |     hx, cx = hidden
 35 |     hx = torch.cat([hx, hidden_skip], dim=1)
 36 |     hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 37 | 
 38 |     gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 39 | 
 40 |     ingate, forgetgate, cellgate, outgate = gates
 41 | 
 42 |     ingate = torch.sigmoid(ingate)
 43 |     forgetgate = torch.sigmoid(forgetgate)
 44 |     cellgate = torch.tanh(cellgate)
 45 |     outgate = torch.sigmoid(outgate)
 46 | 
 47 |     cy = (forgetgate * cx) + (ingate * cellgate)
 48 |     hy = outgate * torch.tanh(cy)
 49 | 
 50 |     return hy, cy
 51 | 
 52 | 
 53 | def SkipConnectFastLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 54 |     if noise_in is not None:
 55 |         input = input * noise_in
 56 | 
 57 |     hx, cx = hidden
 58 |     hx = torch.cat([hx, hidden_skip], dim=1)
 59 |     if noise_hidden is not None:
 60 |         hx = hx * noise_hidden
 61 | 
 62 |     gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
 63 | 
 64 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
 65 | 
 66 |     ingate = torch.sigmoid(ingate)
 67 |     forgetgate = torch.sigmoid(forgetgate)
 68 |     cellgate = torch.tanh(cellgate)
 69 |     outgate = torch.sigmoid(outgate)
 70 | 
 71 |     cy = (forgetgate * cx) + (ingate * cellgate)
 72 |     hy = outgate * torch.tanh(cy)
 73 | 
 74 |     return hy, cy
 75 | 
 76 | 
 77 | def SkipConnectGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 78 |     input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 79 |     hx = torch.cat([hidden, hidden_skip], dim=1)
 80 |     hx = hx.expand(3, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 81 | 
 82 |     gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
 83 |     gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 84 |     i_r, i_i, i_n = gi
 85 |     h_r, h_i, h_n = gh
 86 | 
 87 |     resetgate = torch.sigmoid(i_r + h_r)
 88 |     inputgate = torch.sigmoid(i_i + h_i)
 89 |     newgate = torch.tanh(i_n + resetgate * h_n)
 90 |     hy = newgate + inputgate * (hidden - newgate)
 91 | 
 92 |     return hy
 93 | 
 94 | 
 95 | def SkipConnectFastGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 96 |     if noise_in is not None:
 97 |         input = input * noise_in
 98 | 
 99 |     hx = torch.cat([hidden, hidden_skip], dim=1)
100 |     if noise_hidden is not None:
101 |         hx = hx * noise_hidden
102 | 
103 |     gi = F.linear(input, w_ih, b_ih)
104 |     gh = F.linear(hx, w_hh, b_hh)
105 |     i_r, i_i, i_n = gi.chunk(3, 1)
106 |     h_r, h_i, h_n = gh.chunk(3, 1)
107 | 
108 |     resetgate = torch.sigmoid(i_r + h_r)
109 |     inputgate = torch.sigmoid(i_i + h_i)
110 |     newgate = torch.tanh(i_n + resetgate * h_n)
111 |     hy = newgate + inputgate * (hidden - newgate)
112 | 
113 |     return hy
114 | 
115 | 
116 | def SkipConnectRecurrent(reverse=False):
117 |     def forward(input, skip_connect, hidden, cell, mask):
118 |         # hack to handle LSTM
119 |         h0 = hidden[0] if isinstance(hidden, tuple) else hidden
120 |         # [length + 1, batch, hidden_size]
121 |         output = input.new_zeros(input.size(0) + 1, *h0.size()) + h0
122 |         steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
123 |         # create batch index
124 |         batch_index = torch.arange(0, h0.size(0)).type_as(skip_connect)
125 |         for i in steps:
126 |             if mask is None or mask[i].data.min() > 0.5:
127 |                 hidden_skip = output[skip_connect[i], batch_index]
128 |                 hidden = cell(input[i], hidden, hidden_skip)
129 |             elif mask[i].data.max() > 0.5:
130 |                 hidden_skip = output[skip_connect[i], batch_index]
131 |                 hidden_next = cell(input[i], hidden, hidden_skip)
132 |                 # hack to handle LSTM
133 |                 if isinstance(hidden, tuple):
134 |                     hx, cx = hidden
135 |                     hp1, cp1 = hidden_next
136 |                     hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
137 |                 else:
138 |                     hidden = hidden + (hidden_next - hidden) * mask[i]
139 |             # hack to handle LSTM
140 |             if reverse:
141 |                 output[i] = hidden[0] if isinstance(hidden, tuple) else hidden
142 |             else:
143 |                 output[i + 1] = hidden[0] if isinstance(hidden, tuple) else hidden
144 | 
145 |         if reverse:
146 |             # remove last position
147 |             output = output[:-1]
148 |         else:
149 |             # remove position 0
150 |             output = output[1:]
151 | 
152 |         return hidden, output
153 | 
154 |     return forward
155 | 
156 | 
157 | def StackedRNN(inners, num_layers, lstm=False):
158 |     num_directions = len(inners)
159 |     total_layers = num_layers * num_directions
160 | 
161 |     def reverse_skip_connection(skip_connect):
162 |         # TODO reverse skip connection for bidirectional rnn.
163 |         return skip_connect
164 | 
165 |     def forward(input, skip_connect, hidden, cells, mask):
166 |         assert (len(cells) == total_layers)
167 |         next_hidden = []
168 | 
169 |         skip_connect_forward = skip_connect
170 |         skip_connec_backward = reverse_skip_connection(skip_connect) if num_directions == 2 else None
171 | 
172 |         if lstm:
173 |             hidden = list(zip(*hidden))
174 | 
175 |         for i in range(num_layers):
176 |             all_output = []
177 |             for j, inner in enumerate(inners):
178 |                 l = i * num_directions + j
179 |                 skip_connect = skip_connect_forward if j == 0 else skip_connec_backward
180 |                 hy, output = inner(input, skip_connect, hidden[l], cells[l], mask)
181 |                 next_hidden.append(hy)
182 |                 all_output.append(output)
183 | 
184 |             input = torch.cat(all_output, input.dim() - 1)
185 | 
186 |         if lstm:
187 |             next_h, next_c = zip(*next_hidden)
188 |             next_hidden = (
189 |                 torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
190 |                 torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
191 |             )
192 |         else:
193 |             next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
194 | 
195 |         return next_hidden, input
196 | 
197 |     return forward
198 | 
199 | 
200 | def AutogradSkipConnectRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
201 |     rec_factory = SkipConnectRecurrent
202 | 
203 |     if bidirectional:
204 |         layer = (rec_factory(), rec_factory(reverse=True))
205 |     else:
206 |         layer = (rec_factory(),)
207 | 
208 |     func = StackedRNN(layer,
209 |                       num_layers,
210 |                       lstm=lstm)
211 | 
212 |     def forward(input, skip_connect, cells, hidden, mask):
213 |         if batch_first:
214 |             input = input.transpose(0, 1)
215 |             skip_connect = skip_connect.transpose(0, 1)
216 |             if mask is not None:
217 |                 mask = mask.transpose(0, 1)
218 | 
219 |         nexth, output = func(input, skip_connect, hidden, cells, mask)
220 | 
221 |         if batch_first:
222 |             output = output.transpose(0, 1)
223 | 
224 |         return output, nexth
225 | 
226 |     return forward
227 | 
228 | 
229 | def SkipConnectStep():
230 |     def forward(input, hidden, hidden_skip, cell, mask):
231 |         if mask is None or mask.data.min() > 0.5:
232 |             hidden = cell(input, hidden, hidden_skip)
233 |         elif mask.data.max() > 0.5:
234 |             hidden_next = cell(input, hidden, hidden_skip)
235 |             # hack to handle LSTM
236 |             if isinstance(hidden, tuple):
237 |                 hx, cx = hidden
238 |                 hp1, cp1 = hidden_next
239 |                 hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
240 |             else:
241 |                 hidden = hidden + (hidden_next - hidden) * mask
242 |         # hack to handle LSTM
243 |         output = hidden[0] if isinstance(hidden, tuple) else hidden
244 | 
245 |         return hidden, output
246 | 
247 |     return forward
248 | 
249 | 
250 | def StackedStep(layer, num_layers, lstm=False):
251 |     def forward(input, hidden, hidden_skip, cells, mask):
252 |         assert (len(cells) == num_layers)
253 |         next_hidden = []
254 | 
255 |         if lstm:
256 |             hidden = list(zip(*hidden))
257 | 
258 |         for l in range(num_layers):
259 |             hy, output = layer(input, hidden[l], hidden_skip[l], cells[l], mask)
260 |             next_hidden.append(hy)
261 |             input = output
262 | 
263 |         if lstm:
264 |             next_h, next_c = zip(*next_hidden)
265 |             next_hidden = (
266 |                 torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
267 |                 torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
268 |             )
269 |         else:
270 |             next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())
271 | 
272 |         return next_hidden, input
273 | 
274 |     return forward
275 | 
276 | 
277 | def AutogradSkipConnectStep(num_layers=1, lstm=False):
278 |     layer = SkipConnectStep()
279 | 
280 |     func = StackedStep(layer,
281 |                        num_layers,
282 |                        lstm=lstm)
283 | 
284 |     def forward(input, cells, hidden, hidden_skip, mask):
285 |         nexth, output = func(input, hidden, hidden_skip, cells, mask)
286 |         return output, nexth
287 | 
288 |     return forward
289 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/variational_rnn.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | from torch.nn import functional as F
  5 | 
  6 | 
  7 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
  8 |     if noise_in is not None:
  9 |         input = input * noise_in
 10 |     if noise_hidden is not None:
 11 |         hidden = hidden * noise_hidden
 12 |     hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 13 |     return hy
 14 | 
 15 | 
 16 | def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 17 |     if noise_in is not None:
 18 |         input = input * noise_in
 19 |     if noise_hidden is not None:
 20 |         hidden = hidden * noise_hidden
 21 |     hy = torch.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 22 |     return hy
 23 | 
 24 | 
 25 | def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 26 |     input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 27 | 
 28 |     hx, cx = hidden
 29 |     hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 30 | 
 31 |     gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 32 | 
 33 |     ingate, forgetgate, cellgate, outgate = gates
 34 | 
 35 |     ingate = torch.sigmoid(ingate)
 36 |     forgetgate = torch.sigmoid(forgetgate)
 37 |     cellgate = torch.tanh(cellgate)
 38 |     outgate = torch.sigmoid(outgate)
 39 | 
 40 |     cy = (forgetgate * cx) + (ingate * cellgate)
 41 |     hy = outgate * torch.tanh(cy)
 42 | 
 43 |     return hy, cy
 44 | 
 45 | 
 46 | def VarFastLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 47 |     if noise_in is not None:
 48 |         input = input * noise_in
 49 | 
 50 |     hx, cx = hidden
 51 |     if noise_hidden is not None:
 52 |         hx = hx * noise_hidden
 53 |     gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
 54 | 
 55 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
 56 | 
 57 |     ingate = torch.sigmoid(ingate)
 58 |     forgetgate = torch.sigmoid(forgetgate)
 59 |     cellgate = torch.tanh(cellgate)
 60 |     outgate = torch.sigmoid(outgate)
 61 | 
 62 |     cy = (forgetgate * cx) + (ingate * cellgate)
 63 |     hy = outgate * torch.tanh(cy)
 64 | 
 65 |     return hy, cy
 66 | 
 67 | 
 68 | def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 69 |     input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 70 |     hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden
 71 | 
 72 |     gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
 73 |     gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 74 |     i_r, i_i, i_n = gi
 75 |     h_r, h_i, h_n = gh
 76 | 
 77 |     resetgate = torch.sigmoid(i_r + h_r)
 78 |     inputgate = torch.sigmoid(i_i + h_i)
 79 |     newgate = torch.tanh(i_n + resetgate * h_n)
 80 |     hy = newgate + inputgate * (hidden - newgate)
 81 | 
 82 |     return hy
 83 | 
 84 | 
 85 | def VarFastGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 86 |     if noise_in is not None:
 87 |         input = input * noise_in
 88 | 
 89 |     hx = hidden if noise_hidden is None else hidden * noise_hidden
 90 | 
 91 |     gi = F.linear(input, w_ih, b_ih)
 92 |     gh = F.linear(hx, w_hh, b_hh)
 93 |     i_r, i_i, i_n = gi.chunk(3, 1)
 94 |     h_r, h_i, h_n = gh.chunk(3, 1)
 95 | 
 96 |     resetgate = torch.sigmoid(i_r + h_r)
 97 |     inputgate = torch.sigmoid(i_i + h_i)
 98 |     newgate = torch.tanh(i_n + resetgate * h_n)
 99 |     hy = newgate + inputgate * (hidden - newgate)
100 | 
101 |     return hy
102 | 
103 | 
104 | def VarRecurrent(reverse=False):
105 |     def forward(input, hidden, cell, mask):
106 |         output = []
107 |         steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
108 |         for i in steps:
109 |             if mask is None or mask[i].data.min() > 0.5:
110 |                 hidden = cell(input[i], hidden)
111 |             elif mask[i].data.max() > 0.5:
112 |                 hidden_next = cell(input[i], hidden)
113 |                 # hack to handle LSTM
114 |                 if isinstance(hidden, tuple):
115 |                     hx, cx = hidden
116 |                     hp1, cp1 = hidden_next
117 |                     hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
118 |                 else:
119 |                     hidden = hidden + (hidden_next - hidden) * mask[i]
120 |             # hack to handle LSTM
121 |             output.append(hidden[0] if isinstance(hidden, tuple) else hidden)
122 | 
123 |         if reverse:
124 |             output.reverse()
125 |         output = torch.cat(output, 0).view(input.size(0), *output[0].size())
126 | 
127 |         return hidden, output
128 | 
129 |     return forward
130 | 
131 | 
132 | def StackedRNN(inners, num_layers, lstm=False):
133 |     num_directions = len(inners)
134 |     total_layers = num_layers * num_directions
135 | 
136 |     def forward(input, hidden, cells, mask):
137 |         assert (len(cells) == total_layers)
138 |         next_hidden = []
139 | 
140 |         if lstm:
141 |             hidden = list(zip(*hidden))
142 | 
143 |         for i in range(num_layers):
144 |             all_output = []
145 |             for j, inner in enumerate(inners):
146 |                 l = i * num_directions + j
147 |                 hy, output = inner(input, hidden[l], cells[l], mask)
148 |                 next_hidden.append(hy)
149 |                 all_output.append(output)
150 | 
151 |             input = torch.cat(all_output, input.dim() - 1)
152 | 
153 |         if lstm:
154 |             next_h, next_c = zip(*next_hidden)
155 |             next_hidden = (
156 |                 torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
157 |                 torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
158 |             )
159 |         else:
160 |             next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
161 | 
162 |         return next_hidden, input
163 | 
164 |     return forward
165 | 
166 | 
167 | def AutogradVarRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
168 |     rec_factory = VarRecurrent
169 | 
170 |     if bidirectional:
171 |         layer = (rec_factory(), rec_factory(reverse=True))
172 |     else:
173 |         layer = (rec_factory(),)
174 | 
175 |     func = StackedRNN(layer,
176 |                       num_layers,
177 |                       lstm=lstm)
178 | 
179 |     def forward(input, cells, hidden, mask):
180 |         if batch_first:
181 |             input = input.transpose(0, 1)
182 |             if mask is not None:
183 |                 mask = mask.transpose(0, 1)
184 | 
185 |         nexth, output = func(input, hidden, cells, mask)
186 | 
187 |         if batch_first:
188 |             output = output.transpose(0, 1)
189 | 
190 |         return output, nexth
191 | 
192 |     return forward
193 | 
194 | 
195 | def VarRNNStep():
196 |     def forward(input, hidden, cell, mask):
197 |         if mask is None or mask.data.min() > 0.5:
198 |             hidden = cell(input, hidden)
199 |         elif mask.data.max() > 0.5:
200 |             hidden_next = cell(input, hidden)
201 |             # hack to handle LSTM
202 |             if isinstance(hidden, tuple):
203 |                 hx, cx = hidden
204 |                 hp1, cp1 = hidden_next
205 |                 hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
206 |             else:
207 |                 hidden = hidden + (hidden_next - hidden) * mask
208 |         # hack to handle LSTM
209 |         output = hidden[0] if isinstance(hidden, tuple) else hidden
210 | 
211 |         return hidden, output
212 | 
213 |     return forward
214 | 
215 | 
216 | def StackedStep(layer, num_layers, lstm=False):
217 |     def forward(input, hidden, cells, mask):
218 |         assert (len(cells) == num_layers)
219 |         next_hidden = []
220 | 
221 |         if lstm:
222 |             hidden = list(zip(*hidden))
223 | 
224 |         for l in range(num_layers):
225 |             hy, output = layer(input, hidden[l], cells[l], mask)
226 |             next_hidden.append(hy)
227 |             input = output
228 | 
229 |         if lstm:
230 |             next_h, next_c = zip(*next_hidden)
231 |             next_hidden = (
232 |                 torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
233 |                 torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
234 |             )
235 |         else:
236 |             next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())
237 | 
238 |         return next_hidden, input
239 | 
240 |     return forward
241 | 
242 | 
243 | def AutogradVarRNNStep(num_layers=1, lstm=False):
244 |     layer = VarRNNStep()
245 | 
246 |     func = StackedStep(layer,
247 |                        num_layers,
248 |                        lstm=lstm)
249 | 
250 |     def forward(input, cells, hidden, mask):
251 |         nexth, output = func(input, hidden, cells, mask)
252 |         return output, nexth
253 | 
254 |     return forward
255 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/crf.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import torch.nn as nn
  6 | from torch.nn.parameter import Parameter
  7 | from neuronlp2.nn.modules import BiAffine
  8 | 
  9 | 
 10 | class ChainCRF(nn.Module):
 11 |     def __init__(self, input_size, num_labels, bigram=True):
 12 |         '''
 13 | 
 14 |         Args:
 15 |             input_size: int
 16 |                 the dimension of the input.
 17 |             num_labels: int
 18 |                 the number of labels of the crf layer
 19 |             bigram: bool
 20 |                 if apply bi-gram parameter.
 21 |         '''
 22 |         super(ChainCRF, self).__init__()
 23 |         self.input_size = input_size
 24 |         self.num_labels = num_labels + 1
 25 |         self.pad_label_id = num_labels
 26 |         self.bigram = bigram
 27 | 
 28 | 
 29 |         # state weight tensor
 30 |         self.state_net = nn.Linear(input_size, self.num_labels)
 31 |         if bigram:
 32 |             # transition weight tensor
 33 |             self.transition_net = nn.Linear(input_size, self.num_labels * self.num_labels)
 34 |             self.register_parameter('transition_matrix', None)
 35 |         else:
 36 |             self.transition_net = None
 37 |             self.transition_matrix = Parameter(torch.Tensor(self.num_labels, self.num_labels))
 38 | 
 39 |         self.reset_parameters()
 40 | 
 41 |     def reset_parameters(self):
 42 |         nn.init.constant_(self.state_net.bias, 0.)
 43 |         if self.bigram:
 44 |             nn.init.xavier_uniform_(self.transition_net.weight)
 45 |             nn.init.constant_(self.transition_net.bias, 0.)
 46 |         else:
 47 |             nn.init.normal_(self.transition_matrix)
 48 | 
 49 |     def forward(self, input, mask=None):
 50 |         '''
 51 | 
 52 |         Args:
 53 |             input: Tensor
 54 |                 the input tensor with shape = [batch, length, model_dim]
 55 |             mask: Tensor or None
 56 |                 the mask tensor with shape = [batch, length]
 57 | 
 58 |         Returns: Tensor
 59 |             the energy tensor with shape = [batch, length, num_label, num_label]
 60 | 
 61 |         '''
 62 |         batch, length, _ = input.size()
 63 | 
 64 |         # compute out_s by tensor dot [batch, length, model_dim] * [model_dim, num_label]
 65 |         # thus out_s should be [batch, length, num_label] --> [batch, length, num_label, 1]
 66 |         out_s = self.state_net(input).unsqueeze(2)
 67 | 
 68 |         if self.bigram:
 69 |             # compute out_s by tensor dot: [batch, length, model_dim] * [model_dim, num_label * num_label]
 70 |             # the output should be [batch, length, num_label,  num_label]
 71 |             out_t = self.transition_net(input).view(batch, length, self.num_labels, self.num_labels)
 72 |             output = out_t + out_s
 73 |         else:
 74 |             # [batch, length, num_label, num_label]
 75 |             output = self.transition_matrix + out_s
 76 | 
 77 |         if mask is not None:
 78 |             output = output * mask.unsqueeze(2).unsqueeze(3)
 79 | 
 80 |         return output
 81 | 
 82 |     def loss(self, input, target, mask=None):
 83 |         '''
 84 | 
 85 |         Args:
 86 |             input: Tensor
 87 |                 the input tensor with shape = [batch, length, model_dim]
 88 |             target: Tensor
 89 |                 the tensor of target labels with shape [batch, length]
 90 |             mask:Tensor or None
 91 |                 the mask tensor with shape = [batch, length]
 92 | 
 93 |         Returns: Tensor
 94 |                 A 1D tensor for minus log likelihood loss [batch]
 95 |         '''
 96 |         batch, length, _ = input.size()
 97 |         energy = self(input, mask=mask)
 98 |         # shape = [length, batch, num_label, num_label]
 99 |         energy_transpose = energy.transpose(0, 1)
100 |         # shape = [length, batch]
101 |         target_transpose = target.transpose(0, 1)
102 |         # shape = [length, batch, 1]
103 |         mask_transpose = None
104 |         if mask is not None:
105 |             mask_transpose = mask.unsqueeze(2).transpose(0, 1)
106 | 
107 |         # shape = [batch, num_label]
108 |         partition = None
109 | 
110 |         # shape = [batch]
111 |         batch_index = torch.arange(0, batch).type_as(input).long()
112 |         prev_label = input.new_full((batch, ), self.num_labels - 1).long()
113 |         tgt_energy = input.new_zeros(batch)
114 | 
115 |         for t in range(length):
116 |             # shape = [batch, num_label, num_label]
117 |             curr_energy = energy_transpose[t]
118 |             if t == 0:
119 |                 partition = curr_energy[:, -1, :]
120 |             else:
121 |                 # shape = [batch, num_label]
122 |                 partition_new = torch.logsumexp(curr_energy + partition.unsqueeze(2), dim=1)
123 |                 if mask_transpose is None:
124 |                     partition = partition_new
125 |                 else:
126 |                     mask_t = mask_transpose[t]
127 |                     partition = partition + (partition_new - partition) * mask_t
128 |             tgt_energy += curr_energy[batch_index, prev_label, target_transpose[t]]
129 |             prev_label = target_transpose[t]
130 | 
131 |         return torch.logsumexp(partition, dim=1) - tgt_energy
132 | 
133 |     def decode(self, input, mask=None, leading_symbolic=0):
134 |         """
135 | 
136 |         Args:
137 |             input: Tensor
138 |                 the input tensor with shape = [batch, length, model_dim]
139 |             mask: Tensor or None
140 |                 the mask tensor with shape = [batch, length]
141 |             leading_symbolic: nt
142 |                 number of symbolic labels leading in type alphabets (set it to 0 if you are not sure)
143 | 
144 |         Returns: Tensor
145 |             decoding results in shape [batch, length]
146 | 
147 |         """
148 | 
149 |         energy = self(input, mask=mask)
150 | 
151 |         # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
152 |         # For convenience, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
153 |         energy_transpose = energy.transpose(0, 1)
154 | 
155 |         # the last row and column is the tag for pad symbol. reduce these two dimensions by 1 to remove that.
156 |         # also remove the first #symbolic rows and columns.
157 |         # now the shape of energies_shuffled is [n_time_steps, b_batch, t, t] where t = num_labels - #symbolic - 1.
158 |         energy_transpose = energy_transpose[:, :, leading_symbolic:-1, leading_symbolic:-1]
159 | 
160 |         length, batch_size, num_label, _ = energy_transpose.size()
161 | 
162 |         batch_index = torch.arange(0, batch_size).type_as(input).long()
163 |         pi = input.new_zeros([length, batch_size, num_label])
164 |         pointer = batch_index.new_zeros(length, batch_size, num_label)
165 |         back_pointer = batch_index.new_zeros(length, batch_size)
166 | 
167 |         pi[0] = energy[:, 0, -1, leading_symbolic:-1]
168 |         pointer[0] = -1
169 |         for t in range(1, length):
170 |             pi_prev = pi[t - 1]
171 |             pi[t], pointer[t] = torch.max(energy_transpose[t] + pi_prev.unsqueeze(2), dim=1)
172 | 
173 |         _, back_pointer[-1] = torch.max(pi[-1], dim=1)
174 |         for t in reversed(range(length - 1)):
175 |             pointer_last = pointer[t + 1]
176 |             back_pointer[t] = pointer_last[batch_index, back_pointer[t + 1]]
177 | 
178 |         return back_pointer.transpose(0, 1) + leading_symbolic
179 | 
180 | 
181 | class TreeCRF(nn.Module):
182 |     '''
183 |     Tree CRF layer.
184 |     '''
185 |     def __init__(self, model_dim):
186 |         """
187 | 
188 |         Args:
189 |             model_dim: int
190 |                 the dimension of the input.
191 | 
192 |         """
193 |         super(TreeCRF, self).__init__()
194 |         self.model_dim = model_dim
195 |         self.energy = BiAffine(model_dim, model_dim)
196 | 
197 |     def forward(self, heads, children, mask=None):
198 |         '''
199 | 
200 |         Args:
201 |             heads: Tensor
202 |                 the head input tensor with shape = [batch, length, model_dim]
203 |             children: Tensor
204 |                 the child input tensor with shape = [batch, length, model_dim]
205 |             mask: Tensor or None
206 |                 the mask tensor with shape = [batch, length]
207 |             lengths: Tensor or None
208 |                 the length tensor with shape = [batch]
209 | 
210 |         Returns: Tensor
211 |             the energy tensor with shape = [batch, length, length]
212 | 
213 |         '''
214 |         batch, length, _ = heads.size()
215 |         # [batch, length, length]
216 |         output = self.energy(heads, children, mask_query=mask, mask_key=mask)
217 |         return output
218 | 
219 |     def loss(self, heads, children, target_heads, mask=None):
220 |         '''
221 | 
222 |         Args:
223 |             heads: Tensor
224 |                 the head input tensor with shape = [batch, length, model_dim]
225 |             children: Tensor
226 |                 the child input tensor with shape = [batch, length, model_dim]
227 |             target_heads: Tensor
228 |                 the tensor of target labels with shape [batch, length]
229 |             mask:Tensor or None
230 |                 the mask tensor with shape = [batch, length]
231 | 
232 |         Returns: Tensor
233 |                 A 1D tensor for minus log likelihood loss
234 |         '''
235 |         batch, length, _ = heads.size()
236 |         # [batch, length, length]
237 |         energy = self(heads, children, mask=mask).double()
238 |         A = torch.exp(energy)
239 |         # mask out invalid positions
240 |         if mask is not None:
241 |             mask = mask.double()
242 |             A = A * mask.unsqueeze(2) * mask.unsqueeze(1)
243 | 
244 |         # set diagonal elements to 0
245 |         diag_mask = 1.0 - torch.eye(length).unsqueeze(0).type_as(energy)
246 |         A = A * diag_mask
247 |         energy = energy * diag_mask
248 | 
249 |         # get D [batch, length]
250 |         D = A.sum(dim=1)
251 |         rtol = 1e-4
252 |         atol = 1e-6
253 |         D += atol
254 |         if mask is not None:
255 |             D = D * mask
256 | 
257 |         # [batch, length, length]
258 |         D = torch.diag_embed(D)
259 | 
260 |         # compute laplacian matrix
261 |         # [batch, length, length]
262 |         L = D - A
263 | 
264 |         if mask is not None:
265 |             L = L + torch.diag_embed(1. - mask)
266 | 
267 |         # compute partition Z(x) [batch]
268 |         L = L[:, 1:, 1:]
269 |         z = torch.logdet(L)
270 | 
271 |         # first create index matrix [length, batch]
272 |         index = torch.arange(0, length).view(length, 1).expand(length, batch)
273 |         index = index.type_as(energy).long()
274 |         batch_index = torch.arange(0, batch).type_as(index)
275 |         # compute target energy [length-1, batch]
276 |         tgt_energy = energy[batch_index, target_heads.t(), index][1:]
277 |         # sum over dim=0 shape = [batch]
278 |         tgt_energy = tgt_energy.sum(dim=0)
279 | 
280 |         return (z - tgt_energy).float()
281 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/init.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | def assign_tensor(tensor, val):
 7 |     """
 8 |     copy val to tensor
 9 |     Args:
10 |         tensor: an n-dimensional torch.Tensor or autograd.Variable
11 |         val: an n-dimensional torch.Tensor to fill the tensor with
12 | 
13 |     Returns:
14 | 
15 |     """
16 |     with torch.no_grad():
17 |         return tensor.copy_(val)
18 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/modules.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from overrides import overrides
  4 | from collections import OrderedDict
  5 | import math
  6 | import numpy as np
  7 | import torch
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from torch.nn.parameter import Parameter
 11 | 
 12 | 
 13 | class BiLinear(nn.Module):
 14 |     """
 15 |     Bi-linear layer
 16 |     """
 17 |     def __init__(self, left_features, right_features, out_features, bias=True):
 18 |         """
 19 | 
 20 |         Args:
 21 |             left_features: size of left input
 22 |             right_features: size of right input
 23 |             out_features: size of output
 24 |             bias: If set to False, the layer will not learn an additive bias.
 25 |                 Default: True
 26 |         """
 27 |         super(BiLinear, self).__init__()
 28 |         self.left_features = left_features
 29 |         self.right_features = right_features
 30 |         self.out_features = out_features
 31 | 
 32 |         self.U = Parameter(torch.Tensor(self.out_features, self.left_features, self.right_features))
 33 |         self.weight_left = Parameter(torch.Tensor(self.out_features, self.left_features))
 34 |         self.weight_right = Parameter(torch.Tensor(self.out_features, self.right_features))
 35 | 
 36 |         if bias:
 37 |             self.bias = Parameter(torch.Tensor(out_features))
 38 |         else:
 39 |             self.register_parameter('bias', None)
 40 | 
 41 |         self.reset_parameters()
 42 | 
 43 |     def reset_parameters(self):
 44 |         nn.init.xavier_uniform_(self.weight_left)
 45 |         nn.init.xavier_uniform_(self.weight_right)
 46 |         nn.init.constant_(self.bias, 0.)
 47 |         nn.init.xavier_uniform_(self.U)
 48 | 
 49 |     def forward(self, input_left, input_right):
 50 |         """
 51 | 
 52 |         Args:
 53 |             input_left: Tensor
 54 |                 the left input tensor with shape = [batch1, batch2, ..., left_features]
 55 |             input_right: Tensor
 56 |                 the right input tensor with shape = [batch1, batch2, ..., right_features]
 57 | 
 58 |         Returns:
 59 | 
 60 |         """
 61 | 
 62 |         batch_size = input_left.size()[:-1]
 63 |         batch = int(np.prod(batch_size))
 64 | 
 65 |         # convert left and right input to matrices [batch, left_features], [batch, right_features]
 66 |         input_left = input_left.view(batch, self.left_features)
 67 |         input_right = input_right.view(batch, self.right_features)
 68 | 
 69 |         # output [batch, out_features]
 70 |         output = F.bilinear(input_left, input_right, self.U, self.bias)
 71 |         output = output + F.linear(input_left, self.weight_left, None) + F.linear(input_right, self.weight_right, None)
 72 |         # convert back to [batch1, batch2, ..., out_features]
 73 |         return output.view(batch_size + (self.out_features, ))
 74 | 
 75 |     def __repr__(self):
 76 |         return self.__class__.__name__ + ' (' \
 77 |                + 'left_features=' + str(self.left_features) \
 78 |                + ', right_features=' + str(self.right_features) \
 79 |                + ', out_features=' + str(self.out_features) + ')'
 80 | 
 81 | 
 82 | class BiAffine(nn.Module):
 83 |     '''
 84 |     Bi-Affine energy layer.
 85 |     '''
 86 | 
 87 |     def __init__(self, key_dim, query_dim):
 88 |         '''
 89 | 
 90 |         Args:
 91 |             key_dim: int
 92 |                 the dimension of the key.
 93 |             query_dim: int
 94 |                 the dimension of the query.
 95 | 
 96 |         '''
 97 |         super(BiAffine, self).__init__()
 98 |         self.key_dim = key_dim
 99 |         self.query_dim = query_dim
100 | 
101 |         self.q_weight = Parameter(torch.Tensor(self.query_dim))
102 |         self.key_weight = Parameter(torch.Tensor(self.key_dim))
103 |         self.b = Parameter(torch.Tensor(1))
104 |         self.U = Parameter(torch.Tensor(self.query_dim, self.key_dim))
105 |         self.reset_parameters()
106 | 
107 |     def reset_parameters(self):
108 |         bound = 1 / math.sqrt(self.query_dim)
109 |         nn.init.uniform_(self.q_weight, -bound, bound)
110 |         bound = 1 / math.sqrt(self.key_dim)
111 |         nn.init.uniform_(self.key_weight, -bound, bound)
112 |         nn.init.constant_(self.b, 0.)
113 |         nn.init.xavier_uniform_(self.U)
114 | 
115 |     def forward(self, query, key, mask_query=None, mask_key=None):
116 |         """
117 | 
118 |         Args:
119 |             query: Tensor
120 |                 the decoder input tensor with shape = [batch, length_query, query_dim]
121 |             key: Tensor
122 |                 the child input tensor with shape = [batch, length_key, key_dim]
123 |             mask_query: Tensor or None
124 |                 the mask tensor for decoder with shape = [batch, length_query]
125 |             mask_key: Tensor or None
126 |                 the mask tensor for encoder with shape = [batch, length_key]
127 | 
128 |         Returns: Tensor
129 |             the energy tensor with shape = [batch, length_query, length_key]
130 | 
131 |         """
132 |         # output shape [batch, length_query, length_key]
133 |         # compute bi-affine part
134 |         # [batch, length_query, query_dim] * [query_dim, key_dim]
135 |         # output shape [batch, length_query, key_dim]
136 |         output = torch.matmul(query, self.U)
137 |         # [batch, length_query, key_dim] * [batch, key_dim, length_key]
138 |         # output shape [batch, length_query, length_key]
139 |         output = torch.matmul(output, key.transpose(1, 2))
140 | 
141 |         # compute query part: [query_dim] * [batch, query_dim, length_query]
142 |         # the output shape is [batch, length_query, 1]
143 |         out_q = torch.matmul(self.q_weight, query.transpose(1, 2)).unsqueeze(2)
144 |         # compute decoder part: [key_dim] * [batch, key_dim, length_key]
145 |         # the output shape is [batch, 1, length_key]
146 |         out_k = torch.matmul(self.key_weight, key.transpose(1, 2)).unsqueeze(1)
147 | 
148 |         output = output + out_q + out_k + self.b
149 | 
150 |         if mask_query is not None:
151 |             output = output * mask_query.unsqueeze(2)
152 |         if mask_key is not None:
153 |             output = output * mask_key.unsqueeze(1)
154 |         return output
155 | 
156 |     @overrides
157 |     def extra_repr(self):
158 |         s = '{key_dim}, {query_dim}'
159 |         return s.format(**self.__dict__)
160 | 
161 | 
162 | class CharCNN(nn.Module):
163 |     """
164 |     CNN layers for characters
165 |     """
166 |     def __init__(self, num_layers, in_channels, out_channels, hidden_channels=None, activation='elu'):
167 |         super(CharCNN, self).__init__()
168 |         assert activation in ['elu', 'tanh']
169 |         if activation == 'elu':
170 |             ACT = nn.ELU
171 |         else:
172 |             ACT = nn.Tanh
173 |         layers = list()
174 |         for i in range(num_layers - 1):
175 |             layers.append(('conv{}'.format(i), nn.Conv1d(in_channels, hidden_channels, kernel_size=3, padding=1)))
176 |             layers.append(('act{}'.format(i), ACT()))
177 |             in_channels = hidden_channels
178 |         layers.append(('conv_top', nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1)))
179 |         layers.append(('act_top', ACT()))
180 |         self.act = ACT
181 |         self.net = nn.Sequential(OrderedDict(layers))
182 | 
183 |         self.reset_parameters()
184 | 
185 |     def reset_parameters(self):
186 |         for layer in self.net:
187 |             if isinstance(layer, nn.Conv1d):
188 |                 nn.init.xavier_uniform_(layer.weight)
189 |                 nn.init.constant_(layer.bias, 0.)
190 |             else:
191 |                 assert isinstance(layer, self.act)
192 | 
193 |     def forward(self, char):
194 |         """
195 | 
196 |         Args:
197 |             char: Tensor
198 |                 the input tensor of character [batch, sent_length, char_length, in_channels]
199 | 
200 |         Returns: Tensor
201 |             output character encoding with shape [batch, sent_length, in_channels]
202 | 
203 |         """
204 |         # [batch, sent_length, char_length, in_channels]
205 |         char_size = char.size()
206 |         # first transform to [batch * sent_length, char_length, in_channels]
207 |         # then transpose to [batch * sent_length, in_channels, char_length]
208 |         char = char.view(-1, char_size[2], char_size[3]).transpose(1, 2)
209 |         # [batch * sent_length, out_channels, char_length]
210 |         char = self.net(char).max(dim=2)[0]
211 |         # [batch, sent_length, out_channels]
212 |         return char.view(char_size[0], char_size[1], -1)
213 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/utils.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | from itertools import repeat
 3 | import torch
 4 | import torch.nn as nn
 5 | from torch._six import inf
 6 | 
 7 | 
 8 | def _ntuple(n):
 9 |     def parse(x):
10 |         if isinstance(x, collections.Iterable):
11 |             return x
12 |         return tuple(repeat(x, n))
13 |     return parse
14 | 
15 | _single = _ntuple(1)
16 | _pair = _ntuple(2)
17 | _triple = _ntuple(3)
18 | _quadruple = _ntuple(4)
19 | 
20 | 
21 | def freeze_embedding(embedding):
22 |     assert isinstance(embedding, nn.Embedding), "input should be an Embedding module."
23 |     embedding.weight.detach_()
24 | 
25 | def total_grad_norm(parameters, norm_type=2):
26 |     if isinstance(parameters, torch.Tensor):
27 |         parameters = [parameters]
28 |     parameters = list(filter(lambda p: p.grad is not None, parameters))
29 |     norm_type = float(norm_type)
30 |     if norm_type == inf:
31 |         total_norm = max(p.grad.data.abs().max() for p in parameters)
32 |     else:
33 |         total_norm = 0
34 |         for p in parameters:
35 |             param_norm = p.grad.data.norm(norm_type)
36 |             total_norm += param_norm.item() ** norm_type
37 |         total_norm = total_norm ** (1. / norm_type)
38 |     return total_norm
39 | 


--------------------------------------------------------------------------------
/neuronlp2/optim/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.optim.lr_scheduler import InverseSquareRootScheduler, ExponentialScheduler
4 | 


--------------------------------------------------------------------------------
/neuronlp2/optim/lr_scheduler.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from collections import defaultdict
  4 | from torch.optim.optimizer import Optimizer
  5 | 
  6 | 
  7 | class _LRScheduler(object):
  8 |     def __init__(self, optimizer, last_epoch=-1):
  9 |         if not isinstance(optimizer, Optimizer):
 10 |             raise TypeError('{} is not an Optimizer'.format(
 11 |                 type(optimizer).__name__))
 12 |         self.optimizer = optimizer
 13 |         if last_epoch == -1:
 14 |             for group in optimizer.param_groups:
 15 |                 group.setdefault('initial_lr', group['lr'])
 16 |             last_epoch = 0
 17 |         else:
 18 |             for i, group in enumerate(optimizer.param_groups):
 19 |                 if 'initial_lr' not in group:
 20 |                     raise KeyError("param 'initial_lr' is not specified "
 21 |                                    "in param_groups[{}] when resuming an optimizer".format(i))
 22 |         self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
 23 | 
 24 |     def state_dict(self):
 25 |         """Returns the state of the scheduler as a :class:`dict`.
 26 | 
 27 |         It contains an entry for every variable in self.__dict__ which
 28 |         is not the optimizer.
 29 |         """
 30 |         return {key: value for key, value in self.__dict__.items() if key != 'optimizer'}
 31 | 
 32 |     def load_state_dict(self, state_dict):
 33 |         """Loads the schedulers state.
 34 | 
 35 |         Arguments:
 36 |             state_dict (dict): scheduler state. Should be an object returned
 37 |                 from a call to :meth:`state_dict`.
 38 |         """
 39 |         self.__dict__.update(state_dict)
 40 | 
 41 |     def get_lr(self):
 42 |         raise NotImplementedError
 43 | 
 44 |     def step(self, epoch=None):
 45 |         if epoch is None:
 46 |             epoch = self.last_epoch + 1
 47 |         self.last_epoch = epoch
 48 |         for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
 49 |             param_group['lr'] = lr
 50 | 
 51 |     def reset_state(self):
 52 |         self.optimizer.state.clear()
 53 | 
 54 | 
 55 | class InverseSquareRootScheduler(_LRScheduler):
 56 |     """
 57 |     Decay the LR based on the inverse square root of the update number.
 58 |     We also support a warmup phase where we linearly increase the learning rate
 59 |     from zero until the configured learning rate (``--lr``).
 60 |     Thereafter we decay proportional to the number of
 61 |     updates, with a decay factor set to align with the configured learning rate.
 62 |     During warmup::
 63 |       lrs = torch.linspace(0, args.lr, args.warmup_updates)
 64 |       lr = lrs[update_num]
 65 |     After warmup::
 66 |       decay_factor = args.lr * sqrt(args.warmup_updates)
 67 |       lr = decay_factor / sqrt(update_num)
 68 |     """
 69 |     def __init__(self, optimizer, warmup_steps, init_lr, last_epoch=-1):
 70 |         assert warmup_steps > 0, 'warmup steps should be larger than 0.'
 71 |         super(InverseSquareRootScheduler, self).__init__(optimizer, last_epoch)
 72 |         self.warmup_steps = float(warmup_steps)
 73 |         self.init_lr = init_lr
 74 |         self.lr_steps = [(base_lr - init_lr) / warmup_steps for base_lr in self.base_lrs]
 75 |         self.decay_factor = self.warmup_steps ** 0.5
 76 |         if last_epoch == -1:
 77 |             last_epoch = 0
 78 |         self.step(last_epoch)
 79 | 
 80 |     def get_lr(self):
 81 |         if self.last_epoch < self.warmup_steps:
 82 |             return [self.init_lr + lr_step * self.last_epoch for lr_step in self.lr_steps]
 83 |         else:
 84 |             lr_factor = self.decay_factor * self.last_epoch**-0.5
 85 |             return [base_lr * lr_factor for base_lr in self.base_lrs]
 86 | 
 87 | 
 88 | class ExponentialScheduler(_LRScheduler):
 89 |     """Set the learning rate of each parameter group to the initial lr decayed
 90 |     by gamma every epoch. When last_epoch=-1, sets initial lr as lr.
 91 |     We also support a warmup phase where we linearly increase the learning rate
 92 |     from zero until the configured learning rate (``--lr``).
 93 |     Args:
 94 |         optimizer (Optimizer): Wrapped optimizer.
 95 |         gamma (float): Multiplicative factor of learning rate decay.
 96 |         warmup_steps (int): Warmup steps..
 97 |         last_epoch (int): The index of last epoch. Default: -1.
 98 |     """
 99 | 
100 |     def __init__(self, optimizer, gamma, warmup_steps, init_lr, last_epoch=-1):
101 |         super(ExponentialScheduler, self).__init__(optimizer, last_epoch)
102 |         self.gamma = gamma
103 |         # handle warmup <= 0
104 |         self.warmup_steps = max(1, warmup_steps)
105 |         self.init_lr = init_lr
106 |         self.lr_steps = [(base_lr - init_lr) / self.warmup_steps for base_lr in self.base_lrs]
107 |         if last_epoch == -1:
108 |             last_epoch = 0
109 |         self.step(last_epoch)
110 | 
111 |     def get_lr(self):
112 |         if self.last_epoch < self.warmup_steps:
113 |             return [self.init_lr + lr_step * self.last_epoch for lr_step in self.lr_steps]
114 |         else:
115 |             lr_factor = self.gamma ** (self.last_epoch - self.warmup_steps)
116 |             return [base_lr * lr_factor for base_lr in self.base_lrs]
117 | 


--------------------------------------------------------------------------------
/neuronlp2/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from neuronlp2.tasks.parser import *
4 | 


--------------------------------------------------------------------------------
/neuronlp2/tasks/parser.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import re
  4 | import numpy as np
  5 | 
  6 | def is_uni_punctuation(word):
  7 |     match = re.match("^[^\w\s]+$]", word, flags=re.UNICODE)
  8 |     return match is not None
  9 | 
 10 | 
 11 | def is_punctuation(word, pos, punct_set=None):
 12 |     if punct_set is None:
 13 |         return is_uni_punctuation(word)
 14 |     else:
 15 |         return pos in punct_set
 16 | 
 17 | 
 18 | def eval(words, postags, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths,
 19 |          punct_set=None, symbolic_root=False, symbolic_end=False):
 20 |     batch_size, _ = words.shape
 21 |     ucorr = 0.
 22 |     lcorr = 0.
 23 |     total = 0.
 24 |     ucomplete_match = 0.
 25 |     lcomplete_match = 0.
 26 | 
 27 |     ucorr_nopunc = 0.
 28 |     lcorr_nopunc = 0.
 29 |     total_nopunc = 0.
 30 |     ucomplete_match_nopunc = 0.
 31 |     lcomplete_match_nopunc = 0.
 32 | 
 33 |     corr_root = 0.
 34 |     total_root = 0.
 35 |     start = 1 if symbolic_root else 0
 36 |     end = 1 if symbolic_end else 0
 37 |     for i in range(batch_size):
 38 |         ucm = 1.
 39 |         lcm = 1.
 40 |         ucm_nopunc = 1.
 41 |         lcm_nopunc = 1.
 42 |         for j in range(start, lengths[i] - end):
 43 |             word = word_alphabet.get_instance(words[i, j])
 44 |             pos = pos_alphabet.get_instance(postags[i, j])
 45 | 
 46 |             total += 1
 47 |             if heads[i, j] == heads_pred[i, j]:
 48 |                 ucorr += 1
 49 |                 if types[i, j] == types_pred[i, j]:
 50 |                     lcorr += 1
 51 |                 else:
 52 |                     lcm = 0
 53 |             else:
 54 |                 ucm = 0
 55 |                 lcm = 0
 56 | 
 57 |             if not is_punctuation(word, pos, punct_set):
 58 |                 total_nopunc += 1
 59 |                 if heads[i, j] == heads_pred[i, j]:
 60 |                     ucorr_nopunc += 1
 61 |                     if types[i, j] == types_pred[i, j]:
 62 |                         lcorr_nopunc += 1
 63 |                     else:
 64 |                         lcm_nopunc = 0
 65 |                 else:
 66 |                     ucm_nopunc = 0
 67 |                     lcm_nopunc = 0
 68 | 
 69 |             if heads[i, j] == 0:
 70 |                 total_root += 1
 71 |                 corr_root += 1 if heads_pred[i, j] == 0 else 0
 72 | 
 73 |         ucomplete_match += ucm
 74 |         lcomplete_match += lcm
 75 |         ucomplete_match_nopunc += ucm_nopunc
 76 |         lcomplete_match_nopunc += lcm_nopunc
 77 | 
 78 |     return (ucorr, lcorr, total, ucomplete_match, lcomplete_match), \
 79 |            (ucorr_nopunc, lcorr_nopunc, total_nopunc, ucomplete_match_nopunc, lcomplete_match_nopunc), \
 80 |            (corr_root, total_root), batch_size
 81 | 
 82 | 
 83 | def decode_MST(energies, lengths, leading_symbolic=0, labeled=True):
 84 |     """
 85 |     decode best parsing tree with MST algorithm.
 86 |     :param energies: energies: numpy 4D tensor
 87 |         energies of each edge. the shape is [batch_size, num_labels, n_steps, n_steps],
 88 |         where the summy root is at index 0.
 89 |     :param masks: numpy 2D tensor
 90 |         masks in the shape [batch_size, n_steps].
 91 |     :param leading_symbolic: int
 92 |         number of symbolic dependency types leading in type alphabets)
 93 |     :return:
 94 |     """
 95 | 
 96 |     def find_cycle(par):
 97 |         added = np.zeros([length], np.bool)
 98 |         added[0] = True
 99 |         cycle = set()
100 |         findcycle = False
101 |         for i in range(1, length):
102 |             if findcycle:
103 |                 break
104 | 
105 |             if added[i] or not curr_nodes[i]:
106 |                 continue
107 | 
108 |             # init cycle
109 |             tmp_cycle = set()
110 |             tmp_cycle.add(i)
111 |             added[i] = True
112 |             findcycle = True
113 |             l = i
114 | 
115 |             while par[l] not in tmp_cycle:
116 |                 l = par[l]
117 |                 if added[l]:
118 |                     findcycle = False
119 |                     break
120 |                 added[l] = True
121 |                 tmp_cycle.add(l)
122 | 
123 |             if findcycle:
124 |                 lorg = l
125 |                 cycle.add(lorg)
126 |                 l = par[lorg]
127 |                 while l != lorg:
128 |                     cycle.add(l)
129 |                     l = par[l]
130 |                 break
131 | 
132 |         return findcycle, cycle
133 | 
134 |     def chuLiuEdmonds():
135 |         par = np.zeros([length], dtype=np.int32)
136 |         # create best graph
137 |         par[0] = -1
138 |         for i in range(1, length):
139 |             # only interested at current nodes
140 |             if curr_nodes[i]:
141 |                 max_score = score_matrix[0, i]
142 |                 par[i] = 0
143 |                 for j in range(1, length):
144 |                     if j == i or not curr_nodes[j]:
145 |                         continue
146 | 
147 |                     new_score = score_matrix[j, i]
148 |                     if new_score > max_score:
149 |                         max_score = new_score
150 |                         par[i] = j
151 | 
152 |         # find a cycle
153 |         findcycle, cycle = find_cycle(par)
154 |         # no cycles, get all edges and return them.
155 |         if not findcycle:
156 |             final_edges[0] = -1
157 |             for i in range(1, length):
158 |                 if not curr_nodes[i]:
159 |                     continue
160 | 
161 |                 pr = oldI[par[i], i]
162 |                 ch = oldO[par[i], i]
163 |                 final_edges[ch] = pr
164 |             return
165 | 
166 |         cyc_len = len(cycle)
167 |         cyc_weight = 0.0
168 |         cyc_nodes = np.zeros([cyc_len], dtype=np.int32)
169 |         for id, cyc_node in enumerate(cycle):
170 |             cyc_nodes[id] = cyc_node
171 |             cyc_weight += score_matrix[par[cyc_node], cyc_node]
172 | 
173 |         rep = cyc_nodes[0]
174 |         for i in range(length):
175 |             if not curr_nodes[i] or i in cycle:
176 |                 continue
177 | 
178 |             max1 = float("-inf")
179 |             wh1 = -1
180 |             max2 = float("-inf")
181 |             wh2 = -1
182 | 
183 |             for j in cyc_nodes:
184 |                 if score_matrix[j, i] > max1:
185 |                     max1 = score_matrix[j, i]
186 |                     wh1 = j
187 | 
188 |                 scr = cyc_weight + score_matrix[i, j] - score_matrix[par[j], j]
189 | 
190 |                 if scr > max2:
191 |                     max2 = scr
192 |                     wh2 = j
193 | 
194 |             score_matrix[rep, i] = max1
195 |             oldI[rep, i] = oldI[wh1, i]
196 |             oldO[rep, i] = oldO[wh1, i]
197 |             score_matrix[i, rep] = max2
198 |             oldO[i, rep] = oldO[i, wh2]
199 |             oldI[i, rep] = oldI[i, wh2]
200 | 
201 |         rep_cons = []
202 |         for i in range(cyc_len):
203 |             rep_cons.append(set())
204 |             cyc_node = cyc_nodes[i]
205 |             for cc in reps[cyc_node]:
206 |                 rep_cons[i].add(cc)
207 | 
208 |         for cyc_node in cyc_nodes[1:]:
209 |             curr_nodes[cyc_node] = False
210 |             for cc in reps[cyc_node]:
211 |                 reps[rep].add(cc)
212 | 
213 |         chuLiuEdmonds()
214 | 
215 |         # check each node in cycle, if one of its representatives is a key in the final_edges, it is the one.
216 |         found = False
217 |         wh = -1
218 |         for i in range(cyc_len):
219 |             for repc in rep_cons[i]:
220 |                 if repc in final_edges:
221 |                     wh = cyc_nodes[i]
222 |                     found = True
223 |                     break
224 |             if found:
225 |                 break
226 | 
227 |         l = par[wh]
228 |         while l != wh:
229 |             ch = oldO[par[l], l]
230 |             pr = oldI[par[l], l]
231 |             final_edges[ch] = pr
232 |             l = par[l]
233 | 
234 |     if labeled:
235 |         assert energies.ndim == 4, 'dimension of energies is not equal to 4'
236 |     else:
237 |         assert energies.ndim == 3, 'dimension of energies is not equal to 3'
238 |     input_shape = energies.shape
239 |     batch_size = input_shape[0]
240 |     max_length = input_shape[2]
241 | 
242 |     pars = np.zeros([batch_size, max_length], dtype=np.int32)
243 |     types = np.zeros([batch_size, max_length], dtype=np.int32) if labeled else None
244 |     for i in range(batch_size):
245 |         energy = energies[i]
246 | 
247 |         # calc the realy length of this instance
248 |         length = lengths[i]
249 | 
250 |         # calc real energy matrix shape = [length, length, num_labels - #symbolic] (remove the label for symbolic types).
251 |         if labeled:
252 |             energy = energy[leading_symbolic:, :length, :length]
253 |             energy = energy - energy.min() + 1e-6
254 |             # get best label for each edge.
255 |             label_id_matrix = energy.argmax(axis=0) + leading_symbolic
256 |             energy = energy.max(axis=0)
257 |         else:
258 |             energy = energy[:length, :length]
259 |             energy = energy - energy.min() + 1e-6
260 |             label_id_matrix = None
261 |         # get original score matrix
262 |         orig_score_matrix = energy
263 |         # initialize score matrix to original score matrix
264 |         score_matrix = np.array(orig_score_matrix, copy=True)
265 | 
266 |         oldI = np.zeros([length, length], dtype=np.int32)
267 |         oldO = np.zeros([length, length], dtype=np.int32)
268 |         curr_nodes = np.zeros([length], dtype=np.bool)
269 |         reps = []
270 | 
271 |         for s in range(length):
272 |             orig_score_matrix[s, s] = 0.0
273 |             score_matrix[s, s] = 0.0
274 |             curr_nodes[s] = True
275 |             reps.append(set())
276 |             reps[s].add(s)
277 |             for t in range(s + 1, length):
278 |                 oldI[s, t] = s
279 |                 oldO[s, t] = t
280 | 
281 |                 oldI[t, s] = t
282 |                 oldO[t, s] = s
283 | 
284 |         final_edges = dict()
285 |         chuLiuEdmonds()
286 |         par = np.zeros([max_length], np.int32)
287 |         if labeled:
288 |             type = np.ones([max_length], np.int32)
289 |             type[0] = 0
290 |         else:
291 |             type = None
292 | 
293 |         for ch, pr in final_edges.items():
294 |             par[ch] = pr
295 |             if labeled and ch != 0:
296 |                 type[ch] = label_id_matrix[pr, ch]
297 | 
298 |         par[0] = 0
299 |         pars[i] = par
300 |         if labeled:
301 |             types[i] = type
302 | 
303 |     return pars, types
304 | 


--------------------------------------------------------------------------------
/neuronlp2/utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from collections import OrderedDict
  4 | import pickle
  5 | import numpy as np
  6 | from gensim.models.word2vec import Word2Vec
  7 | import gzip
  8 | 
  9 | from neuronlp2.io.common import DIGIT_RE
 10 | 
 11 | 
 12 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
 13 |     """
 14 |     load word embeddings from file
 15 |     :param embedding:
 16 |     :param embedding_path:
 17 |     :return: embedding dict, embedding dimention, caseless
 18 |     """
 19 |     print("loading embedding: %s from %s" % (embedding, embedding_path))
 20 |     if embedding == 'word2vec':
 21 |         # loading word2vec
 22 |         word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
 23 |         embedd_dim = word2vec.vector_size
 24 |         return word2vec, embedd_dim
 25 |     elif embedding == 'glove':
 26 |         # loading GloVe
 27 |         embedd_dim = -1
 28 |         embedd_dict = OrderedDict()
 29 |         with gzip.open(embedding_path, 'rt') as file:
 30 |             for line in file:
 31 |                 line = line.strip()
 32 |                 if len(line) == 0:
 33 |                     continue
 34 | 
 35 |                 tokens = line.split()
 36 |                 if embedd_dim < 0:
 37 |                     embedd_dim = len(tokens) - 1
 38 |                 else:
 39 |                     assert (embedd_dim + 1 == len(tokens))
 40 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 41 |                 embedd[:] = tokens[1:]
 42 |                 word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
 43 |                 embedd_dict[word] = embedd
 44 |         return embedd_dict, embedd_dim
 45 |     elif embedding == 'senna':
 46 |         # loading Senna
 47 |         embedd_dim = -1
 48 |         embedd_dict = OrderedDict()
 49 |         with gzip.open(embedding_path, 'rt') as file:
 50 |             for line in file:
 51 |                 line = line.strip()
 52 |                 if len(line) == 0:
 53 |                     continue
 54 | 
 55 |                 tokens = line.split()
 56 |                 if embedd_dim < 0:
 57 |                     embedd_dim = len(tokens) - 1
 58 |                 else:
 59 |                     assert (embedd_dim + 1 == len(tokens))
 60 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 61 |                 embedd[:] = tokens[1:]
 62 |                 word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
 63 |                 embedd_dict[word] = embedd
 64 |         return embedd_dict, embedd_dim
 65 |     elif embedding == 'sskip':
 66 |         embedd_dim = -1
 67 |         embedd_dict = OrderedDict()
 68 |         with gzip.open(embedding_path, 'rt') as file:
 69 |             # skip the first line
 70 |             file.readline()
 71 |             for line in file:
 72 |                 line = line.strip()
 73 |                 try:
 74 |                     if len(line) == 0:
 75 |                         continue
 76 | 
 77 |                     tokens = line.split()
 78 |                     if len(tokens) < embedd_dim:
 79 |                         continue
 80 | 
 81 |                     if embedd_dim < 0:
 82 |                         embedd_dim = len(tokens) - 1
 83 | 
 84 |                     embedd = np.empty([1, embedd_dim], dtype=np.float32)
 85 |                     start = len(tokens) - embedd_dim
 86 |                     word = ' '.join(tokens[0:start])
 87 |                     embedd[:] = tokens[start:]
 88 |                     word = DIGIT_RE.sub("0", word) if normalize_digits else word
 89 |                     embedd_dict[word] = embedd
 90 |                 except UnicodeDecodeError:
 91 |                     continue
 92 |         return embedd_dict, embedd_dim
 93 |     elif embedding == 'polyglot':
 94 |         words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1')
 95 |         _, embedd_dim = embeddings.shape
 96 |         embedd_dict = OrderedDict()
 97 |         for i, word in enumerate(words):
 98 |             embedd = np.empty([1, embedd_dim], dtype=np.float32)
 99 |             embedd[:] = embeddings[i, :]
100 |             word = DIGIT_RE.sub("0", word) if normalize_digits else word
101 |             embedd_dict[word] = embedd
102 |         return embedd_dict, embedd_dim
103 | 
104 |     else:
105 |         raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")
106 | 


--------------------------------------------------------------------------------