├── .gitignore ├── LICENSE ├── README.md ├── experiments ├── configs │ ├── ner │ │ └── conll03.json │ ├── parsing │ │ ├── biaffine.json │ │ ├── neuromst.json │ │ └── stackptr.json │ └── pos │ │ └── wsj.json ├── eval │ ├── conll03eval.v2 │ └── conll06eval.pl ├── ner.py ├── parsing.py ├── pos_tagging.py └── scripts │ ├── run_analyze.sh │ ├── run_deepbiaf.sh │ ├── run_ner_conll03.sh │ ├── run_neuromst.sh │ ├── run_pos_wsj.sh │ └── run_stackptr.sh └── neuronlp2 ├── __init__.py ├── io ├── __init__.py ├── alphabet.py ├── common.py ├── conll03_data.py ├── conllx_data.py ├── conllx_stacked_data.py ├── instance.py ├── logger.py ├── reader.py ├── utils.py └── writer.py ├── models ├── __init__.py ├── parsing.py └── sequence_labeling.py ├── nn ├── __init__.py ├── _functions │ ├── __init__.py │ ├── rnnFusedBackend.py │ ├── skipconnect_rnn.py │ └── variational_rnn.py ├── crf.py ├── init.py ├── modules.py ├── skip_rnn.py ├── utils.py └── variational_rnn.py ├── optim ├── __init__.py └── lr_scheduler.py ├── tasks ├── __init__.py └── parser.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # data dir 104 | /experiments/data/ 105 | 106 | # model dir 107 | /experiments/models/ 108 | 109 | # log dir 110 | /experiments/log/ 111 | 112 | # test dir 113 | /tests/ 114 | 115 | # IDE 116 | /.idea/ 117 | *.iml 118 | *.sublime-project 119 | *.sublime-workspace 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NeuroNLP2 2 | Deep neural models for core NLP tasks based on Pytorch(version 2) 3 | 4 | This is the code we used in the following papers 5 | >[End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF](http://www.cs.cmu.edu/~xuezhem/publications/P16-1101.pdf) 6 | 7 | >Xuezhe Ma, Eduard Hovy 8 | 9 | >ACL 2016 10 | 11 | >[Neural Probabilistic Model for Non-projective MST Parsing](http://www.cs.cmu.edu/~xuezhem/publications/IJCNLP2017.pdf) 12 | 13 | >Xuezhe Ma, Eduard Hovy 14 | 15 | >IJCNLP 2017 16 | 17 | >[Stack-Pointer Networks for Dependency Parsing](https://arxiv.org/pdf/1805.01087.pdf) 18 | 19 | >Xuezhe Ma, Zecong Hu, Jingzhou Liu, Nanyun Peng, Graham Neubig and Eduard Hovy 20 | 21 | >ACL 2018 22 | 23 | It also includes the re-implementation of the Stanford Deep BiAffine Parser: 24 | >[Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734) 25 | 26 | >Timothy Dozat, Christopher D. Manning 27 | 28 | >ICLR 2017 29 | 30 | ## Updates 31 | 1. Upgraded the code to support PyTorch 1.3 and Python 3.6 32 | 2. Re-factored code to better organization 33 | 3. Implemented the batch version of Stack-Pointer Parser decoding algorithm, about 50 times faster! 34 | 35 | ## Requirements 36 | 37 | Python 3.6, PyTorch >=1.3.1, Gensim >= 0.12.0 38 | 39 | ## Data format 40 | For the data format used in our implementation, please read this [issue](https://github.com/XuezheMax/NeuroNLP2/issues/9). 41 | 42 | ## Running the experiments 43 | First to the experiments folder: 44 | 45 | cd experiments 46 | ### Sequence labeling 47 | To train a CRF POS tagger of PTB WSJ corpus, 48 | 49 | ./scripts/run_pos_wsj.sh 50 | where the arguments for ```train/dev/test``` data, together with the pretrained word embedding should be setup. 51 | 52 | To train a NER model on CoNLL-2003 English data set, 53 | 54 | ./scripts/run_ner_conll03.sh 55 | 56 | ### Dependency Parsing 57 | To train a Stack-Pointer parser, simply run 58 | 59 | ./scripts/run_stackptr.sh 60 | Remeber to setup the paths for data and embeddings. 61 | 62 | To train a Deep BiAffine parser, simply run 63 | 64 | ./scripts/run_deepbiaf.sh 65 | Again, remember to setup the paths for data and embeddings. 66 | 67 | To train a Neural MST parser, 68 | 69 | ./scripts/run_neuromst.sh 70 | -------------------------------------------------------------------------------- /experiments/configs/ner/conll03.json: -------------------------------------------------------------------------------- 1 | { 2 | "crf": true, 3 | "bigram": true, 4 | "embedd_dim": 100, 5 | "char_dim": 30, 6 | "rnn_mode": "LSTM", 7 | "num_layers":1, 8 | "hidden_size": 256, 9 | "out_features": 128, 10 | "dropout": "std", 11 | "p_in": 0.33, 12 | "p_out": 0.5, 13 | "p_rnn": [0.33, 0.5], 14 | "activation": "elu" 15 | } 16 | -------------------------------------------------------------------------------- /experiments/configs/parsing/biaffine.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "DeepBiAffine", 3 | "word_dim": 100, 4 | "char_dim": 100, 5 | "pos": true, 6 | "pos_dim": 100, 7 | "rnn_mode": "FastLSTM", 8 | "num_layers":3, 9 | "hidden_size": 512, 10 | "arc_space": 512, 11 | "type_space": 128, 12 | "p_in": 0.33, 13 | "p_out": 0.33, 14 | "p_rnn": [0.33, 0.33], 15 | "activation": "elu" 16 | } 17 | -------------------------------------------------------------------------------- /experiments/configs/parsing/neuromst.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "NeuroMST", 3 | "word_dim": 100, 4 | "char_dim": 100, 5 | "pos": true, 6 | "pos_dim": 100, 7 | "rnn_mode": "FastLSTM", 8 | "num_layers":3, 9 | "hidden_size": 512, 10 | "arc_space": 512, 11 | "type_space": 128, 12 | "p_in": 0.33, 13 | "p_out": 0.33, 14 | "p_rnn": [0.33, 0.33], 15 | "activation": "elu" 16 | } 17 | -------------------------------------------------------------------------------- /experiments/configs/parsing/stackptr.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "StackPtr", 3 | "word_dim": 100, 4 | "char_dim": 100, 5 | "pos": true, 6 | "pos_dim": 100, 7 | "rnn_mode": "FastLSTM", 8 | "encoder_layers":3, 9 | "decoder_layers":1, 10 | "hidden_size": 512, 11 | "arc_space": 512, 12 | "type_space": 128, 13 | "p_in": 0.33, 14 | "p_out": 0.33, 15 | "p_rnn": [0.33, 0.33], 16 | "prior_order": "inside_out", 17 | "grandPar": true, 18 | "sibling": true, 19 | "activation": "elu" 20 | } 21 | -------------------------------------------------------------------------------- /experiments/configs/pos/wsj.json: -------------------------------------------------------------------------------- 1 | { 2 | "crf": true, 3 | "bigram": true, 4 | "embedd_dim": 100, 5 | "char_dim": 30, 6 | "rnn_mode": "LSTM", 7 | "num_layers":1, 8 | "hidden_size": 256, 9 | "out_features": 256, 10 | "dropout": "std", 11 | "p_in": 0.33, 12 | "p_out": 0.5, 13 | "p_rnn": [0.33, 0.5], 14 | "activation": "elu" 15 | } 16 | -------------------------------------------------------------------------------- /experiments/eval/conll03eval.v2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; } 265 | 266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 269 | 270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; } 274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; } 275 | 276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; } 279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; } 281 | 282 | 283 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) { 284 | $chunkEnd = $true; 285 | } 286 | 287 | # corrected 1998-12-22: these chunks are assumed to have length 1 288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 290 | 291 | return($chunkEnd); 292 | } 293 | 294 | # startOfChunk: checks if a chunk started between the previous and current word 295 | # arguments: previous and current chunk tags, previous and current types 296 | # note: this code is capable of handling other chunk representations 297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 299 | 300 | sub startOfChunk { 301 | my $prevTag = shift(@_); 302 | my $tag = shift(@_); 303 | my $prevType = shift(@_); 304 | my $type = shift(@_); 305 | my $chunkStart = $false; 306 | 307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; } 317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; } 318 | 319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; } 321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 322 | 323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; } 324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 326 | 327 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 328 | $chunkStart = $true; 329 | } 330 | 331 | # corrected 1998-12-22: these chunks are assumed to have length 1 332 | if ( $tag eq "[" ) { $chunkStart = $true; } 333 | if ( $tag eq "]" ) { $chunkStart = $true; } 334 | 335 | return($chunkStart); 336 | } 337 | -------------------------------------------------------------------------------- /experiments/ner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Bi-directional LSTM-CNNs-CRF model for NER. 3 | """ 4 | 5 | import os 6 | import sys 7 | import gc 8 | import json 9 | 10 | current_path = os.path.dirname(os.path.realpath(__file__)) 11 | root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 12 | sys.path.append(root_path) 13 | 14 | import time 15 | import argparse 16 | 17 | import numpy as np 18 | import torch 19 | from torch.optim.adamw import AdamW 20 | from torch.optim import SGD 21 | from torch.nn.utils import clip_grad_norm_ 22 | from neuronlp2.io import get_logger, conll03_data, CoNLL03Writer, iterate_data 23 | from neuronlp2.models import BiRecurrentConv, BiVarRecurrentConv, BiRecurrentConvCRF, BiVarRecurrentConvCRF 24 | from neuronlp2.optim import ExponentialScheduler 25 | from neuronlp2 import utils 26 | 27 | 28 | def evaluate(output_file, scorefile): 29 | script = os.path.join(current_path, 'eval/conll03eval.v2') 30 | os.system("perl %s < %s > %s" % (script, output_file, scorefile)) 31 | with open(scorefile, 'r') as fin: 32 | fin.readline() 33 | line = fin.readline() 34 | fields = line.split(";") 35 | acc = float(fields[0].split(":")[1].strip()[:-1]) 36 | precision = float(fields[1].split(":")[1].strip()[:-1]) 37 | recall = float(fields[2].split(":")[1].strip()[:-1]) 38 | f1 = float(fields[3].split(":")[1].strip()) 39 | return acc, precision, recall, f1 40 | 41 | 42 | def get_optimizer(parameters, optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps): 43 | if optim == 'sgd': 44 | optimizer = SGD(parameters, lr=learning_rate, momentum=0.9, weight_decay=weight_decay, nesterov=True) 45 | else: 46 | optimizer = AdamW(parameters, lr=learning_rate, betas=(0.9, 0.999), eps=1e-8, amsgrad=amsgrad, weight_decay=weight_decay) 47 | init_lr = 1e-7 48 | scheduler = ExponentialScheduler(optimizer, lr_decay, warmup_steps, init_lr) 49 | return optimizer, scheduler 50 | 51 | 52 | def eval(data, network, writer, outfile, scorefile, device): 53 | network.eval() 54 | writer.start(outfile) 55 | for data in iterate_data(data, 256): 56 | words = data['WORD'].to(device) 57 | chars = data['CHAR'].to(device) 58 | labels = data['NER'].numpy() 59 | masks = data['MASK'].to(device) 60 | postags = data['POS'].numpy() 61 | chunks = data['CHUNK'].numpy() 62 | lengths = data['LENGTH'].numpy() 63 | preds = network.decode(words, chars, mask=masks, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) 64 | writer.write(words.cpu().numpy(), postags, chunks, preds.cpu().numpy(), labels, lengths) 65 | writer.close() 66 | acc, precision, recall, f1 = evaluate(outfile, scorefile) 67 | return acc, precision, recall, f1 68 | 69 | 70 | def main(): 71 | parser = argparse.ArgumentParser(description='NER with bi-directional RNN-CNN') 72 | parser.add_argument('--config', type=str, help='config file', required=True) 73 | parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs') 74 | parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch') 75 | parser.add_argument('--loss_type', choices=['sentence', 'token'], default='sentence', help='loss type (default: sentence)') 76 | parser.add_argument('--optim', choices=['sgd', 'adam'], help='type of optimizer', required=True) 77 | parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') 78 | parser.add_argument('--lr_decay', type=float, default=0.999995, help='Decay rate of learning rate') 79 | parser.add_argument('--amsgrad', action='store_true', help='AMS Grad') 80 | parser.add_argument('--grad_clip', type=float, default=0, help='max norm for gradient clip (default 0: no clip') 81 | parser.add_argument('--warmup_steps', type=int, default=0, metavar='N', help='number of steps to warm up (default: 0)') 82 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight for l2 norm decay') 83 | parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') 84 | parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True) 85 | parser.add_argument('--embedding_dict', help='path for embedding dict') 86 | parser.add_argument('--train', help='path for training file.', required=True) 87 | parser.add_argument('--dev', help='path for dev file.', required=True) 88 | parser.add_argument('--test', help='path for test file.', required=True) 89 | parser.add_argument('--model_path', help='path for saving model file.', required=True) 90 | 91 | args = parser.parse_args() 92 | 93 | logger = get_logger("NER") 94 | 95 | args.cuda = torch.cuda.is_available() 96 | device = torch.device('cuda', 0) if args.cuda else torch.device('cpu') 97 | train_path = args.train 98 | dev_path = args.dev 99 | test_path = args.test 100 | 101 | num_epochs = args.num_epochs 102 | batch_size = args.batch_size 103 | optim = args.optim 104 | learning_rate = args.learning_rate 105 | lr_decay = args.lr_decay 106 | amsgrad = args.amsgrad 107 | warmup_steps = args.warmup_steps 108 | weight_decay = args.weight_decay 109 | grad_clip = args.grad_clip 110 | 111 | loss_ty_token = args.loss_type == 'token' 112 | unk_replace = args.unk_replace 113 | 114 | model_path = args.model_path 115 | model_name = os.path.join(model_path, 'model.pt') 116 | embedding = args.embedding 117 | embedding_path = args.embedding_dict 118 | 119 | print(args) 120 | 121 | embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path) 122 | 123 | logger.info("Creating Alphabets") 124 | alphabet_path = os.path.join(model_path, 'alphabets') 125 | word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = conll03_data.create_alphabets(alphabet_path, train_path, 126 | data_paths=[dev_path, test_path], 127 | embedd_dict=embedd_dict, max_vocabulary_size=50000) 128 | 129 | logger.info("Word Alphabet Size: %d" % word_alphabet.size()) 130 | logger.info("Character Alphabet Size: %d" % char_alphabet.size()) 131 | logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) 132 | logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size()) 133 | logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) 134 | 135 | logger.info("Reading Data") 136 | 137 | data_train = conll03_data.read_bucketed_data(train_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 138 | num_data = sum(data_train[1]) 139 | num_labels = ner_alphabet.size() 140 | 141 | data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 142 | data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 143 | 144 | writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 145 | 146 | def construct_word_embedding_table(): 147 | scale = np.sqrt(3.0 / embedd_dim) 148 | table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) 149 | table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32) 150 | oov = 0 151 | for word, index in word_alphabet.items(): 152 | if word in embedd_dict: 153 | embedding = embedd_dict[word] 154 | elif word.lower() in embedd_dict: 155 | embedding = embedd_dict[word.lower()] 156 | else: 157 | embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32) 158 | oov += 1 159 | table[index, :] = embedding 160 | print('oov: %d' % oov) 161 | return torch.from_numpy(table) 162 | 163 | word_table = construct_word_embedding_table() 164 | 165 | logger.info("constructing network...") 166 | 167 | hyps = json.load(open(args.config, 'r')) 168 | json.dump(hyps, open(os.path.join(model_path, 'config.json'), 'w'), indent=2) 169 | dropout = hyps['dropout'] 170 | crf = hyps['crf'] 171 | bigram = hyps['bigram'] 172 | assert embedd_dim == hyps['embedd_dim'] 173 | char_dim = hyps['char_dim'] 174 | mode = hyps['rnn_mode'] 175 | hidden_size = hyps['hidden_size'] 176 | out_features = hyps['out_features'] 177 | num_layers = hyps['num_layers'] 178 | p_in = hyps['p_in'] 179 | p_out = hyps['p_out'] 180 | p_rnn = hyps['p_rnn'] 181 | activation = hyps['activation'] 182 | 183 | if dropout == 'std': 184 | if crf: 185 | network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 186 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation) 187 | else: 188 | network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 189 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 190 | elif dropout == 'variational': 191 | if crf: 192 | network = BiVarRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 193 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation) 194 | else: 195 | network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 196 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 197 | else: 198 | raise ValueError('Unkown dropout type: {}'.format(dropout)) 199 | 200 | network = network.to(device) 201 | 202 | optimizer, scheduler = get_optimizer(network.parameters(), optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps) 203 | model = "{}-CNN{}".format(mode, "-CRF" if crf else "") 204 | logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" % (model, num_layers, hidden_size, activation)) 205 | logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (weight_decay, num_data, batch_size, unk_replace)) 206 | logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" % (dropout, p_in, p_out, p_rnn)) 207 | print('# of Parameters: %d' % (sum([param.numel() for param in network.parameters()]))) 208 | 209 | best_f1 = 0.0 210 | best_acc = 0.0 211 | best_precision = 0.0 212 | best_recall = 0.0 213 | test_f1 = 0.0 214 | test_acc = 0.0 215 | test_precision = 0.0 216 | test_recall = 0.0 217 | best_epoch = 0 218 | patient = 0 219 | num_batches = num_data // batch_size + 1 220 | result_path = os.path.join(model_path, 'tmp') 221 | if not os.path.exists(result_path): 222 | os.makedirs(result_path) 223 | for epoch in range(1, num_epochs + 1): 224 | start_time = time.time() 225 | train_loss = 0. 226 | num_insts = 0 227 | num_words = 0 228 | num_back = 0 229 | network.train() 230 | lr = scheduler.get_lr()[0] 231 | print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' % (epoch, optim, lr, lr_decay, amsgrad, weight_decay)) 232 | if args.cuda: 233 | torch.cuda.empty_cache() 234 | gc.collect() 235 | for step, data in enumerate(iterate_data(data_train, batch_size, bucketed=True, unk_replace=unk_replace, shuffle=True)): 236 | optimizer.zero_grad() 237 | words = data['WORD'].to(device) 238 | chars = data['CHAR'].to(device) 239 | labels = data['NER'].to(device) 240 | masks = data['MASK'].to(device) 241 | 242 | nbatch = words.size(0) 243 | nwords = masks.sum().item() 244 | 245 | loss_total = network.loss(words, chars, labels, mask=masks).sum() 246 | if loss_ty_token: 247 | loss = loss_total.div(nwords) 248 | else: 249 | loss = loss_total.div(nbatch) 250 | loss.backward() 251 | if grad_clip > 0: 252 | clip_grad_norm_(network.parameters(), grad_clip) 253 | optimizer.step() 254 | scheduler.step() 255 | 256 | with torch.no_grad(): 257 | num_insts += nbatch 258 | num_words += nwords 259 | train_loss += loss_total.item() 260 | 261 | # update log 262 | if step % 100 == 0: 263 | torch.cuda.empty_cache() 264 | sys.stdout.write("\b" * num_back) 265 | sys.stdout.write(" " * num_back) 266 | sys.stdout.write("\b" * num_back) 267 | curr_lr = scheduler.get_lr()[0] 268 | log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (step, num_batches, 100. * step / num_batches, 269 | curr_lr, train_loss / num_insts, train_loss / num_words) 270 | sys.stdout.write(log_info) 271 | sys.stdout.flush() 272 | num_back = len(log_info) 273 | 274 | sys.stdout.write("\b" * num_back) 275 | sys.stdout.write(" " * num_back) 276 | sys.stdout.write("\b" * num_back) 277 | print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' % (num_insts, num_words, train_loss / num_insts, 278 | train_loss / num_words, time.time() - start_time)) 279 | print('-' * 100) 280 | 281 | # evaluate performance on dev data 282 | with torch.no_grad(): 283 | outfile = os.path.join(result_path, 'pred_dev%d' % epoch) 284 | scorefile = os.path.join(result_path, "score_dev%d" % epoch) 285 | acc, precision, recall, f1 = eval(data_dev, network, writer, outfile, scorefile, device) 286 | print('Dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) 287 | if best_f1 < f1: 288 | torch.save(network.state_dict(), model_name) 289 | best_f1 = f1 290 | best_acc = acc 291 | best_precision = precision 292 | best_recall = recall 293 | best_epoch = epoch 294 | 295 | # evaluate on test data when better performance detected 296 | outfile = os.path.join(result_path, 'pred_test%d' % epoch) 297 | scorefile = os.path.join(result_path, "score_test%d" % epoch) 298 | test_acc, test_precision, test_recall, test_f1 = eval(data_test, network, writer, outfile, scorefile, device) 299 | print('test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (test_acc, test_precision, test_recall, test_f1)) 300 | patient = 0 301 | else: 302 | patient += 1 303 | print('-' * 100) 304 | 305 | print("Best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (best_acc, best_precision, best_recall, best_f1, best_epoch, patient)) 306 | print("Best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (test_acc, test_precision, test_recall, test_f1, best_epoch, patient)) 307 | print('=' * 100) 308 | 309 | if patient > 4: 310 | logger.info('reset optimizer momentums') 311 | scheduler.reset_state() 312 | patient = 0 313 | 314 | 315 | if __name__ == '__main__': 316 | main() 317 | -------------------------------------------------------------------------------- /experiments/pos_tagging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Bi-directional LSTM-CNNs-CRF model for POS Tagging. 3 | """ 4 | 5 | import os 6 | import sys 7 | import gc 8 | import json 9 | 10 | current_path = os.path.dirname(os.path.realpath(__file__)) 11 | root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 12 | sys.path.append(root_path) 13 | 14 | import time 15 | import argparse 16 | 17 | import numpy as np 18 | import torch 19 | from torch.optim.adamw import AdamW 20 | from torch.optim import SGD 21 | from torch.nn.utils import clip_grad_norm_ 22 | from neuronlp2.io import get_logger, conllx_data, iterate_data, POSWriter 23 | from neuronlp2.models import BiRecurrentConv, BiVarRecurrentConv, BiRecurrentConvCRF, BiVarRecurrentConvCRF 24 | from neuronlp2.optim import ExponentialScheduler 25 | from neuronlp2 import utils 26 | 27 | 28 | def get_optimizer(parameters, optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps): 29 | if optim == 'sgd': 30 | optimizer = SGD(parameters, lr=learning_rate, momentum=0.9, weight_decay=weight_decay, nesterov=True) 31 | else: 32 | optimizer = AdamW(parameters, lr=learning_rate, betas=(0.9, 0.999), eps=1e-8, amsgrad=amsgrad, weight_decay=weight_decay) 33 | init_lr = 1e-7 34 | scheduler = ExponentialScheduler(optimizer, lr_decay, warmup_steps, init_lr) 35 | return optimizer, scheduler 36 | 37 | 38 | def eval(data, network, writer, outfile, device): 39 | network.eval() 40 | corr = 0 41 | total = 0 42 | writer.start(outfile) 43 | for data in iterate_data(data, 256): 44 | words = data['WORD'].to(device) 45 | chars = data['CHAR'].to(device) 46 | masks = data['MASK'].to(device) 47 | postags = data['POS'].to(device) 48 | lengths = data['LENGTH'].numpy() 49 | preds = network.decode(words, chars, mask=masks, leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS) 50 | corr += torch.eq(preds, postags).float().mul(masks).sum().item() 51 | total += masks.sum().item() 52 | writer.write(words.cpu().numpy(), preds.cpu().numpy(), postags.cpu().numpy(), lengths) 53 | writer.close() 54 | return corr, total 55 | 56 | 57 | def main(): 58 | parser = argparse.ArgumentParser(description='NER with bi-directional RNN-CNN') 59 | parser.add_argument('--config', type=str, help='config file', required=True) 60 | parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs') 61 | parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch') 62 | parser.add_argument('--loss_type', choices=['sentence', 'token'], default='sentence', help='loss type (default: sentence)') 63 | parser.add_argument('--optim', choices=['sgd', 'adam'], help='type of optimizer', required=True) 64 | parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') 65 | parser.add_argument('--lr_decay', type=float, default=0.999995, help='Decay rate of learning rate') 66 | parser.add_argument('--amsgrad', action='store_true', help='AMS Grad') 67 | parser.add_argument('--grad_clip', type=float, default=0, help='max norm for gradient clip (default 0: no clip') 68 | parser.add_argument('--warmup_steps', type=int, default=0, metavar='N', help='number of steps to warm up (default: 0)') 69 | parser.add_argument('--weight_decay', type=float, default=0.0, help='weight for l2 norm decay') 70 | parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') 71 | parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True) 72 | parser.add_argument('--embedding_dict', help='path for embedding dict') 73 | parser.add_argument('--train', help='path for training file.', required=True) 74 | parser.add_argument('--dev', help='path for dev file.', required=True) 75 | parser.add_argument('--test', help='path for test file.', required=True) 76 | parser.add_argument('--model_path', help='path for saving model file.', required=True) 77 | 78 | args = parser.parse_args() 79 | 80 | logger = get_logger("POS") 81 | 82 | args.cuda = torch.cuda.is_available() 83 | device = torch.device('cuda', 0) if args.cuda else torch.device('cpu') 84 | train_path = args.train 85 | dev_path = args.dev 86 | test_path = args.test 87 | 88 | num_epochs = args.num_epochs 89 | batch_size = args.batch_size 90 | optim = args.optim 91 | learning_rate = args.learning_rate 92 | lr_decay = args.lr_decay 93 | amsgrad = args.amsgrad 94 | warmup_steps = args.warmup_steps 95 | weight_decay = args.weight_decay 96 | grad_clip = args.grad_clip 97 | 98 | loss_ty_token = args.loss_type == 'token' 99 | unk_replace = args.unk_replace 100 | 101 | model_path = args.model_path 102 | model_name = os.path.join(model_path, 'model.pt') 103 | embedding = args.embedding 104 | embedding_path = args.embedding_dict 105 | 106 | print(args) 107 | 108 | embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path) 109 | 110 | logger.info("Creating Alphabets") 111 | alphabet_path = os.path.join(model_path, 'alphabets') 112 | word_alphabet, char_alphabet, pos_alphabet, type_alphabet = conllx_data.create_alphabets(alphabet_path, train_path, 113 | data_paths=[dev_path, test_path], 114 | embedd_dict=embedd_dict, max_vocabulary_size=50000) 115 | 116 | logger.info("Word Alphabet Size: %d" % word_alphabet.size()) 117 | logger.info("Character Alphabet Size: %d" % char_alphabet.size()) 118 | logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) 119 | 120 | logger.info("Reading Data") 121 | 122 | data_train = conllx_data.read_bucketed_data(train_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 123 | num_data = sum(data_train[1]) 124 | num_labels = pos_alphabet.size() 125 | 126 | data_dev = conllx_data.read_data(dev_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 127 | data_test = conllx_data.read_data(test_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 128 | 129 | writer = POSWriter(word_alphabet, char_alphabet, pos_alphabet) 130 | 131 | def construct_word_embedding_table(): 132 | scale = np.sqrt(3.0 / embedd_dim) 133 | table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) 134 | table[conllx_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32) 135 | oov = 0 136 | for word, index in word_alphabet.items(): 137 | if word in embedd_dict: 138 | embedding = embedd_dict[word] 139 | elif word.lower() in embedd_dict: 140 | embedding = embedd_dict[word.lower()] 141 | else: 142 | embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32) 143 | oov += 1 144 | table[index, :] = embedding 145 | print('oov: %d' % oov) 146 | return torch.from_numpy(table) 147 | 148 | word_table = construct_word_embedding_table() 149 | 150 | logger.info("constructing network...") 151 | 152 | hyps = json.load(open(args.config, 'r')) 153 | json.dump(hyps, open(os.path.join(model_path, 'config.json'), 'w'), indent=2) 154 | dropout = hyps['dropout'] 155 | crf = hyps['crf'] 156 | bigram = hyps['bigram'] 157 | assert embedd_dim == hyps['embedd_dim'] 158 | char_dim = hyps['char_dim'] 159 | mode = hyps['rnn_mode'] 160 | hidden_size = hyps['hidden_size'] 161 | out_features = hyps['out_features'] 162 | num_layers = hyps['num_layers'] 163 | p_in = hyps['p_in'] 164 | p_out = hyps['p_out'] 165 | p_rnn = hyps['p_rnn'] 166 | activation = hyps['activation'] 167 | 168 | if dropout == 'std': 169 | if crf: 170 | network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 171 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation) 172 | else: 173 | network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 174 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 175 | elif dropout == 'variational': 176 | if crf: 177 | network = BiVarRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 178 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, activation=activation) 179 | else: 180 | network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), mode, hidden_size, out_features, num_layers, 181 | num_labels, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 182 | else: 183 | raise ValueError('Unkown dropout type: {}'.format(dropout)) 184 | 185 | network = network.to(device) 186 | 187 | optimizer, scheduler = get_optimizer(network.parameters(), optim, learning_rate, lr_decay, amsgrad, weight_decay, warmup_steps) 188 | model = "{}-CNN{}".format(mode, "-CRF" if crf else "") 189 | logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" % (model, num_layers, hidden_size, activation)) 190 | logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (weight_decay, num_data, batch_size, unk_replace)) 191 | logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" % (dropout, p_in, p_out, p_rnn)) 192 | print('# of Parameters: %d' % (sum([param.numel() for param in network.parameters()]))) 193 | 194 | best_corr = 0.0 195 | best_total = 0.0 196 | test_corr = 0.0 197 | test_total = 0.0 198 | best_epoch = 0 199 | patient = 0 200 | num_batches = num_data // batch_size + 1 201 | result_path = os.path.join(model_path, 'tmp') 202 | if not os.path.exists(result_path): 203 | os.makedirs(result_path) 204 | for epoch in range(1, num_epochs + 1): 205 | start_time = time.time() 206 | train_loss = 0. 207 | num_insts = 0 208 | num_words = 0 209 | num_back = 0 210 | network.train() 211 | lr = scheduler.get_lr()[0] 212 | print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' % (epoch, optim, lr, lr_decay, amsgrad, weight_decay)) 213 | if args.cuda: 214 | torch.cuda.empty_cache() 215 | gc.collect() 216 | for step, data in enumerate(iterate_data(data_train, batch_size, bucketed=True, unk_replace=unk_replace, shuffle=True)): 217 | optimizer.zero_grad() 218 | words = data['WORD'].to(device) 219 | chars = data['CHAR'].to(device) 220 | labels = data['POS'].to(device) 221 | masks = data['MASK'].to(device) 222 | 223 | nbatch = words.size(0) 224 | nwords = masks.sum().item() 225 | 226 | loss_total = network.loss(words, chars, labels, mask=masks).sum() 227 | if loss_ty_token: 228 | loss = loss_total.div(nwords) 229 | else: 230 | loss = loss_total.div(nbatch) 231 | loss.backward() 232 | if grad_clip > 0: 233 | clip_grad_norm_(network.parameters(), grad_clip) 234 | optimizer.step() 235 | scheduler.step() 236 | 237 | with torch.no_grad(): 238 | num_insts += nbatch 239 | num_words += nwords 240 | train_loss += loss_total.item() 241 | 242 | # update log 243 | if step % 100 == 0: 244 | torch.cuda.empty_cache() 245 | sys.stdout.write("\b" * num_back) 246 | sys.stdout.write(" " * num_back) 247 | sys.stdout.write("\b" * num_back) 248 | curr_lr = scheduler.get_lr()[0] 249 | log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (step, num_batches, 100. * step / num_batches, 250 | curr_lr, train_loss / num_insts, train_loss / num_words) 251 | sys.stdout.write(log_info) 252 | sys.stdout.flush() 253 | num_back = len(log_info) 254 | 255 | sys.stdout.write("\b" * num_back) 256 | sys.stdout.write(" " * num_back) 257 | sys.stdout.write("\b" * num_back) 258 | print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' % (num_insts, num_words, train_loss / num_insts, 259 | train_loss / num_words, time.time() - start_time)) 260 | print('-' * 100) 261 | 262 | # evaluate performance on dev data 263 | with torch.no_grad(): 264 | outfile = os.path.join(result_path, 'pred_dev%d' % epoch) 265 | dev_corr, dev_total = eval(data_dev, network, writer, outfile, device) 266 | print('Dev corr: %d, total: %d, acc: %.2f%%' % (dev_corr, dev_total, dev_corr * 100 / dev_total)) 267 | if best_corr < dev_corr: 268 | torch.save(network.state_dict(), model_name) 269 | best_corr = dev_corr 270 | best_total = dev_total 271 | best_epoch = epoch 272 | 273 | # evaluate on test data when better performance detected 274 | outfile = os.path.join(result_path, 'pred_test%d' % epoch) 275 | test_corr, test_total = eval(data_test, network, writer, outfile, device) 276 | print('test corr: %d, total: %d, acc: %.2f%%' % (test_corr, test_total, test_corr * 100 / test_total)) 277 | patient = 0 278 | else: 279 | patient += 1 280 | print('-' * 100) 281 | 282 | print("Best dev corr: %d, total: %d, acc: %.2f%% (epoch: %d (%d))" % (best_corr, best_total, best_corr * 100 / best_total, best_epoch, patient)) 283 | print("Best test corr: %d, total: %d, acc: %.2f%% (epoch: %d (%d))" % (test_corr, test_total, test_corr * 100 / test_total, best_epoch, patient)) 284 | print('=' * 100) 285 | 286 | if patient > 4: 287 | logger.info('reset optimizer momentums') 288 | scheduler.reset_state() 289 | patient = 0 290 | 291 | 292 | if __name__ == '__main__': 293 | main() 294 | -------------------------------------------------------------------------------- /experiments/scripts/run_analyze.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=1 python examples/analyze.py --parser stackptr --beam 5 --ordered --gpu \ 3 | --punctuation '.' '``' "''" ':' ',' \ 4 | --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \ 5 | --model_path "models/parsing/stack_ptr/" --model_name 'network.pt' 6 | -------------------------------------------------------------------------------- /experiments/scripts/run_deepbiaf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/biaffine.json --num_epochs 400 --batch_size 32 \ 3 | --opt adam --learning_rate 0.001 --lr_decay 0.999995 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \ 4 | --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 \ 5 | --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \ 6 | --punctuation '.' '``' "''" ':' ',' \ 7 | --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \ 8 | --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \ 9 | --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \ 10 | --model_path "models/parsing/deepbiaf/" 11 | -------------------------------------------------------------------------------- /experiments/scripts/run_ner_conll03.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python ner.py --config configs/ner/conll03.json --num_epochs 400 --batch_size 16 \ 3 | --loss_type sentence --optim sgd --learning_rate 0.01 --lr_decay 0.99999 --grad_clip 0.0 --warmup_steps 100 --weight_decay 0.0 --unk_replace 0.0 \ 4 | --embedding glove --embedding_dict "data/glove/glove.6B/glove.6B.100d.gz" --model_path "models/ner/conll03" \ 5 | --train "data/conll2003/english/eng.train.bioes.conll" --dev "data/conll2003/english/eng.dev.bioes.conll" --test "data/conll2003/english/eng.test.bioes.conll" 6 | -------------------------------------------------------------------------------- /experiments/scripts/run_neuromst.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/neuromst.json --num_epochs 400 --batch_size 32 \ 3 | --opt adam --learning_rate 0.001 --lr_decay 0.999995 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \ 4 | --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 \ 5 | --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \ 6 | --punctuation '.' '``' "''" ':' ',' \ 7 | --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \ 8 | --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \ 9 | --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \ 10 | --model_path "models/parsing/neuromst/" 11 | -------------------------------------------------------------------------------- /experiments/scripts/run_pos_wsj.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python pos_tagging.py --config configs/pos/wsj.json --num_epochs 400 --batch_size 32 \ 3 | --loss_type sentence --optim sgd --learning_rate 0.01 --lr_decay 0.99999 --grad_clip 0.0 --warmup_steps 10 --weight_decay 0.0 --unk_replace 0.0 \ 4 | --embedding glove --embedding_dict "data/glove/glove.6B/glove.6B.100d.gz" --model_path "models/pos/wsj" \ 5 | --train "data/POS-penn/wsj/split1/wsj1.train.original" --dev "data/POS-penn/wsj/split1/wsj1.dev.original" --test "data/POS-penn/wsj/split1/wsj1.test.original" 6 | -------------------------------------------------------------------------------- /experiments/scripts/run_stackptr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CUDA_VISIBLE_DEVICES=0 OMP_NUM_THREADS=4 python -u parsing.py --mode train --config configs/parsing/stackptr.json --num_epochs 600 --batch_size 32 \ 3 | --opt adam --learning_rate 0.001 --lr_decay 0.999997 --beta1 0.9 --beta2 0.9 --eps 1e-4 --grad_clip 5.0 \ 4 | --loss_type token --warmup_steps 40 --reset 20 --weight_decay 0.0 --unk_replace 0.5 --beam 10 \ 5 | --word_embedding sskip --word_path "data/sskip/sskip.eng.100.gz" --char_embedding random \ 6 | --punctuation '.' '``' "''" ':' ',' \ 7 | --train "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.train.conll" \ 8 | --dev "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.dev.conll" \ 9 | --test "data/PTB3.0/PTB3.0-Stanford_dep/ptb3.0-stanford.auto.cpos.test.conll" \ 10 | --model_path "models/parsing/stackptr/" 11 | -------------------------------------------------------------------------------- /neuronlp2/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | __version__ = "0.2.dev1" 4 | -------------------------------------------------------------------------------- /neuronlp2/io/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.io.alphabet import Alphabet 4 | from neuronlp2.io.instance import * 5 | from neuronlp2.io.logger import get_logger 6 | from neuronlp2.io.writer import CoNLL03Writer, CoNLLXWriter, POSWriter 7 | from neuronlp2.io.utils import get_batch, get_bucketed_batch, iterate_data 8 | from neuronlp2.io import conllx_data, conll03_data, conllx_stacked_data 9 | -------------------------------------------------------------------------------- /neuronlp2/io/alphabet.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | """ 4 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects. 5 | """ 6 | import json 7 | import os 8 | from neuronlp2.io.logger import get_logger 9 | 10 | class Alphabet(object): 11 | def __init__(self, name, defualt_value=False, keep_growing=True, singleton=False): 12 | self.__name = name 13 | 14 | self.instance2index = {} 15 | self.instances = [] 16 | self.default_value = defualt_value 17 | self.offset = 1 if self.default_value else 0 18 | self.keep_growing = keep_growing 19 | self.singletons = set() if singleton else None 20 | 21 | # Index 0 is occupied by default, all else following. 22 | self.default_index = 0 if self.default_value else None 23 | 24 | self.next_index = self.offset 25 | 26 | self.logger = get_logger('Alphabet') 27 | 28 | def add(self, instance): 29 | if instance not in self.instance2index: 30 | self.instances.append(instance) 31 | self.instance2index[instance] = self.next_index 32 | self.next_index += 1 33 | 34 | def add_singleton(self, id): 35 | if self.singletons is None: 36 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name) 37 | else: 38 | self.singletons.add(id) 39 | 40 | def add_singletons(self, ids): 41 | if self.singletons is None: 42 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name) 43 | else: 44 | self.singletons.update(ids) 45 | 46 | def is_singleton(self, id): 47 | if self.singletons is None: 48 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name) 49 | else: 50 | return id in self.singletons 51 | 52 | def get_index(self, instance): 53 | try: 54 | return self.instance2index[instance] 55 | except KeyError: 56 | if self.keep_growing: 57 | index = self.next_index 58 | self.add(instance) 59 | return index 60 | else: 61 | if self.default_value: 62 | return self.default_index 63 | else: 64 | raise KeyError("instance not found: %s" % instance) 65 | 66 | def get_instance(self, index): 67 | if self.default_value and index == self.default_index: 68 | # First index is occupied by the wildcard element. 69 | return '<_UNK>' 70 | else: 71 | try: 72 | return self.instances[index - self.offset] 73 | except IndexError: 74 | raise IndexError('unknown index: %d' % index) 75 | 76 | def size(self): 77 | return len(self.instances) + self.offset 78 | 79 | def singleton_size(self): 80 | return len(self.singletons) 81 | 82 | def items(self): 83 | return self.instance2index.items() 84 | 85 | def enumerate_items(self, start): 86 | if start < self.offset or start >= self.size(): 87 | raise IndexError("Enumerate is allowed between [%d : size of the alphabet)" % self.offset) 88 | return zip(range(start, len(self.instances) + self.offset), self.instances[start - self.offset:]) 89 | 90 | def close(self): 91 | self.keep_growing = False 92 | 93 | def open(self): 94 | self.keep_growing = True 95 | 96 | def get_content(self): 97 | if self.singletons is None: 98 | return {'instance2index': self.instance2index, 'instances': self.instances} 99 | else: 100 | return {'instance2index': self.instance2index, 'instances': self.instances, 101 | 'singletions': list(self.singletons)} 102 | 103 | def __from_json(self, data): 104 | self.instances = data["instances"] 105 | self.instance2index = data["instance2index"] 106 | if 'singletions' in data: 107 | self.singletons = set(data['singletions']) 108 | else: 109 | self.singletons = None 110 | 111 | def save(self, output_directory, name=None): 112 | """ 113 | Save both alhpabet records to the given directory. 114 | :param output_directory: Directory to save model and weights. 115 | :param name: The alphabet saving name, optional. 116 | :return: 117 | """ 118 | saving_name = name if name else self.__name 119 | try: 120 | if not os.path.exists(output_directory): 121 | os.makedirs(output_directory) 122 | 123 | json.dump(self.get_content(), 124 | open(os.path.join(output_directory, saving_name + ".json"), 'w'), indent=4) 125 | except Exception as e: 126 | self.logger.warn("Alphabet is not saved: %s" % repr(e)) 127 | 128 | def load(self, input_directory, name=None): 129 | """ 130 | Load model architecture and weights from the give directory. This allow we use old models even the structure 131 | changes. 132 | :param input_directory: Directory to save model and weights 133 | :return: 134 | """ 135 | loading_name = name if name else self.__name 136 | self.__from_json(json.load(open(os.path.join(input_directory, loading_name + ".json")))) 137 | self.next_index = len(self.instances) + self.offset 138 | self.keep_growing = False 139 | -------------------------------------------------------------------------------- /neuronlp2/io/common.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import re 4 | 5 | MAX_CHAR_LENGTH = 45 6 | 7 | # Regular expressions used to normalize digits. 8 | DIGIT_RE = re.compile(r"\d") 9 | 10 | PAD = "_PAD" 11 | PAD_POS = "_PAD_POS" 12 | PAD_TYPE = "_" 13 | PAD_CHAR = "_PAD_CHAR" 14 | PAD_CHUNK = "_PAD_CHUNK" 15 | PAD_NER = "_PAD_NER" 16 | 17 | ROOT = "_ROOT" 18 | ROOT_POS = "_ROOT_POS" 19 | ROOT_TYPE = "_" 20 | ROOT_CHAR = "_ROOT_CHAR" 21 | 22 | END = "_END" 23 | END_POS = "_END_POS" 24 | END_TYPE = "_" 25 | END_CHAR = "_END_CHAR" 26 | 27 | UNK_ID = 0 28 | PAD_ID_WORD = 1 29 | PAD_ID_CHAR = 1 30 | PAD_ID_TAG = 0 31 | -------------------------------------------------------------------------------- /neuronlp2/io/conll03_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import os.path 4 | import numpy as np 5 | from collections import defaultdict, OrderedDict 6 | import torch 7 | 8 | from neuronlp2.io.reader import CoNLL03Reader 9 | from neuronlp2.io.alphabet import Alphabet 10 | from neuronlp2.io.logger import get_logger 11 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID 12 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_CHUNK, PAD_POS, PAD_NER, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD 13 | 14 | # Special vocabulary symbols - we always put them at the start. 15 | _START_VOCAB = [PAD,] 16 | NUM_SYMBOLIC_TAGS = 1 17 | 18 | _buckets = [5, 10, 15, 20, 25, 30, 40, 50, 140] 19 | 20 | 21 | def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None, 22 | min_occurrence=1, normalize_digits=True): 23 | 24 | def expand_vocab(): 25 | vocab_set = set(vocab_list) 26 | for data_path in data_paths: 27 | # logger.info("Processing data: %s" % data_path) 28 | with open(data_path, 'r') as file: 29 | for line in file: 30 | line = line.strip() 31 | if len(line) == 0: 32 | continue 33 | 34 | tokens = line.split(' ') 35 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 36 | pos = tokens[2] 37 | chunk = tokens[3] 38 | ner = tokens[4] 39 | 40 | pos_alphabet.add(pos) 41 | chunk_alphabet.add(chunk) 42 | ner_alphabet.add(ner) 43 | 44 | if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict): 45 | vocab_set.add(word) 46 | vocab_list.append(word) 47 | 48 | logger = get_logger("Create Alphabets") 49 | word_alphabet = Alphabet('word', defualt_value=True, singleton=True) 50 | char_alphabet = Alphabet('character', defualt_value=True) 51 | pos_alphabet = Alphabet('pos') 52 | chunk_alphabet = Alphabet('chunk') 53 | ner_alphabet = Alphabet('ner') 54 | 55 | if not os.path.isdir(alphabet_directory): 56 | logger.info("Creating Alphabets: %s" % alphabet_directory) 57 | 58 | char_alphabet.add(PAD_CHAR) 59 | pos_alphabet.add(PAD_POS) 60 | chunk_alphabet.add(PAD_CHUNK) 61 | ner_alphabet.add(PAD_NER) 62 | 63 | vocab = defaultdict(int) 64 | with open(train_path, 'r') as file: 65 | for line in file: 66 | line = line.strip() 67 | if len(line) == 0: 68 | continue 69 | 70 | tokens = line.split(' ') 71 | for char in tokens[1]: 72 | char_alphabet.add(char) 73 | 74 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 75 | vocab[word] += 1 76 | 77 | pos = tokens[2] 78 | pos_alphabet.add(pos) 79 | 80 | chunk = tokens[3] 81 | chunk_alphabet.add(chunk) 82 | 83 | ner = tokens[4] 84 | ner_alphabet.add(ner) 85 | 86 | # collect singletons 87 | singletons = set([word for word, count in vocab.items() if count <= min_occurrence]) 88 | 89 | # if a singleton is in pretrained embedding dict, set the count to min_occur + c 90 | if embedd_dict is not None: 91 | assert isinstance(embedd_dict, OrderedDict) 92 | for word in vocab.keys(): 93 | if word in embedd_dict or word.lower() in embedd_dict: 94 | vocab[word] += min_occurrence 95 | 96 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 97 | logger.info("Total Vocabulary Size: %d" % len(vocab_list)) 98 | logger.info("Total Singleton Size: %d" % len(singletons)) 99 | vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence] 100 | logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list)) 101 | 102 | if len(vocab_list) > max_vocabulary_size: 103 | vocab_list = vocab_list[:max_vocabulary_size] 104 | 105 | if data_paths is not None and embedd_dict is not None: 106 | expand_vocab() 107 | 108 | for word in vocab_list: 109 | word_alphabet.add(word) 110 | if word in singletons: 111 | word_alphabet.add_singleton(word_alphabet.get_index(word)) 112 | 113 | word_alphabet.save(alphabet_directory) 114 | char_alphabet.save(alphabet_directory) 115 | pos_alphabet.save(alphabet_directory) 116 | chunk_alphabet.save(alphabet_directory) 117 | ner_alphabet.save(alphabet_directory) 118 | else: 119 | word_alphabet.load(alphabet_directory) 120 | char_alphabet.load(alphabet_directory) 121 | pos_alphabet.load(alphabet_directory) 122 | chunk_alphabet.load(alphabet_directory) 123 | ner_alphabet.load(alphabet_directory) 124 | 125 | word_alphabet.close() 126 | char_alphabet.close() 127 | pos_alphabet.close() 128 | chunk_alphabet.close() 129 | ner_alphabet.close() 130 | logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size())) 131 | logger.info("Character Alphabet Size: %d" % char_alphabet.size()) 132 | logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) 133 | logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size()) 134 | logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) 135 | return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet 136 | 137 | 138 | def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, 139 | pos_alphabet: Alphabet, chunk_alphabet: Alphabet, ner_alphabet: Alphabet, 140 | max_size=None, normalize_digits=True): 141 | data = [] 142 | max_length = 0 143 | max_char_length = 0 144 | print('Reading data from %s' % source_path) 145 | counter = 0 146 | reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 147 | inst = reader.getNext(normalize_digits) 148 | while inst is not None and (not max_size or counter < max_size): 149 | counter += 1 150 | if counter % 10000 == 0: 151 | print("reading data: %d" % counter) 152 | 153 | sent = inst.sentence 154 | data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids]) 155 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 156 | if max_char_length < max_len: 157 | max_char_length = max_len 158 | if max_length < inst.length(): 159 | max_length = inst.length() 160 | inst = reader.getNext(normalize_digits) 161 | reader.close() 162 | print("Total number of data: %d" % counter) 163 | 164 | data_size = len(data) 165 | char_length = min(MAX_CHAR_LENGTH, max_char_length) 166 | wid_inputs = np.empty([data_size, max_length], dtype=np.int64) 167 | cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64) 168 | pid_inputs = np.empty([data_size, max_length], dtype=np.int64) 169 | chid_inputs = np.empty([data_size, max_length], dtype=np.int64) 170 | nid_inputs = np.empty([data_size, max_length], dtype=np.int64) 171 | 172 | masks = np.zeros([data_size, max_length], dtype=np.float32) 173 | single = np.zeros([data_size, max_length], dtype=np.int64) 174 | lengths = np.empty(data_size, dtype=np.int64) 175 | 176 | for i, inst in enumerate(data): 177 | wids, cid_seqs, pids, chids, nids = inst 178 | inst_size = len(wids) 179 | lengths[i] = inst_size 180 | # word ids 181 | wid_inputs[i, :inst_size] = wids 182 | wid_inputs[i, inst_size:] = PAD_ID_WORD 183 | for c, cids in enumerate(cid_seqs): 184 | cid_inputs[i, c, :len(cids)] = cids 185 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 186 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 187 | # pos ids 188 | pid_inputs[i, :inst_size] = pids 189 | pid_inputs[i, inst_size:] = PAD_ID_TAG 190 | # chunk ids 191 | chid_inputs[i, :inst_size] = chids 192 | chid_inputs[i, inst_size:] = PAD_ID_TAG 193 | # ner ids 194 | nid_inputs[i, :inst_size] = nids 195 | nid_inputs[i, inst_size:] = PAD_ID_TAG 196 | # masks 197 | masks[i, :inst_size] = 1.0 198 | for j, wid in enumerate(wids): 199 | if word_alphabet.is_singleton(wid): 200 | single[i, j] = 1 201 | 202 | words = torch.from_numpy(wid_inputs) 203 | chars = torch.from_numpy(cid_inputs) 204 | pos = torch.from_numpy(pid_inputs) 205 | chunks = torch.from_numpy(chid_inputs) 206 | ners = torch.from_numpy(nid_inputs) 207 | masks = torch.from_numpy(masks) 208 | single = torch.from_numpy(single) 209 | lengths = torch.from_numpy(lengths) 210 | 211 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'CHUNK': chunks, 212 | 'NER': ners, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths} 213 | return data_tensor, data_size 214 | 215 | 216 | def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, 217 | pos_alphabet: Alphabet, chunk_alphabet: Alphabet, ner_alphabet: Alphabet, 218 | max_size=None, normalize_digits=True): 219 | data = [[] for _ in _buckets] 220 | max_char_length = [0 for _ in _buckets] 221 | print('Reading data from %s' % source_path) 222 | counter = 0 223 | reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) 224 | inst = reader.getNext(normalize_digits) 225 | while inst is not None and (not max_size or counter < max_size): 226 | counter += 1 227 | if counter % 10000 == 0: 228 | print("reading data: %d" % counter) 229 | 230 | inst_size = inst.length() 231 | sent = inst.sentence 232 | for bucket_id, bucket_size in enumerate(_buckets): 233 | if inst_size < bucket_size: 234 | data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids]) 235 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 236 | if max_char_length[bucket_id] < max_len: 237 | max_char_length[bucket_id] = max_len 238 | break 239 | 240 | inst = reader.getNext(normalize_digits) 241 | reader.close() 242 | print("Total number of data: %d" % counter) 243 | 244 | bucket_sizes = [len(data[b]) for b in range(len(_buckets))] 245 | data_tensors = [] 246 | for bucket_id in range(len(_buckets)): 247 | bucket_size = bucket_sizes[bucket_id] 248 | if bucket_size == 0: 249 | data_tensors.append((1, 1)) 250 | continue 251 | 252 | bucket_length = _buckets[bucket_id] 253 | char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id]) 254 | wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 255 | cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64) 256 | pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 257 | chid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 258 | nid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 259 | 260 | masks = np.zeros([bucket_size, bucket_length], dtype=np.float32) 261 | single = np.zeros([bucket_size, bucket_length], dtype=np.int64) 262 | lengths = np.empty(bucket_size, dtype=np.int64) 263 | 264 | for i, inst in enumerate(data[bucket_id]): 265 | wids, cid_seqs, pids, chids, nids = inst 266 | inst_size = len(wids) 267 | lengths[i] = inst_size 268 | # word ids 269 | wid_inputs[i, :inst_size] = wids 270 | wid_inputs[i, inst_size:] = PAD_ID_WORD 271 | for c, cids in enumerate(cid_seqs): 272 | cid_inputs[i, c, :len(cids)] = cids 273 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 274 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 275 | # pos ids 276 | pid_inputs[i, :inst_size] = pids 277 | pid_inputs[i, inst_size:] = PAD_ID_TAG 278 | # chunk ids 279 | chid_inputs[i, :inst_size] = chids 280 | chid_inputs[i, inst_size:] = PAD_ID_TAG 281 | # ner ids 282 | nid_inputs[i, :inst_size] = nids 283 | nid_inputs[i, inst_size:] = PAD_ID_TAG 284 | # masks 285 | masks[i, :inst_size] = 1.0 286 | for j, wid in enumerate(wids): 287 | if word_alphabet.is_singleton(wid): 288 | single[i, j] = 1 289 | 290 | words = torch.from_numpy(wid_inputs) 291 | chars = torch.from_numpy(cid_inputs) 292 | pos = torch.from_numpy(pid_inputs) 293 | chunks = torch.from_numpy(chid_inputs) 294 | ners = torch.from_numpy(nid_inputs) 295 | masks = torch.from_numpy(masks) 296 | single = torch.from_numpy(single) 297 | lengths = torch.from_numpy(lengths) 298 | 299 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'CHUNK': chunks, 300 | 'NER': ners, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths} 301 | data_tensors.append(data_tensor) 302 | return data_tensors, bucket_sizes 303 | -------------------------------------------------------------------------------- /neuronlp2/io/conllx_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import os.path 4 | import numpy as np 5 | from collections import defaultdict, OrderedDict 6 | import torch 7 | 8 | from neuronlp2.io.reader import CoNLLXReader 9 | from neuronlp2.io.alphabet import Alphabet 10 | from neuronlp2.io.logger import get_logger 11 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID 12 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_POS, PAD_TYPE, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD 13 | from neuronlp2.io.common import ROOT, END, ROOT_CHAR, ROOT_POS, ROOT_TYPE, END_CHAR, END_POS, END_TYPE 14 | 15 | # Special vocabulary symbols - we always put them at the start. 16 | _START_VOCAB = [PAD, ROOT, END] 17 | NUM_SYMBOLIC_TAGS = 3 18 | 19 | _buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 140] 20 | 21 | 22 | def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None, 23 | min_occurrence=1, normalize_digits=True): 24 | 25 | def expand_vocab(): 26 | vocab_set = set(vocab_list) 27 | for data_path in data_paths: 28 | # logger.info("Processing data: %s" % data_path) 29 | with open(data_path, 'r') as file: 30 | for line in file: 31 | line = line.strip() 32 | if len(line) == 0: 33 | continue 34 | 35 | tokens = line.split('\t') 36 | for char in tokens[1]: 37 | char_alphabet.add(char) 38 | 39 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 40 | pos = tokens[4] 41 | type = tokens[7] 42 | 43 | pos_alphabet.add(pos) 44 | type_alphabet.add(type) 45 | 46 | if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict): 47 | vocab_set.add(word) 48 | vocab_list.append(word) 49 | 50 | logger = get_logger("Create Alphabets") 51 | word_alphabet = Alphabet('word', defualt_value=True, singleton=True) 52 | char_alphabet = Alphabet('character', defualt_value=True) 53 | pos_alphabet = Alphabet('pos') 54 | type_alphabet = Alphabet('type') 55 | if not os.path.isdir(alphabet_directory): 56 | logger.info("Creating Alphabets: %s" % alphabet_directory) 57 | 58 | char_alphabet.add(PAD_CHAR) 59 | pos_alphabet.add(PAD_POS) 60 | type_alphabet.add(PAD_TYPE) 61 | 62 | char_alphabet.add(ROOT_CHAR) 63 | pos_alphabet.add(ROOT_POS) 64 | type_alphabet.add(ROOT_TYPE) 65 | 66 | char_alphabet.add(END_CHAR) 67 | pos_alphabet.add(END_POS) 68 | type_alphabet.add(END_TYPE) 69 | 70 | vocab = defaultdict(int) 71 | with open(train_path, 'r') as file: 72 | for line in file: 73 | line = line.strip() 74 | if len(line) == 0: 75 | continue 76 | 77 | tokens = line.split('\t') 78 | for char in tokens[1]: 79 | char_alphabet.add(char) 80 | 81 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 82 | vocab[word] += 1 83 | 84 | pos = tokens[4] 85 | pos_alphabet.add(pos) 86 | 87 | type = tokens[7] 88 | type_alphabet.add(type) 89 | 90 | # collect singletons 91 | singletons = set([word for word, count in vocab.items() if count <= min_occurrence]) 92 | 93 | # if a singleton is in pretrained embedding dict, set the count to min_occur + c 94 | if embedd_dict is not None: 95 | assert isinstance(embedd_dict, OrderedDict) 96 | for word in vocab.keys(): 97 | if word in embedd_dict or word.lower() in embedd_dict: 98 | vocab[word] += min_occurrence 99 | 100 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 101 | logger.info("Total Vocabulary Size: %d" % len(vocab_list)) 102 | logger.info("Total Singleton Size: %d" % len(singletons)) 103 | vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence] 104 | logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list)) 105 | 106 | if len(vocab_list) > max_vocabulary_size: 107 | vocab_list = vocab_list[:max_vocabulary_size] 108 | 109 | if data_paths is not None and embedd_dict is not None: 110 | expand_vocab() 111 | 112 | for word in vocab_list: 113 | word_alphabet.add(word) 114 | if word in singletons: 115 | word_alphabet.add_singleton(word_alphabet.get_index(word)) 116 | 117 | word_alphabet.save(alphabet_directory) 118 | char_alphabet.save(alphabet_directory) 119 | pos_alphabet.save(alphabet_directory) 120 | type_alphabet.save(alphabet_directory) 121 | else: 122 | word_alphabet.load(alphabet_directory) 123 | char_alphabet.load(alphabet_directory) 124 | pos_alphabet.load(alphabet_directory) 125 | type_alphabet.load(alphabet_directory) 126 | 127 | word_alphabet.close() 128 | char_alphabet.close() 129 | pos_alphabet.close() 130 | type_alphabet.close() 131 | logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size())) 132 | logger.info("Character Alphabet Size: %d" % char_alphabet.size()) 133 | logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) 134 | logger.info("Type Alphabet Size: %d" % type_alphabet.size()) 135 | return word_alphabet, char_alphabet, pos_alphabet, type_alphabet 136 | 137 | 138 | def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet, type_alphabet: Alphabet, 139 | max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False): 140 | data = [] 141 | max_length = 0 142 | max_char_length = 0 143 | print('Reading data from %s' % source_path) 144 | counter = 0 145 | reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 146 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end) 147 | while inst is not None and (not max_size or counter < max_size): 148 | counter += 1 149 | if counter % 10000 == 0: 150 | print("reading data: %d" % counter) 151 | 152 | sent = inst.sentence 153 | data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids]) 154 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 155 | if max_char_length < max_len: 156 | max_char_length = max_len 157 | if max_length < inst.length(): 158 | max_length = inst.length() 159 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end) 160 | reader.close() 161 | print("Total number of data: %d" % counter) 162 | 163 | data_size = len(data) 164 | char_length = min(MAX_CHAR_LENGTH, max_char_length) 165 | wid_inputs = np.empty([data_size, max_length], dtype=np.int64) 166 | cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64) 167 | pid_inputs = np.empty([data_size, max_length], dtype=np.int64) 168 | hid_inputs = np.empty([data_size, max_length], dtype=np.int64) 169 | tid_inputs = np.empty([data_size, max_length], dtype=np.int64) 170 | 171 | masks = np.zeros([data_size, max_length], dtype=np.float32) 172 | single = np.zeros([data_size, max_length], dtype=np.int64) 173 | lengths = np.empty(data_size, dtype=np.int64) 174 | 175 | for i, inst in enumerate(data): 176 | wids, cid_seqs, pids, hids, tids = inst 177 | inst_size = len(wids) 178 | lengths[i] = inst_size 179 | # word ids 180 | wid_inputs[i, :inst_size] = wids 181 | wid_inputs[i, inst_size:] = PAD_ID_WORD 182 | for c, cids in enumerate(cid_seqs): 183 | cid_inputs[i, c, :len(cids)] = cids 184 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 185 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 186 | # pos ids 187 | pid_inputs[i, :inst_size] = pids 188 | pid_inputs[i, inst_size:] = PAD_ID_TAG 189 | # type ids 190 | tid_inputs[i, :inst_size] = tids 191 | tid_inputs[i, inst_size:] = PAD_ID_TAG 192 | # heads 193 | hid_inputs[i, :inst_size] = hids 194 | hid_inputs[i, inst_size:] = PAD_ID_TAG 195 | # masks 196 | masks[i, :inst_size] = 1.0 197 | for j, wid in enumerate(wids): 198 | if word_alphabet.is_singleton(wid): 199 | single[i, j] = 1 200 | 201 | words = torch.from_numpy(wid_inputs) 202 | chars = torch.from_numpy(cid_inputs) 203 | pos = torch.from_numpy(pid_inputs) 204 | heads = torch.from_numpy(hid_inputs) 205 | types = torch.from_numpy(tid_inputs) 206 | masks = torch.from_numpy(masks) 207 | single = torch.from_numpy(single) 208 | lengths = torch.from_numpy(lengths) 209 | 210 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 211 | 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths} 212 | return data_tensor, data_size 213 | 214 | 215 | def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet, type_alphabet: Alphabet, 216 | max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False): 217 | data = [[] for _ in _buckets] 218 | max_char_length = [0 for _ in _buckets] 219 | print('Reading data from %s' % source_path) 220 | counter = 0 221 | reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 222 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end) 223 | while inst is not None and (not max_size or counter < max_size): 224 | counter += 1 225 | if counter % 10000 == 0: 226 | print("reading data: %d" % counter) 227 | 228 | inst_size = inst.length() 229 | sent = inst.sentence 230 | for bucket_id, bucket_size in enumerate(_buckets): 231 | if inst_size < bucket_size: 232 | data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids]) 233 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 234 | if max_char_length[bucket_id] < max_len: 235 | max_char_length[bucket_id] = max_len 236 | break 237 | 238 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end) 239 | reader.close() 240 | print("Total number of data: %d" % counter) 241 | 242 | bucket_sizes = [len(data[b]) for b in range(len(_buckets))] 243 | data_tensors = [] 244 | for bucket_id in range(len(_buckets)): 245 | bucket_size = bucket_sizes[bucket_id] 246 | if bucket_size == 0: 247 | data_tensors.append((1, 1)) 248 | continue 249 | 250 | bucket_length = _buckets[bucket_id] 251 | char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id]) 252 | wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 253 | cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64) 254 | pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 255 | hid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 256 | tid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 257 | 258 | masks = np.zeros([bucket_size, bucket_length], dtype=np.float32) 259 | single = np.zeros([bucket_size, bucket_length], dtype=np.int64) 260 | lengths = np.empty(bucket_size, dtype=np.int64) 261 | 262 | for i, inst in enumerate(data[bucket_id]): 263 | wids, cid_seqs, pids, hids, tids = inst 264 | inst_size = len(wids) 265 | lengths[i] = inst_size 266 | # word ids 267 | wid_inputs[i, :inst_size] = wids 268 | wid_inputs[i, inst_size:] = PAD_ID_WORD 269 | for c, cids in enumerate(cid_seqs): 270 | cid_inputs[i, c, :len(cids)] = cids 271 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 272 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 273 | # pos ids 274 | pid_inputs[i, :inst_size] = pids 275 | pid_inputs[i, inst_size:] = PAD_ID_TAG 276 | # type ids 277 | tid_inputs[i, :inst_size] = tids 278 | tid_inputs[i, inst_size:] = PAD_ID_TAG 279 | # heads 280 | hid_inputs[i, :inst_size] = hids 281 | hid_inputs[i, inst_size:] = PAD_ID_TAG 282 | # masks 283 | masks[i, :inst_size] = 1.0 284 | for j, wid in enumerate(wids): 285 | if word_alphabet.is_singleton(wid): 286 | single[i, j] = 1 287 | 288 | words = torch.from_numpy(wid_inputs) 289 | chars = torch.from_numpy(cid_inputs) 290 | pos = torch.from_numpy(pid_inputs) 291 | heads = torch.from_numpy(hid_inputs) 292 | types = torch.from_numpy(tid_inputs) 293 | masks = torch.from_numpy(masks) 294 | single = torch.from_numpy(single) 295 | lengths = torch.from_numpy(lengths) 296 | 297 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 298 | 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths} 299 | data_tensors.append(data_tensor) 300 | return data_tensors, bucket_sizes 301 | -------------------------------------------------------------------------------- /neuronlp2/io/conllx_stacked_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import numpy as np 4 | import torch 5 | from neuronlp2.io.reader import CoNLLXReader 6 | from neuronlp2.io.conllx_data import _buckets, NUM_SYMBOLIC_TAGS, create_alphabets 7 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH, UNK_ID 8 | from neuronlp2.io.common import PAD_CHAR, PAD, PAD_POS, PAD_TYPE, PAD_ID_CHAR, PAD_ID_TAG, PAD_ID_WORD 9 | from neuronlp2.io.common import ROOT, END, ROOT_CHAR, ROOT_POS, ROOT_TYPE, END_CHAR, END_POS, END_TYPE 10 | 11 | 12 | def _obtain_child_index_for_left2right(heads): 13 | child_ids = [[] for _ in range(len(heads))] 14 | # skip the symbolic root. 15 | for child in range(1, len(heads)): 16 | head = heads[child] 17 | child_ids[head].append(child) 18 | return child_ids 19 | 20 | 21 | def _obtain_child_index_for_inside_out(heads): 22 | child_ids = [[] for _ in range(len(heads))] 23 | for head in range(len(heads)): 24 | # first find left children inside-out 25 | for child in reversed(range(1, head)): 26 | if heads[child] == head: 27 | child_ids[head].append(child) 28 | # second find right children inside-out 29 | for child in range(head + 1, len(heads)): 30 | if heads[child] == head: 31 | child_ids[head].append(child) 32 | return child_ids 33 | 34 | 35 | def _obtain_child_index_for_depth(heads, reverse): 36 | def calc_depth(head): 37 | children = child_ids[head] 38 | max_depth = 0 39 | for child in children: 40 | depth = calc_depth(child) 41 | child_with_depth[head].append((child, depth)) 42 | max_depth = max(max_depth, depth + 1) 43 | child_with_depth[head] = sorted(child_with_depth[head], key=lambda x: x[1], reverse=reverse) 44 | return max_depth 45 | 46 | child_ids = _obtain_child_index_for_left2right(heads) 47 | child_with_depth = [[] for _ in range(len(heads))] 48 | calc_depth(0) 49 | return [[child for child, depth in child_with_depth[head]] for head in range(len(heads))] 50 | 51 | 52 | def _generate_stack_inputs(heads, types, prior_order): 53 | if prior_order == 'deep_first': 54 | child_ids = _obtain_child_index_for_depth(heads, True) 55 | elif prior_order == 'shallow_first': 56 | child_ids = _obtain_child_index_for_depth(heads, False) 57 | elif prior_order == 'left2right': 58 | child_ids = _obtain_child_index_for_left2right(heads) 59 | elif prior_order == 'inside_out': 60 | child_ids = _obtain_child_index_for_inside_out(heads) 61 | else: 62 | raise ValueError('Unknown prior order: %s' % prior_order) 63 | 64 | stacked_heads = [] 65 | children = [] 66 | siblings = [] 67 | stacked_types = [] 68 | skip_connect = [] 69 | prev = [0 for _ in range(len(heads))] 70 | sibs = [0 for _ in range(len(heads))] 71 | stack = [0] 72 | position = 1 73 | while len(stack) > 0: 74 | head = stack[-1] 75 | stacked_heads.append(head) 76 | siblings.append(sibs[head]) 77 | child_id = child_ids[head] 78 | skip_connect.append(prev[head]) 79 | prev[head] = position 80 | if len(child_id) == 0: 81 | children.append(head) 82 | sibs[head] = 0 83 | stacked_types.append(PAD_ID_TAG) 84 | stack.pop() 85 | else: 86 | child = child_id.pop(0) 87 | children.append(child) 88 | sibs[head] = child 89 | stack.append(child) 90 | stacked_types.append(types[child]) 91 | position += 1 92 | 93 | return stacked_heads, children, siblings, stacked_types, skip_connect 94 | 95 | 96 | def read_data(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, 97 | max_size=None, normalize_digits=True, prior_order='inside_out'): 98 | data = [] 99 | max_length = 0 100 | max_char_length = 0 101 | print('Reading data from %s' % source_path) 102 | counter = 0 103 | reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 104 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False) 105 | while inst is not None and (not max_size or counter < max_size): 106 | counter += 1 107 | if counter % 10000 == 0: 108 | print("reading data: %d" % counter) 109 | 110 | sent = inst.sentence 111 | stacked_heads, children, siblings, stacked_types, skip_connect = _generate_stack_inputs(inst.heads, inst.type_ids, prior_order) 112 | data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids, stacked_heads, children, siblings, stacked_types, skip_connect]) 113 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 114 | if max_char_length < max_len: 115 | max_char_length = max_len 116 | if max_length < inst.length(): 117 | max_length = inst.length() 118 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False) 119 | reader.close() 120 | print("Total number of data: %d" % counter) 121 | 122 | data_size = len(data) 123 | char_length = min(MAX_CHAR_LENGTH, max_char_length) 124 | wid_inputs = np.empty([data_size, max_length], dtype=np.int64) 125 | cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64) 126 | pid_inputs = np.empty([data_size, max_length], dtype=np.int64) 127 | hid_inputs = np.empty([data_size, max_length], dtype=np.int64) 128 | tid_inputs = np.empty([data_size, max_length], dtype=np.int64) 129 | 130 | masks_e = np.zeros([data_size, max_length], dtype=np.float32) 131 | single = np.zeros([data_size, max_length], dtype=np.int64) 132 | lengths = np.empty(data_size, dtype=np.int64) 133 | 134 | stack_hid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64) 135 | chid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64) 136 | ssid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64) 137 | stack_tid_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64) 138 | skip_connect_inputs = np.empty([data_size, 2 * max_length - 1], dtype=np.int64) 139 | 140 | masks_d = np.zeros([data_size, 2 * max_length - 1], dtype=np.float32) 141 | 142 | for i, inst in enumerate(data): 143 | wids, cid_seqs, pids, hids, tids, stack_hids, chids, ssids, stack_tids, skip_ids = inst 144 | inst_size = len(wids) 145 | lengths[i] = inst_size 146 | # word ids 147 | wid_inputs[i, :inst_size] = wids 148 | wid_inputs[i, inst_size:] = PAD_ID_WORD 149 | for c, cids in enumerate(cid_seqs): 150 | cid_inputs[i, c, :len(cids)] = cids 151 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 152 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 153 | # pos ids 154 | pid_inputs[i, :inst_size] = pids 155 | pid_inputs[i, inst_size:] = PAD_ID_TAG 156 | # type ids 157 | tid_inputs[i, :inst_size] = tids 158 | tid_inputs[i, inst_size:] = PAD_ID_TAG 159 | # heads 160 | hid_inputs[i, :inst_size] = hids 161 | hid_inputs[i, inst_size:] = PAD_ID_TAG 162 | # masks_e 163 | masks_e[i, :inst_size] = 1.0 164 | for j, wid in enumerate(wids): 165 | if word_alphabet.is_singleton(wid): 166 | single[i, j] = 1 167 | 168 | inst_size_decoder = 2 * inst_size - 1 169 | # stacked heads 170 | stack_hid_inputs[i, :inst_size_decoder] = stack_hids 171 | stack_hid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 172 | # children 173 | chid_inputs[i, :inst_size_decoder] = chids 174 | chid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 175 | # siblings 176 | ssid_inputs[i, :inst_size_decoder] = ssids 177 | ssid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 178 | # stacked types 179 | stack_tid_inputs[i, :inst_size_decoder] = stack_tids 180 | stack_tid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 181 | # skip connects 182 | skip_connect_inputs[i, :inst_size_decoder] = skip_ids 183 | skip_connect_inputs[i, inst_size_decoder:] = PAD_ID_TAG 184 | # masks_d 185 | masks_d[i, :inst_size_decoder] = 1.0 186 | 187 | words = torch.from_numpy(wid_inputs) 188 | chars = torch.from_numpy(cid_inputs) 189 | pos = torch.from_numpy(pid_inputs) 190 | heads = torch.from_numpy(hid_inputs) 191 | types = torch.from_numpy(tid_inputs) 192 | masks_e = torch.from_numpy(masks_e) 193 | single = torch.from_numpy(single) 194 | lengths = torch.from_numpy(lengths) 195 | 196 | stacked_heads = torch.from_numpy(stack_hid_inputs) 197 | children = torch.from_numpy(chid_inputs) 198 | siblings = torch.from_numpy(ssid_inputs) 199 | stacked_types = torch.from_numpy(stack_tid_inputs) 200 | skip_connect = torch.from_numpy(skip_connect_inputs) 201 | masks_d = torch.from_numpy(masks_d) 202 | 203 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 'MASK_ENC': masks_e, 204 | 'SINGLE': single, 'LENGTH': lengths, 'STACK_HEAD': stacked_heads, 'CHILD': children, 205 | 'SIBLING': siblings, 'STACK_TYPE': stacked_types, 'SKIP_CONNECT': skip_connect, 'MASK_DEC': masks_d} 206 | return data_tensor, data_size 207 | 208 | 209 | def read_bucketed_data(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet, 210 | max_size=None, normalize_digits=True, prior_order='inside_out'): 211 | data = [[] for _ in _buckets] 212 | max_char_length = [0 for _ in _buckets] 213 | print('Reading data from %s' % source_path) 214 | counter = 0 215 | reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet) 216 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False) 217 | while inst is not None and (not max_size or counter < max_size): 218 | counter += 1 219 | if counter % 10000 == 0: 220 | print("reading data: %d" % counter) 221 | 222 | inst_size = inst.length() 223 | sent = inst.sentence 224 | for bucket_id, bucket_size in enumerate(_buckets): 225 | if inst_size < bucket_size: 226 | stacked_heads, children, siblings, stacked_types, skip_connect = _generate_stack_inputs(inst.heads, inst.type_ids, prior_order) 227 | data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids, stacked_heads, children, siblings, stacked_types, skip_connect]) 228 | max_len = max([len(char_seq) for char_seq in sent.char_seqs]) 229 | if max_char_length[bucket_id] < max_len: 230 | max_char_length[bucket_id] = max_len 231 | break 232 | 233 | inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=True, symbolic_end=False) 234 | reader.close() 235 | print("Total number of data: %d" % counter) 236 | 237 | bucket_sizes = [len(data[b]) for b in range(len(_buckets))] 238 | data_tensors = [] 239 | for bucket_id in range(len(_buckets)): 240 | bucket_size = bucket_sizes[bucket_id] 241 | if bucket_size == 0: 242 | data_tensors.append((1, 1)) 243 | continue 244 | 245 | bucket_length = _buckets[bucket_id] 246 | char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id]) 247 | wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 248 | cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64) 249 | pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 250 | hid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 251 | tid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64) 252 | 253 | masks_e = np.zeros([bucket_size, bucket_length], dtype=np.float32) 254 | single = np.zeros([bucket_size, bucket_length], dtype=np.int64) 255 | lengths = np.empty(bucket_size, dtype=np.int64) 256 | 257 | stack_hid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64) 258 | chid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64) 259 | ssid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64) 260 | stack_tid_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64) 261 | skip_connect_inputs = np.empty([bucket_size, 2 * bucket_length - 1], dtype=np.int64) 262 | 263 | masks_d = np.zeros([bucket_size, 2 * bucket_length - 1], dtype=np.float32) 264 | 265 | for i, inst in enumerate(data[bucket_id]): 266 | wids, cid_seqs, pids, hids, tids, stack_hids, chids, ssids, stack_tids, skip_ids = inst 267 | inst_size = len(wids) 268 | lengths[i] = inst_size 269 | # word ids 270 | wid_inputs[i, :inst_size] = wids 271 | wid_inputs[i, inst_size:] = PAD_ID_WORD 272 | for c, cids in enumerate(cid_seqs): 273 | cid_inputs[i, c, :len(cids)] = cids 274 | cid_inputs[i, c, len(cids):] = PAD_ID_CHAR 275 | cid_inputs[i, inst_size:, :] = PAD_ID_CHAR 276 | # pos ids 277 | pid_inputs[i, :inst_size] = pids 278 | pid_inputs[i, inst_size:] = PAD_ID_TAG 279 | # type ids 280 | tid_inputs[i, :inst_size] = tids 281 | tid_inputs[i, inst_size:] = PAD_ID_TAG 282 | # heads 283 | hid_inputs[i, :inst_size] = hids 284 | hid_inputs[i, inst_size:] = PAD_ID_TAG 285 | # masks_e 286 | masks_e[i, :inst_size] = 1.0 287 | for j, wid in enumerate(wids): 288 | if word_alphabet.is_singleton(wid): 289 | single[i, j] = 1 290 | 291 | inst_size_decoder = 2 * inst_size - 1 292 | # stacked heads 293 | stack_hid_inputs[i, :inst_size_decoder] = stack_hids 294 | stack_hid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 295 | # children 296 | chid_inputs[i, :inst_size_decoder] = chids 297 | chid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 298 | # siblings 299 | ssid_inputs[i, :inst_size_decoder] = ssids 300 | ssid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 301 | # stacked types 302 | stack_tid_inputs[i, :inst_size_decoder] = stack_tids 303 | stack_tid_inputs[i, inst_size_decoder:] = PAD_ID_TAG 304 | # skip connects 305 | skip_connect_inputs[i, :inst_size_decoder] = skip_ids 306 | skip_connect_inputs[i, inst_size_decoder:] = PAD_ID_TAG 307 | # masks_d 308 | masks_d[i, :inst_size_decoder] = 1.0 309 | 310 | words = torch.from_numpy(wid_inputs) 311 | chars = torch.from_numpy(cid_inputs) 312 | pos = torch.from_numpy(pid_inputs) 313 | heads = torch.from_numpy(hid_inputs) 314 | types = torch.from_numpy(tid_inputs) 315 | masks_e = torch.from_numpy(masks_e) 316 | single = torch.from_numpy(single) 317 | lengths = torch.from_numpy(lengths) 318 | 319 | stacked_heads = torch.from_numpy(stack_hid_inputs) 320 | children = torch.from_numpy(chid_inputs) 321 | siblings = torch.from_numpy(ssid_inputs) 322 | stacked_types = torch.from_numpy(stack_tid_inputs) 323 | skip_connect = torch.from_numpy(skip_connect_inputs) 324 | masks_d = torch.from_numpy(masks_d) 325 | 326 | data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types, 'MASK_ENC': masks_e, 327 | 'SINGLE': single, 'LENGTH': lengths, 'STACK_HEAD': stacked_heads, 'CHILD': children, 328 | 'SIBLING': siblings, 'STACK_TYPE': stacked_types, 'SKIP_CONNECT': skip_connect, 'MASK_DEC': masks_d} 329 | data_tensors.append(data_tensor) 330 | 331 | return data_tensors, bucket_sizes 332 | -------------------------------------------------------------------------------- /neuronlp2/io/instance.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | __all__ = ['Sentence', 'DependencyInstance', 'NERInstance'] 4 | 5 | 6 | class Sentence(object): 7 | def __init__(self, words, word_ids, char_seqs, char_id_seqs): 8 | self.words = words 9 | self.word_ids = word_ids 10 | self.char_seqs = char_seqs 11 | self.char_id_seqs = char_id_seqs 12 | 13 | def length(self): 14 | return len(self.words) 15 | 16 | 17 | class DependencyInstance(object): 18 | def __init__(self, sentence, postags, pos_ids, heads, types, type_ids): 19 | self.sentence = sentence 20 | self.postags = postags 21 | self.pos_ids = pos_ids 22 | self.heads = heads 23 | self.types = types 24 | self.type_ids = type_ids 25 | 26 | def length(self): 27 | return self.sentence.length() 28 | 29 | 30 | class NERInstance(object): 31 | def __init__(self, sentence, postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids): 32 | self.sentence = sentence 33 | self.postags = postags 34 | self.pos_ids = pos_ids 35 | self.chunk_tags = chunk_tags 36 | self.chunk_ids = chunk_ids 37 | self.ner_tags = ner_tags 38 | self.ner_ids = ner_ids 39 | 40 | def length(self): 41 | return self.sentence.length() -------------------------------------------------------------------------------- /neuronlp2/io/logger.py: -------------------------------------------------------------------------------- 1 | _author__ = 'max' 2 | 3 | import logging 4 | import sys 5 | 6 | 7 | def get_logger(name, level=logging.INFO, handler=sys.stdout, 8 | formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'): 9 | logger = logging.getLogger(name) 10 | logger.setLevel(logging.INFO) 11 | formatter = logging.Formatter(formatter) 12 | stream_handler = logging.StreamHandler(handler) 13 | stream_handler.setLevel(level) 14 | stream_handler.setFormatter(formatter) 15 | logger.addHandler(stream_handler) 16 | 17 | return logger 18 | -------------------------------------------------------------------------------- /neuronlp2/io/reader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.io.instance import DependencyInstance, NERInstance 4 | from neuronlp2.io.instance import Sentence 5 | from neuronlp2.io.common import ROOT, ROOT_POS, ROOT_CHAR, ROOT_TYPE, END, END_POS, END_CHAR, END_TYPE 6 | from neuronlp2.io.common import DIGIT_RE, MAX_CHAR_LENGTH 7 | 8 | 9 | class CoNLLXReader(object): 10 | def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet): 11 | self.__source_file = open(file_path, 'r') 12 | self.__word_alphabet = word_alphabet 13 | self.__char_alphabet = char_alphabet 14 | self.__pos_alphabet = pos_alphabet 15 | self.__type_alphabet = type_alphabet 16 | 17 | def close(self): 18 | self.__source_file.close() 19 | 20 | def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False): 21 | line = self.__source_file.readline() 22 | # skip multiple blank lines. 23 | while len(line) > 0 and len(line.strip()) == 0: 24 | line = self.__source_file.readline() 25 | if len(line) == 0: 26 | return None 27 | 28 | lines = [] 29 | while len(line.strip()) > 0: 30 | line = line.strip() 31 | lines.append(line.split('\t')) 32 | line = self.__source_file.readline() 33 | 34 | length = len(lines) 35 | if length == 0: 36 | return None 37 | 38 | words = [] 39 | word_ids = [] 40 | char_seqs = [] 41 | char_id_seqs = [] 42 | postags = [] 43 | pos_ids = [] 44 | types = [] 45 | type_ids = [] 46 | heads = [] 47 | 48 | if symbolic_root: 49 | words.append(ROOT) 50 | word_ids.append(self.__word_alphabet.get_index(ROOT)) 51 | char_seqs.append([ROOT_CHAR, ]) 52 | char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ]) 53 | postags.append(ROOT_POS) 54 | pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS)) 55 | types.append(ROOT_TYPE) 56 | type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE)) 57 | heads.append(0) 58 | 59 | for tokens in lines: 60 | chars = [] 61 | char_ids = [] 62 | for char in tokens[1]: 63 | chars.append(char) 64 | char_ids.append(self.__char_alphabet.get_index(char)) 65 | if len(chars) > MAX_CHAR_LENGTH: 66 | chars = chars[:MAX_CHAR_LENGTH] 67 | char_ids = char_ids[:MAX_CHAR_LENGTH] 68 | char_seqs.append(chars) 69 | char_id_seqs.append(char_ids) 70 | 71 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 72 | pos = tokens[4] 73 | head = int(tokens[6]) 74 | type = tokens[7] 75 | 76 | words.append(word) 77 | word_ids.append(self.__word_alphabet.get_index(word)) 78 | 79 | postags.append(pos) 80 | pos_ids.append(self.__pos_alphabet.get_index(pos)) 81 | 82 | types.append(type) 83 | type_ids.append(self.__type_alphabet.get_index(type)) 84 | 85 | heads.append(head) 86 | 87 | if symbolic_end: 88 | words.append(END) 89 | word_ids.append(self.__word_alphabet.get_index(END)) 90 | char_seqs.append([END_CHAR, ]) 91 | char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ]) 92 | postags.append(END_POS) 93 | pos_ids.append(self.__pos_alphabet.get_index(END_POS)) 94 | types.append(END_TYPE) 95 | type_ids.append(self.__type_alphabet.get_index(END_TYPE)) 96 | heads.append(0) 97 | 98 | return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, heads, types, type_ids) 99 | 100 | 101 | class CoNLL03Reader(object): 102 | def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet): 103 | self.__source_file = open(file_path, 'r') 104 | self.__word_alphabet = word_alphabet 105 | self.__char_alphabet = char_alphabet 106 | self.__pos_alphabet = pos_alphabet 107 | self.__chunk_alphabet = chunk_alphabet 108 | self.__ner_alphabet = ner_alphabet 109 | 110 | def close(self): 111 | self.__source_file.close() 112 | 113 | def getNext(self, normalize_digits=True): 114 | line = self.__source_file.readline() 115 | # skip multiple blank lines. 116 | while len(line) > 0 and len(line.strip()) == 0: 117 | line = self.__source_file.readline() 118 | if len(line) == 0: 119 | return None 120 | 121 | lines = [] 122 | while len(line.strip()) > 0: 123 | line = line.strip() 124 | lines.append(line.split(' ')) 125 | line = self.__source_file.readline() 126 | 127 | length = len(lines) 128 | if length == 0: 129 | return None 130 | 131 | words = [] 132 | word_ids = [] 133 | char_seqs = [] 134 | char_id_seqs = [] 135 | postags = [] 136 | pos_ids = [] 137 | chunk_tags = [] 138 | chunk_ids = [] 139 | ner_tags = [] 140 | ner_ids = [] 141 | 142 | for tokens in lines: 143 | chars = [] 144 | char_ids = [] 145 | for char in tokens[1]: 146 | chars.append(char) 147 | char_ids.append(self.__char_alphabet.get_index(char)) 148 | if len(chars) > MAX_CHAR_LENGTH: 149 | chars = chars[:MAX_CHAR_LENGTH] 150 | char_ids = char_ids[:MAX_CHAR_LENGTH] 151 | char_seqs.append(chars) 152 | char_id_seqs.append(char_ids) 153 | 154 | word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1] 155 | pos = tokens[2] 156 | chunk = tokens[3] 157 | ner = tokens[4] 158 | 159 | words.append(word) 160 | word_ids.append(self.__word_alphabet.get_index(word)) 161 | 162 | postags.append(pos) 163 | pos_ids.append(self.__pos_alphabet.get_index(pos)) 164 | 165 | chunk_tags.append(chunk) 166 | chunk_ids.append(self.__chunk_alphabet.get_index(chunk)) 167 | 168 | ner_tags.append(ner) 169 | ner_ids.append(self.__ner_alphabet.get_index(ner)) 170 | 171 | return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), 172 | postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids) 173 | -------------------------------------------------------------------------------- /neuronlp2/io/utils.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def get_batch(data, batch_size, unk_replace=0.): 8 | data, data_size = data 9 | batch_size = min(data_size, batch_size) 10 | index = torch.randperm(data_size).long()[:batch_size] 11 | 12 | lengths = data['LENGTH'][index] 13 | max_length = lengths.max().item() 14 | words = data['WORD'] 15 | single = data['SINGLE'] 16 | words = words[index, :max_length] 17 | single = single[index, :max_length] 18 | if unk_replace: 19 | ones = single.new_ones(batch_size, max_length) 20 | noise = single.new_empty(batch_size, max_length).bernoulli_(unk_replace).long() 21 | words = words * (ones - single * noise) 22 | 23 | stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC'] 24 | exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys) 25 | stack_keys = set(stack_keys) 26 | batch = {'WORD': words, 'LENGTH': lengths} 27 | batch.update({key: field[index, :max_length] for key, field in data.items() if key not in exclude_keys}) 28 | batch.update({key: field[index, :2 * max_length - 1] for key, field in data.items() if key in stack_keys}) 29 | return batch 30 | 31 | 32 | def get_bucketed_batch(data, batch_size, unk_replace=0.): 33 | data_buckets, bucket_sizes = data 34 | total_size = float(sum(bucket_sizes)) 35 | # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use 36 | # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to 37 | # the size if i-th training bucket, as used later. 38 | buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes))] 39 | 40 | # Choose a bucket according to data distribution. We pick a random number 41 | # in [0, 1] and use the corresponding interval in train_buckets_scale. 42 | random_number = np.random.random_sample() 43 | bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number]) 44 | 45 | data = data_buckets[bucket_id] 46 | bucket_size = bucket_sizes[bucket_id] 47 | batch_size = min(bucket_size, batch_size) 48 | index = torch.randperm(bucket_size).long()[:batch_size] 49 | 50 | lengths = data['LENGTH'][index] 51 | max_length = lengths.max().item() 52 | words = data['WORD'] 53 | single = data['SINGLE'] 54 | words = words[index, :max_length] 55 | single = single[index, :max_length] 56 | if unk_replace: 57 | ones = single.new_ones(batch_size, max_length) 58 | noise = single.new_empty(batch_size, max_length).bernoulli_(unk_replace).long() 59 | words = words * (ones - single * noise) 60 | 61 | stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC'] 62 | exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys) 63 | stack_keys = set(stack_keys) 64 | batch = {'WORD': words, 'LENGTH': lengths} 65 | batch.update({key: field[index, :max_length] for key, field in data.items() if key not in exclude_keys}) 66 | batch.update({key: field[index, :2 * max_length - 1] for key, field in data.items() if key in stack_keys}) 67 | return batch 68 | 69 | 70 | def iterate_batch(data, batch_size, unk_replace=0., shuffle=False): 71 | data, data_size = data 72 | 73 | words = data['WORD'] 74 | single = data['SINGLE'] 75 | max_length = words.size(1) 76 | if unk_replace: 77 | ones = single.new_ones(data_size, max_length) 78 | noise = single.new_empty(data_size, max_length).bernoulli_(unk_replace).long() 79 | words = words * (ones - single * noise) 80 | 81 | indices = None 82 | if shuffle: 83 | indices = torch.randperm(data_size).long() 84 | indices = indices.to(words.device) 85 | 86 | stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC'] 87 | exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys) 88 | stack_keys = set(stack_keys) 89 | for start_idx in range(0, data_size, batch_size): 90 | if shuffle: 91 | excerpt = indices[start_idx:start_idx + batch_size] 92 | else: 93 | excerpt = slice(start_idx, start_idx + batch_size) 94 | 95 | lengths = data['LENGTH'][excerpt] 96 | batch_length = lengths.max().item() 97 | batch = {'WORD': words[excerpt, :batch_length], 'LENGTH': lengths} 98 | batch.update({key: field[excerpt, :batch_length] for key, field in data.items() if key not in exclude_keys}) 99 | batch.update({key: field[excerpt, :2 * batch_length - 1] for key, field in data.items() if key in stack_keys}) 100 | yield batch 101 | 102 | 103 | def iterate_bucketed_batch(data, batch_size, unk_replace=0., shuffle=False): 104 | data_tensor, bucket_sizes = data 105 | 106 | bucket_indices = np.arange(len(bucket_sizes)) 107 | if shuffle: 108 | np.random.shuffle((bucket_indices)) 109 | 110 | stack_keys = ['STACK_HEAD', 'CHILD', 'SIBLING', 'STACK_TYPE', 'SKIP_CONNECT', 'MASK_DEC'] 111 | exclude_keys = set(['SINGLE', 'WORD', 'LENGTH'] + stack_keys) 112 | stack_keys = set(stack_keys) 113 | for bucket_id in bucket_indices: 114 | data = data_tensor[bucket_id] 115 | bucket_size = bucket_sizes[bucket_id] 116 | if bucket_size == 0: 117 | continue 118 | 119 | words = data['WORD'] 120 | single = data['SINGLE'] 121 | bucket_length = words.size(1) 122 | if unk_replace: 123 | ones = single.new_ones(bucket_size, bucket_length) 124 | noise = single.new_empty(bucket_size, bucket_length).bernoulli_(unk_replace).long() 125 | words = words * (ones - single * noise) 126 | 127 | indices = None 128 | if shuffle: 129 | indices = torch.randperm(bucket_size).long() 130 | indices = indices.to(words.device) 131 | for start_idx in range(0, bucket_size, batch_size): 132 | if shuffle: 133 | excerpt = indices[start_idx:start_idx + batch_size] 134 | else: 135 | excerpt = slice(start_idx, start_idx + batch_size) 136 | 137 | lengths = data['LENGTH'][excerpt] 138 | batch_length = lengths.max().item() 139 | batch = {'WORD': words[excerpt, :batch_length], 'LENGTH': lengths} 140 | batch.update({key: field[excerpt, :batch_length] for key, field in data.items() if key not in exclude_keys}) 141 | batch.update({key: field[excerpt, :2 * batch_length - 1] for key, field in data.items() if key in stack_keys}) 142 | yield batch 143 | 144 | 145 | def iterate_data(data, batch_size, bucketed=False, unk_replace=0., shuffle=False): 146 | if bucketed: 147 | return iterate_bucketed_batch(data, batch_size, unk_replace==unk_replace, shuffle=shuffle) 148 | else: 149 | return iterate_batch(data, batch_size, unk_replace==unk_replace, shuffle=shuffle) 150 | -------------------------------------------------------------------------------- /neuronlp2/io/writer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | 4 | class CoNLL03Writer(object): 5 | def __init__(self, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet): 6 | self.__source_file = None 7 | self.__word_alphabet = word_alphabet 8 | self.__char_alphabet = char_alphabet 9 | self.__pos_alphabet = pos_alphabet 10 | self.__chunk_alphabet = chunk_alphabet 11 | self.__ner_alphabet = ner_alphabet 12 | 13 | def start(self, file_path): 14 | self.__source_file = open(file_path, 'w') 15 | 16 | def close(self): 17 | self.__source_file.close() 18 | 19 | def write(self, word, pos, chunk, predictions, targets, lengths): 20 | batch_size, _ = word.shape 21 | for i in range(batch_size): 22 | for j in range(lengths[i]): 23 | w = self.__word_alphabet.get_instance(word[i, j]) 24 | p = self.__pos_alphabet.get_instance(pos[i, j]) 25 | ch = self.__chunk_alphabet.get_instance(chunk[i, j]) 26 | tgt = self.__ner_alphabet.get_instance(targets[i, j]) 27 | pred = self.__ner_alphabet.get_instance(predictions[i, j]) 28 | self.__source_file.write('%d %s %s %s %s %s\n' % (j + 1, w, p, ch, tgt, pred)) 29 | self.__source_file.write('\n') 30 | 31 | 32 | class POSWriter(object): 33 | def __init__(self, word_alphabet, char_alphabet, pos_alphabet): 34 | self.__source_file = None 35 | self.__word_alphabet = word_alphabet 36 | self.__char_alphabet = char_alphabet 37 | self.__pos_alphabet = pos_alphabet 38 | 39 | def start(self, file_path): 40 | self.__source_file = open(file_path, 'w') 41 | 42 | def close(self): 43 | self.__source_file.close() 44 | 45 | def write(self, word, predictions, targets, lengths, symbolic_root=False, symbolic_end=False): 46 | batch_size, _ = word.shape 47 | start = 1 if symbolic_root else 0 48 | end = 1 if symbolic_end else 0 49 | for i in range(batch_size): 50 | for j in range(start, lengths[i] - end): 51 | w = self.__word_alphabet.get_instance(word[i, j]) 52 | pred = self.__pos_alphabet.get_instance(predictions[i, j]) 53 | tgt = self.__pos_alphabet.get_instance(targets[i, j]) 54 | self.__source_file.write('%d\t%s\t_\t%s\t%s\n' % (j, w, tgt, pred)) 55 | self.__source_file.write('\n') 56 | 57 | 58 | class CoNLLXWriter(object): 59 | def __init__(self, word_alphabet, char_alphabet, pos_alphabet, type_alphabet): 60 | self.__source_file = None 61 | self.__word_alphabet = word_alphabet 62 | self.__char_alphabet = char_alphabet 63 | self.__pos_alphabet = pos_alphabet 64 | self.__type_alphabet = type_alphabet 65 | 66 | def start(self, file_path): 67 | self.__source_file = open(file_path, 'w') 68 | 69 | def close(self): 70 | self.__source_file.close() 71 | 72 | def write(self, word, pos, head, type, lengths, symbolic_root=False, symbolic_end=False): 73 | batch_size, _ = word.shape 74 | start = 1 if symbolic_root else 0 75 | end = 1 if symbolic_end else 0 76 | for i in range(batch_size): 77 | for j in range(start, lengths[i] - end): 78 | w = self.__word_alphabet.get_instance(word[i, j]) 79 | p = self.__pos_alphabet.get_instance(pos[i, j]) 80 | t = self.__type_alphabet.get_instance(type[i, j]) 81 | h = head[i, j] 82 | self.__source_file.write('%d\t%s\t_\t_\t%s\t_\t%d\t%s\n' % (j, w, p, h, t)) 83 | self.__source_file.write('\n') 84 | -------------------------------------------------------------------------------- /neuronlp2/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.models.sequence_labeling import * 4 | from .parsing import DeepBiAffine, NeuroMST, StackPtrNet 5 | -------------------------------------------------------------------------------- /neuronlp2/models/sequence_labeling.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from overrides import overrides 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence 7 | from neuronlp2.nn import ChainCRF, VarGRU, VarRNN, VarLSTM, VarFastLSTM, CharCNN 8 | 9 | 10 | class BiRecurrentConv(nn.Module): 11 | def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 12 | num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.5, p_rnn=(0.5, 0.5), activation='elu'): 13 | super(BiRecurrentConv, self).__init__() 14 | 15 | self.word_embed = nn.Embedding(num_words, word_dim, _weight=embedd_word, padding_idx=1) 16 | self.char_embed = nn.Embedding(num_chars, char_dim, _weight=embedd_char, padding_idx=1) 17 | self.char_cnn = CharCNN(2, char_dim, char_dim, hidden_channels=4 * char_dim, activation=activation) 18 | # dropout word 19 | self.dropout_in = nn.Dropout2d(p=p_in) 20 | # standard dropout 21 | self.dropout_rnn_in = nn.Dropout(p=p_rnn[0]) 22 | self.dropout_out = nn.Dropout(p_out) 23 | 24 | if rnn_mode == 'RNN': 25 | RNN = nn.RNN 26 | elif rnn_mode == 'LSTM' or rnn_mode == 'FastLSTM': 27 | RNN = nn.LSTM 28 | elif rnn_mode == 'GRU': 29 | RNN = nn.GRU 30 | else: 31 | raise ValueError('Unknown RNN mode: %s' % rnn_mode) 32 | 33 | self.rnn = RNN(word_dim + char_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=p_rnn[1]) 34 | 35 | self.fc = nn.Linear(hidden_size * 2, out_features) 36 | assert activation in ['elu', 'tanh'] 37 | if activation == 'elu': 38 | self.activation = nn.ELU() 39 | else: 40 | self.activation = nn.Tanh() 41 | self.readout = nn.Linear(out_features, num_labels) 42 | self.criterion = nn.CrossEntropyLoss(reduction='none') 43 | 44 | self.reset_parameters(embedd_word, embedd_char) 45 | 46 | def reset_parameters(self, embedd_word, embedd_char): 47 | if embedd_word is None: 48 | nn.init.uniform_(self.word_embed.weight, -0.1, 0.1) 49 | if embedd_char is None: 50 | nn.init.uniform_(self.char_embed.weight, -0.1, 0.1) 51 | with torch.no_grad(): 52 | self.word_embed.weight[self.word_embed.padding_idx].fill_(0) 53 | self.char_embed.weight[self.char_embed.padding_idx].fill_(0) 54 | 55 | for param in self.rnn.parameters(): 56 | if param.dim() == 1: 57 | nn.init.constant_(param, 0) 58 | else: 59 | nn.init.xavier_uniform_(param) 60 | 61 | nn.init.xavier_uniform_(self.fc.weight) 62 | nn.init.constant_(self.fc.bias, 0.) 63 | 64 | nn.init.uniform_(self.readout.weight, -0.1, 0.1) 65 | nn.init.constant_(self.readout.bias, 0.) 66 | 67 | def _get_rnn_output(self, input_word, input_char, mask=None): 68 | # [batch, length, word_dim] 69 | word = self.word_embed(input_word) 70 | 71 | # [batch, length, char_length, char_dim] 72 | char = self.char_cnn(self.char_embed(input_char)) 73 | 74 | # apply dropout word on input 75 | word = self.dropout_in(word) 76 | char = self.dropout_in(char) 77 | 78 | # concatenate word and char [batch, length, word_dim+char_filter] 79 | enc = torch.cat([word, char], dim=2) 80 | # apply dropout rnn input 81 | enc = self.dropout_rnn_in(enc) 82 | 83 | # output from rnn [batch, length, hidden_size * 2] 84 | if mask is not None: 85 | # prepare packed_sequence 86 | length = mask.sum(dim=1).long() 87 | packed_enc = pack_padded_sequence(enc, length, batch_first=True, enforce_sorted=False) 88 | packed_out, _ = self.rnn(packed_enc) 89 | output, _ = pad_packed_sequence(packed_out, batch_first=True) 90 | else: 91 | output, _ = self.rnn(enc) 92 | 93 | output = self.dropout_out(output) 94 | # [batch, length, out_features] 95 | output = self.dropout_out(self.activation(self.fc(output))) 96 | return output 97 | 98 | def forward(self, input_word, input_char, mask=None): 99 | # output from rnn [batch, length, out_features] 100 | output = self._get_rnn_output(input_word, input_char, mask=mask) 101 | return output 102 | 103 | def loss(self, input_word, input_char, target, mask=None): 104 | # [batch, length, out_features] 105 | output = self(input_word, input_char, mask=mask) 106 | # [batch, length, num_labels] -> [batch, num_labels, length] 107 | logits = self.readout(output).transpose(1, 2) 108 | 109 | # [batch, length] 110 | loss = self.criterion(logits, target) 111 | if mask is not None: 112 | loss = loss * mask 113 | # [batch] 114 | loss = loss.sum(dim=1) 115 | return loss 116 | 117 | def decode(self, input_word, input_char, mask=None, leading_symbolic=0): 118 | output = self(input_word, input_char, mask=mask) 119 | # [batch, length, num_labels] -> [batch, num_labels, length] 120 | logits = self.readout(output).transpose(1, 2) 121 | # [batch, length] 122 | _, preds = torch.max(logits[:, leading_symbolic:], dim=1) 123 | preds += leading_symbolic 124 | if mask is not None: 125 | preds = preds * mask.long() 126 | return preds 127 | 128 | 129 | class BiVarRecurrentConv(BiRecurrentConv): 130 | def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 131 | num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.33, p_rnn=(0.33, 0.33), activation='elu'): 132 | super(BiVarRecurrentConv, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 133 | num_labels, embedd_word=embedd_word, embedd_char=embedd_char, 134 | p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 135 | 136 | self.dropout_rnn_in = None 137 | self.dropout_out = nn.Dropout2d(p_out) 138 | 139 | if rnn_mode == 'RNN': 140 | RNN = VarRNN 141 | elif rnn_mode == 'LSTM': 142 | RNN = VarLSTM 143 | elif rnn_mode == 'FastLSTM': 144 | RNN = VarFastLSTM 145 | elif rnn_mode == 'GRU': 146 | RNN = VarGRU 147 | else: 148 | raise ValueError('Unknown RNN mode: %s' % rnn_mode) 149 | 150 | self.rnn = RNN(word_dim + char_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=p_rnn) 151 | 152 | @overrides 153 | def _get_rnn_output(self, input_word, input_char, mask=None): 154 | # [batch, length, word_dim] 155 | word = self.word_embed(input_word) 156 | 157 | # [batch, length, char_length, char_dim] 158 | char = self.char_cnn(self.char_embed(input_char)) 159 | 160 | # apply dropout word on input 161 | word = self.dropout_in(word) 162 | char = self.dropout_in(char) 163 | 164 | # concatenate word and char [batch, length, word_dim+char_filter] 165 | enc = torch.cat([word, char], dim=2) 166 | # output from rnn [batch, length, 2 * hidden_size] 167 | output, _ = self.rnn(enc, mask) 168 | 169 | # apply dropout for the output of rnn 170 | # [batch, length, 2 * hidden_size] --> [batch, 2 * hidden_size, length] --> [batch, length, 2 * hidden_size] 171 | output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2) 172 | # [batch, length, out_features] 173 | output = self.activation(self.fc(output)) 174 | # [batch, length, out_features] --> [batch, out_features, length] --> [batch, length, out_features] 175 | output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2) 176 | return output 177 | 178 | 179 | class BiRecurrentConvCRF(BiRecurrentConv): 180 | def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 181 | num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.5, p_rnn=(0.5, 0.5), bigram=False, activation='elu'): 182 | super(BiRecurrentConvCRF, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 183 | num_labels, embedd_word=embedd_word, embedd_char=embedd_char, 184 | p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 185 | 186 | self.crf = ChainCRF(out_features, num_labels, bigram=bigram) 187 | self.readout = None 188 | self.criterion = None 189 | 190 | def forward(self, input_word, input_char, mask=None): 191 | # output from rnn [batch, length, hidden_size] 192 | output = self._get_rnn_output(input_word, input_char, mask=mask) 193 | # [batch, length, num_label, num_label] 194 | return self.crf(output, mask=mask) 195 | 196 | @overrides 197 | def loss(self, input_word, input_char, target, mask=None): 198 | # output from rnn [batch, length, hidden_size] 199 | output = self._get_rnn_output(input_word, input_char, mask=mask) 200 | # [batch] 201 | return self.crf.loss(output, target, mask=mask) 202 | 203 | @overrides 204 | def decode(self, input_word, input_char, mask=None, leading_symbolic=0): 205 | # output from rnn [batch, length, hidden_size] 206 | output = self._get_rnn_output(input_word, input_char, mask=mask) 207 | # [batch, length] 208 | preds = self.crf.decode(output, mask=mask, leading_symbolic=leading_symbolic) 209 | if mask is not None: 210 | preds = preds * mask.long() 211 | return preds 212 | 213 | 214 | class BiVarRecurrentConvCRF(BiVarRecurrentConv): 215 | def __init__(self, word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 216 | num_labels, embedd_word=None, embedd_char=None, p_in=0.33, p_out=0.33, p_rnn=(0.33, 0.33), bigram=False, activation='elu'): 217 | super(BiVarRecurrentConvCRF, self).__init__(word_dim, num_words, char_dim, num_chars, rnn_mode, hidden_size, out_features, num_layers, 218 | num_labels, embedd_word=embedd_word, embedd_char=embedd_char, 219 | p_in=p_in, p_out=p_out, p_rnn=p_rnn, activation=activation) 220 | 221 | self.crf = ChainCRF(out_features, num_labels, bigram=bigram) 222 | self.readout = None 223 | self.criterion = None 224 | 225 | def forward(self, input_word, input_char, mask=None): 226 | # output from rnn [batch, length, hidden_size] 227 | output = self._get_rnn_output(input_word, input_char, mask=mask) 228 | # [batch, length, num_label, num_label] 229 | return self.crf(output, mask=mask) 230 | 231 | @overrides 232 | def loss(self, input_word, input_char, target, mask=None): 233 | # output from rnn [batch, length, hidden_size] 234 | output = self._get_rnn_output(input_word, input_char, mask=mask) 235 | # [batch] 236 | return self.crf.loss(output, target, mask=mask) 237 | 238 | @overrides 239 | def decode(self, input_word, input_char, mask=None, leading_symbolic=0): 240 | # output from rnn [batch, length, hidden_size] 241 | output = self._get_rnn_output(input_word, input_char, mask=mask) 242 | preds = self.crf.decode(output, mask=mask, leading_symbolic=leading_symbolic) 243 | if mask is not None: 244 | preds = preds * mask.long() 245 | return preds 246 | -------------------------------------------------------------------------------- /neuronlp2/nn/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.nn import init 4 | from neuronlp2.nn.crf import ChainCRF, TreeCRF 5 | from neuronlp2.nn.modules import BiLinear, BiAffine, CharCNN 6 | from neuronlp2.nn.variational_rnn import * 7 | from neuronlp2.nn.skip_rnn import * 8 | -------------------------------------------------------------------------------- /neuronlp2/nn/_functions/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.nn._functions import variational_rnn 4 | from neuronlp2.nn._functions import skipconnect_rnn 5 | -------------------------------------------------------------------------------- /neuronlp2/nn/_functions/rnnFusedBackend.py: -------------------------------------------------------------------------------- 1 | from torch.autograd.function import Function, once_differentiable 2 | from torch._thnn import type2backend 3 | 4 | 5 | class GRUFused(Function): 6 | @staticmethod 7 | def forward(ctx, input_gate, hidden_gate, hx, ibias=None, hbias=None): 8 | ctx.backend = type2backend[input_gate.type()] 9 | 10 | hy = input_gate.new() 11 | workspace = input_gate.new(hx.numel() * 5) 12 | 13 | ctx.has_bias = False 14 | if ibias is not None: 15 | ctx.has_bias = True 16 | if ibias.dim() == 1: 17 | ibias = ibias.unsqueeze(0) 18 | if hbias.dim() == 1: 19 | hbias = hbias.unsqueeze(0) 20 | 21 | ctx.backend.GRUFused_updateOutput( 22 | ctx.backend.library_state, 23 | input_gate, hidden_gate, ibias, hbias, hx, hy, workspace) 24 | 25 | ctx.workspace = workspace 26 | ctx.igate_size = input_gate.size() 27 | ctx.hgate_size = hidden_gate.size() 28 | 29 | return hy 30 | 31 | @staticmethod 32 | @once_differentiable 33 | def backward(ctx, gradOutput): 34 | ctx.backend = type2backend[gradOutput.type()] 35 | 36 | gradInputHx = gradOutput.new() 37 | gradInInput = gradOutput.new(*ctx.igate_size) 38 | gradInHidden = gradOutput.new(*ctx.hgate_size) 39 | 40 | ctx.backend.GRUFused_updateGradInput( 41 | ctx.backend.library_state, 42 | gradInInput, gradInHidden, gradOutput, gradInputHx, ctx.workspace) 43 | 44 | gb1 = gb2 = None 45 | if ctx.has_bias: 46 | gb1 = gradInInput.sum(0, keepdim=False) 47 | gb2 = gradInHidden.sum(0, keepdim=False) 48 | return gradInInput, gradInHidden, gradInputHx, gb1, gb2 49 | 50 | 51 | class LSTMFused(Function): 52 | @staticmethod 53 | def forward(ctx, input_gate, hidden_gate, cx, ibias=None, hbias=None): 54 | ctx.backend = type2backend[input_gate.type()] 55 | hy = input_gate.new() 56 | cy = input_gate.new() 57 | 58 | ctx.has_bias = False 59 | if ibias is not None: 60 | ctx.has_bias = True 61 | if ibias.dim() == 1: 62 | ibias = ibias.unsqueeze(0) 63 | if hbias.dim() == 1: 64 | hbias = hbias.unsqueeze(0) 65 | 66 | # input_gate gets overwritten with some intermediate values to use in backwards 67 | ctx.backend.LSTMFused_updateOutput( 68 | ctx.backend.library_state, 69 | input_gate, hidden_gate, 70 | ibias, hbias, 71 | cx, hy, cy) 72 | 73 | ctx.hgate_size = hidden_gate.size() 74 | ctx.save_for_backward(input_gate, cx, cy) 75 | 76 | return hy, cy 77 | 78 | @staticmethod 79 | @once_differentiable 80 | def backward(ctx, *gradOutput): 81 | ctx.backend = type2backend[gradOutput[0].type()] 82 | gradInputCx = gradOutput[0].new() 83 | gradInGates = gradOutput[0].new(*ctx.hgate_size) 84 | 85 | saved_tens, cx, cy = ctx.saved_tensors 86 | ctx.backend.LSTMFused_updateGradInput( 87 | ctx.backend.library_state, 88 | saved_tens, gradInGates, cx, cy, 89 | gradOutput[0], gradOutput[1], gradInputCx) 90 | 91 | gb1 = gb2 = None 92 | if ctx.has_bias: 93 | gb1 = gradInGates.sum(0, keepdim=False) 94 | gb2 = gradInGates.sum(0, keepdim=False) 95 | 96 | return gradInGates, gradInGates, gradInputCx, gb1, gb2 97 | -------------------------------------------------------------------------------- /neuronlp2/nn/_functions/skipconnect_rnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def SkipConnectRNNReLUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None, noise_skip=None): 8 | if noise_in is not None: 9 | input = input * noise_in 10 | 11 | hidden = torch.cat([hidden, hidden_skip], dim=1) 12 | if noise_hidden is not None: 13 | hidden = hidden * noise_hidden 14 | 15 | hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) 16 | return hy 17 | 18 | 19 | def SkipConnectRNNTanhCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 20 | if noise_in is not None: 21 | input = input * noise_in 22 | 23 | hidden = torch.cat([hidden, hidden_skip], dim=1) 24 | if noise_hidden is not None: 25 | hidden = hidden * noise_hidden 26 | 27 | hy = torch.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) 28 | return hy 29 | 30 | 31 | def SkipConnectLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 32 | input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in 33 | 34 | hx, cx = hidden 35 | hx = torch.cat([hx, hidden_skip], dim=1) 36 | hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden 37 | 38 | gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh) 39 | 40 | ingate, forgetgate, cellgate, outgate = gates 41 | 42 | ingate = torch.sigmoid(ingate) 43 | forgetgate = torch.sigmoid(forgetgate) 44 | cellgate = torch.tanh(cellgate) 45 | outgate = torch.sigmoid(outgate) 46 | 47 | cy = (forgetgate * cx) + (ingate * cellgate) 48 | hy = outgate * torch.tanh(cy) 49 | 50 | return hy, cy 51 | 52 | 53 | def SkipConnectFastLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 54 | if noise_in is not None: 55 | input = input * noise_in 56 | 57 | hx, cx = hidden 58 | hx = torch.cat([hx, hidden_skip], dim=1) 59 | if noise_hidden is not None: 60 | hx = hx * noise_hidden 61 | 62 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) 63 | 64 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 65 | 66 | ingate = torch.sigmoid(ingate) 67 | forgetgate = torch.sigmoid(forgetgate) 68 | cellgate = torch.tanh(cellgate) 69 | outgate = torch.sigmoid(outgate) 70 | 71 | cy = (forgetgate * cx) + (ingate * cellgate) 72 | hy = outgate * torch.tanh(cy) 73 | 74 | return hy, cy 75 | 76 | 77 | def SkipConnectGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 78 | input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in 79 | hx = torch.cat([hidden, hidden_skip], dim=1) 80 | hx = hx.expand(3, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden 81 | 82 | gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) 83 | gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh) 84 | i_r, i_i, i_n = gi 85 | h_r, h_i, h_n = gh 86 | 87 | resetgate = torch.sigmoid(i_r + h_r) 88 | inputgate = torch.sigmoid(i_i + h_i) 89 | newgate = torch.tanh(i_n + resetgate * h_n) 90 | hy = newgate + inputgate * (hidden - newgate) 91 | 92 | return hy 93 | 94 | 95 | def SkipConnectFastGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 96 | if noise_in is not None: 97 | input = input * noise_in 98 | 99 | hx = torch.cat([hidden, hidden_skip], dim=1) 100 | if noise_hidden is not None: 101 | hx = hx * noise_hidden 102 | 103 | gi = F.linear(input, w_ih, b_ih) 104 | gh = F.linear(hx, w_hh, b_hh) 105 | i_r, i_i, i_n = gi.chunk(3, 1) 106 | h_r, h_i, h_n = gh.chunk(3, 1) 107 | 108 | resetgate = torch.sigmoid(i_r + h_r) 109 | inputgate = torch.sigmoid(i_i + h_i) 110 | newgate = torch.tanh(i_n + resetgate * h_n) 111 | hy = newgate + inputgate * (hidden - newgate) 112 | 113 | return hy 114 | 115 | 116 | def SkipConnectRecurrent(reverse=False): 117 | def forward(input, skip_connect, hidden, cell, mask): 118 | # hack to handle LSTM 119 | h0 = hidden[0] if isinstance(hidden, tuple) else hidden 120 | # [length + 1, batch, hidden_size] 121 | output = input.new_zeros(input.size(0) + 1, *h0.size()) + h0 122 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) 123 | # create batch index 124 | batch_index = torch.arange(0, h0.size(0)).type_as(skip_connect) 125 | for i in steps: 126 | if mask is None or mask[i].data.min() > 0.5: 127 | hidden_skip = output[skip_connect[i], batch_index] 128 | hidden = cell(input[i], hidden, hidden_skip) 129 | elif mask[i].data.max() > 0.5: 130 | hidden_skip = output[skip_connect[i], batch_index] 131 | hidden_next = cell(input[i], hidden, hidden_skip) 132 | # hack to handle LSTM 133 | if isinstance(hidden, tuple): 134 | hx, cx = hidden 135 | hp1, cp1 = hidden_next 136 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) 137 | else: 138 | hidden = hidden + (hidden_next - hidden) * mask[i] 139 | # hack to handle LSTM 140 | if reverse: 141 | output[i] = hidden[0] if isinstance(hidden, tuple) else hidden 142 | else: 143 | output[i + 1] = hidden[0] if isinstance(hidden, tuple) else hidden 144 | 145 | if reverse: 146 | # remove last position 147 | output = output[:-1] 148 | else: 149 | # remove position 0 150 | output = output[1:] 151 | 152 | return hidden, output 153 | 154 | return forward 155 | 156 | 157 | def StackedRNN(inners, num_layers, lstm=False): 158 | num_directions = len(inners) 159 | total_layers = num_layers * num_directions 160 | 161 | def reverse_skip_connection(skip_connect): 162 | # TODO reverse skip connection for bidirectional rnn. 163 | return skip_connect 164 | 165 | def forward(input, skip_connect, hidden, cells, mask): 166 | assert (len(cells) == total_layers) 167 | next_hidden = [] 168 | 169 | skip_connect_forward = skip_connect 170 | skip_connec_backward = reverse_skip_connection(skip_connect) if num_directions == 2 else None 171 | 172 | if lstm: 173 | hidden = list(zip(*hidden)) 174 | 175 | for i in range(num_layers): 176 | all_output = [] 177 | for j, inner in enumerate(inners): 178 | l = i * num_directions + j 179 | skip_connect = skip_connect_forward if j == 0 else skip_connec_backward 180 | hy, output = inner(input, skip_connect, hidden[l], cells[l], mask) 181 | next_hidden.append(hy) 182 | all_output.append(output) 183 | 184 | input = torch.cat(all_output, input.dim() - 1) 185 | 186 | if lstm: 187 | next_h, next_c = zip(*next_hidden) 188 | next_hidden = ( 189 | torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), 190 | torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) 191 | ) 192 | else: 193 | next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) 194 | 195 | return next_hidden, input 196 | 197 | return forward 198 | 199 | 200 | def AutogradSkipConnectRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False): 201 | rec_factory = SkipConnectRecurrent 202 | 203 | if bidirectional: 204 | layer = (rec_factory(), rec_factory(reverse=True)) 205 | else: 206 | layer = (rec_factory(),) 207 | 208 | func = StackedRNN(layer, 209 | num_layers, 210 | lstm=lstm) 211 | 212 | def forward(input, skip_connect, cells, hidden, mask): 213 | if batch_first: 214 | input = input.transpose(0, 1) 215 | skip_connect = skip_connect.transpose(0, 1) 216 | if mask is not None: 217 | mask = mask.transpose(0, 1) 218 | 219 | nexth, output = func(input, skip_connect, hidden, cells, mask) 220 | 221 | if batch_first: 222 | output = output.transpose(0, 1) 223 | 224 | return output, nexth 225 | 226 | return forward 227 | 228 | 229 | def SkipConnectStep(): 230 | def forward(input, hidden, hidden_skip, cell, mask): 231 | if mask is None or mask.data.min() > 0.5: 232 | hidden = cell(input, hidden, hidden_skip) 233 | elif mask.data.max() > 0.5: 234 | hidden_next = cell(input, hidden, hidden_skip) 235 | # hack to handle LSTM 236 | if isinstance(hidden, tuple): 237 | hx, cx = hidden 238 | hp1, cp1 = hidden_next 239 | hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) 240 | else: 241 | hidden = hidden + (hidden_next - hidden) * mask 242 | # hack to handle LSTM 243 | output = hidden[0] if isinstance(hidden, tuple) else hidden 244 | 245 | return hidden, output 246 | 247 | return forward 248 | 249 | 250 | def StackedStep(layer, num_layers, lstm=False): 251 | def forward(input, hidden, hidden_skip, cells, mask): 252 | assert (len(cells) == num_layers) 253 | next_hidden = [] 254 | 255 | if lstm: 256 | hidden = list(zip(*hidden)) 257 | 258 | for l in range(num_layers): 259 | hy, output = layer(input, hidden[l], hidden_skip[l], cells[l], mask) 260 | next_hidden.append(hy) 261 | input = output 262 | 263 | if lstm: 264 | next_h, next_c = zip(*next_hidden) 265 | next_hidden = ( 266 | torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), 267 | torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) 268 | ) 269 | else: 270 | next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) 271 | 272 | return next_hidden, input 273 | 274 | return forward 275 | 276 | 277 | def AutogradSkipConnectStep(num_layers=1, lstm=False): 278 | layer = SkipConnectStep() 279 | 280 | func = StackedStep(layer, 281 | num_layers, 282 | lstm=lstm) 283 | 284 | def forward(input, cells, hidden, hidden_skip, mask): 285 | nexth, output = func(input, hidden, hidden_skip, cells, mask) 286 | return output, nexth 287 | 288 | return forward 289 | -------------------------------------------------------------------------------- /neuronlp2/nn/_functions/variational_rnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import torch 4 | from torch.nn import functional as F 5 | 6 | 7 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 8 | if noise_in is not None: 9 | input = input * noise_in 10 | if noise_hidden is not None: 11 | hidden = hidden * noise_hidden 12 | hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) 13 | return hy 14 | 15 | 16 | def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 17 | if noise_in is not None: 18 | input = input * noise_in 19 | if noise_hidden is not None: 20 | hidden = hidden * noise_hidden 21 | hy = torch.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) 22 | return hy 23 | 24 | 25 | def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 26 | input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in 27 | 28 | hx, cx = hidden 29 | hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden 30 | 31 | gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh) 32 | 33 | ingate, forgetgate, cellgate, outgate = gates 34 | 35 | ingate = torch.sigmoid(ingate) 36 | forgetgate = torch.sigmoid(forgetgate) 37 | cellgate = torch.tanh(cellgate) 38 | outgate = torch.sigmoid(outgate) 39 | 40 | cy = (forgetgate * cx) + (ingate * cellgate) 41 | hy = outgate * torch.tanh(cy) 42 | 43 | return hy, cy 44 | 45 | 46 | def VarFastLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 47 | if noise_in is not None: 48 | input = input * noise_in 49 | 50 | hx, cx = hidden 51 | if noise_hidden is not None: 52 | hx = hx * noise_hidden 53 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) 54 | 55 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) 56 | 57 | ingate = torch.sigmoid(ingate) 58 | forgetgate = torch.sigmoid(forgetgate) 59 | cellgate = torch.tanh(cellgate) 60 | outgate = torch.sigmoid(outgate) 61 | 62 | cy = (forgetgate * cx) + (ingate * cellgate) 63 | hy = outgate * torch.tanh(cy) 64 | 65 | return hy, cy 66 | 67 | 68 | def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 69 | input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in 70 | hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden 71 | 72 | gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) 73 | gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh) 74 | i_r, i_i, i_n = gi 75 | h_r, h_i, h_n = gh 76 | 77 | resetgate = torch.sigmoid(i_r + h_r) 78 | inputgate = torch.sigmoid(i_i + h_i) 79 | newgate = torch.tanh(i_n + resetgate * h_n) 80 | hy = newgate + inputgate * (hidden - newgate) 81 | 82 | return hy 83 | 84 | 85 | def VarFastGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 86 | if noise_in is not None: 87 | input = input * noise_in 88 | 89 | hx = hidden if noise_hidden is None else hidden * noise_hidden 90 | 91 | gi = F.linear(input, w_ih, b_ih) 92 | gh = F.linear(hx, w_hh, b_hh) 93 | i_r, i_i, i_n = gi.chunk(3, 1) 94 | h_r, h_i, h_n = gh.chunk(3, 1) 95 | 96 | resetgate = torch.sigmoid(i_r + h_r) 97 | inputgate = torch.sigmoid(i_i + h_i) 98 | newgate = torch.tanh(i_n + resetgate * h_n) 99 | hy = newgate + inputgate * (hidden - newgate) 100 | 101 | return hy 102 | 103 | 104 | def VarRecurrent(reverse=False): 105 | def forward(input, hidden, cell, mask): 106 | output = [] 107 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) 108 | for i in steps: 109 | if mask is None or mask[i].data.min() > 0.5: 110 | hidden = cell(input[i], hidden) 111 | elif mask[i].data.max() > 0.5: 112 | hidden_next = cell(input[i], hidden) 113 | # hack to handle LSTM 114 | if isinstance(hidden, tuple): 115 | hx, cx = hidden 116 | hp1, cp1 = hidden_next 117 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) 118 | else: 119 | hidden = hidden + (hidden_next - hidden) * mask[i] 120 | # hack to handle LSTM 121 | output.append(hidden[0] if isinstance(hidden, tuple) else hidden) 122 | 123 | if reverse: 124 | output.reverse() 125 | output = torch.cat(output, 0).view(input.size(0), *output[0].size()) 126 | 127 | return hidden, output 128 | 129 | return forward 130 | 131 | 132 | def StackedRNN(inners, num_layers, lstm=False): 133 | num_directions = len(inners) 134 | total_layers = num_layers * num_directions 135 | 136 | def forward(input, hidden, cells, mask): 137 | assert (len(cells) == total_layers) 138 | next_hidden = [] 139 | 140 | if lstm: 141 | hidden = list(zip(*hidden)) 142 | 143 | for i in range(num_layers): 144 | all_output = [] 145 | for j, inner in enumerate(inners): 146 | l = i * num_directions + j 147 | hy, output = inner(input, hidden[l], cells[l], mask) 148 | next_hidden.append(hy) 149 | all_output.append(output) 150 | 151 | input = torch.cat(all_output, input.dim() - 1) 152 | 153 | if lstm: 154 | next_h, next_c = zip(*next_hidden) 155 | next_hidden = ( 156 | torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), 157 | torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) 158 | ) 159 | else: 160 | next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) 161 | 162 | return next_hidden, input 163 | 164 | return forward 165 | 166 | 167 | def AutogradVarRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False): 168 | rec_factory = VarRecurrent 169 | 170 | if bidirectional: 171 | layer = (rec_factory(), rec_factory(reverse=True)) 172 | else: 173 | layer = (rec_factory(),) 174 | 175 | func = StackedRNN(layer, 176 | num_layers, 177 | lstm=lstm) 178 | 179 | def forward(input, cells, hidden, mask): 180 | if batch_first: 181 | input = input.transpose(0, 1) 182 | if mask is not None: 183 | mask = mask.transpose(0, 1) 184 | 185 | nexth, output = func(input, hidden, cells, mask) 186 | 187 | if batch_first: 188 | output = output.transpose(0, 1) 189 | 190 | return output, nexth 191 | 192 | return forward 193 | 194 | 195 | def VarRNNStep(): 196 | def forward(input, hidden, cell, mask): 197 | if mask is None or mask.data.min() > 0.5: 198 | hidden = cell(input, hidden) 199 | elif mask.data.max() > 0.5: 200 | hidden_next = cell(input, hidden) 201 | # hack to handle LSTM 202 | if isinstance(hidden, tuple): 203 | hx, cx = hidden 204 | hp1, cp1 = hidden_next 205 | hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) 206 | else: 207 | hidden = hidden + (hidden_next - hidden) * mask 208 | # hack to handle LSTM 209 | output = hidden[0] if isinstance(hidden, tuple) else hidden 210 | 211 | return hidden, output 212 | 213 | return forward 214 | 215 | 216 | def StackedStep(layer, num_layers, lstm=False): 217 | def forward(input, hidden, cells, mask): 218 | assert (len(cells) == num_layers) 219 | next_hidden = [] 220 | 221 | if lstm: 222 | hidden = list(zip(*hidden)) 223 | 224 | for l in range(num_layers): 225 | hy, output = layer(input, hidden[l], cells[l], mask) 226 | next_hidden.append(hy) 227 | input = output 228 | 229 | if lstm: 230 | next_h, next_c = zip(*next_hidden) 231 | next_hidden = ( 232 | torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), 233 | torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) 234 | ) 235 | else: 236 | next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) 237 | 238 | return next_hidden, input 239 | 240 | return forward 241 | 242 | 243 | def AutogradVarRNNStep(num_layers=1, lstm=False): 244 | layer = VarRNNStep() 245 | 246 | func = StackedStep(layer, 247 | num_layers, 248 | lstm=lstm) 249 | 250 | def forward(input, cells, hidden, mask): 251 | nexth, output = func(input, hidden, cells, mask) 252 | return output, nexth 253 | 254 | return forward 255 | -------------------------------------------------------------------------------- /neuronlp2/nn/crf.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn.parameter import Parameter 7 | from neuronlp2.nn.modules import BiAffine 8 | 9 | 10 | class ChainCRF(nn.Module): 11 | def __init__(self, input_size, num_labels, bigram=True): 12 | ''' 13 | 14 | Args: 15 | input_size: int 16 | the dimension of the input. 17 | num_labels: int 18 | the number of labels of the crf layer 19 | bigram: bool 20 | if apply bi-gram parameter. 21 | ''' 22 | super(ChainCRF, self).__init__() 23 | self.input_size = input_size 24 | self.num_labels = num_labels + 1 25 | self.pad_label_id = num_labels 26 | self.bigram = bigram 27 | 28 | 29 | # state weight tensor 30 | self.state_net = nn.Linear(input_size, self.num_labels) 31 | if bigram: 32 | # transition weight tensor 33 | self.transition_net = nn.Linear(input_size, self.num_labels * self.num_labels) 34 | self.register_parameter('transition_matrix', None) 35 | else: 36 | self.transition_net = None 37 | self.transition_matrix = Parameter(torch.Tensor(self.num_labels, self.num_labels)) 38 | 39 | self.reset_parameters() 40 | 41 | def reset_parameters(self): 42 | nn.init.constant_(self.state_net.bias, 0.) 43 | if self.bigram: 44 | nn.init.xavier_uniform_(self.transition_net.weight) 45 | nn.init.constant_(self.transition_net.bias, 0.) 46 | else: 47 | nn.init.normal_(self.transition_matrix) 48 | 49 | def forward(self, input, mask=None): 50 | ''' 51 | 52 | Args: 53 | input: Tensor 54 | the input tensor with shape = [batch, length, model_dim] 55 | mask: Tensor or None 56 | the mask tensor with shape = [batch, length] 57 | 58 | Returns: Tensor 59 | the energy tensor with shape = [batch, length, num_label, num_label] 60 | 61 | ''' 62 | batch, length, _ = input.size() 63 | 64 | # compute out_s by tensor dot [batch, length, model_dim] * [model_dim, num_label] 65 | # thus out_s should be [batch, length, num_label] --> [batch, length, num_label, 1] 66 | out_s = self.state_net(input).unsqueeze(2) 67 | 68 | if self.bigram: 69 | # compute out_s by tensor dot: [batch, length, model_dim] * [model_dim, num_label * num_label] 70 | # the output should be [batch, length, num_label, num_label] 71 | out_t = self.transition_net(input).view(batch, length, self.num_labels, self.num_labels) 72 | output = out_t + out_s 73 | else: 74 | # [batch, length, num_label, num_label] 75 | output = self.transition_matrix + out_s 76 | 77 | if mask is not None: 78 | output = output * mask.unsqueeze(2).unsqueeze(3) 79 | 80 | return output 81 | 82 | def loss(self, input, target, mask=None): 83 | ''' 84 | 85 | Args: 86 | input: Tensor 87 | the input tensor with shape = [batch, length, model_dim] 88 | target: Tensor 89 | the tensor of target labels with shape [batch, length] 90 | mask:Tensor or None 91 | the mask tensor with shape = [batch, length] 92 | 93 | Returns: Tensor 94 | A 1D tensor for minus log likelihood loss [batch] 95 | ''' 96 | batch, length, _ = input.size() 97 | energy = self(input, mask=mask) 98 | # shape = [length, batch, num_label, num_label] 99 | energy_transpose = energy.transpose(0, 1) 100 | # shape = [length, batch] 101 | target_transpose = target.transpose(0, 1) 102 | # shape = [length, batch, 1] 103 | mask_transpose = None 104 | if mask is not None: 105 | mask_transpose = mask.unsqueeze(2).transpose(0, 1) 106 | 107 | # shape = [batch, num_label] 108 | partition = None 109 | 110 | # shape = [batch] 111 | batch_index = torch.arange(0, batch).type_as(input).long() 112 | prev_label = input.new_full((batch, ), self.num_labels - 1).long() 113 | tgt_energy = input.new_zeros(batch) 114 | 115 | for t in range(length): 116 | # shape = [batch, num_label, num_label] 117 | curr_energy = energy_transpose[t] 118 | if t == 0: 119 | partition = curr_energy[:, -1, :] 120 | else: 121 | # shape = [batch, num_label] 122 | partition_new = torch.logsumexp(curr_energy + partition.unsqueeze(2), dim=1) 123 | if mask_transpose is None: 124 | partition = partition_new 125 | else: 126 | mask_t = mask_transpose[t] 127 | partition = partition + (partition_new - partition) * mask_t 128 | tgt_energy += curr_energy[batch_index, prev_label, target_transpose[t]] 129 | prev_label = target_transpose[t] 130 | 131 | return torch.logsumexp(partition, dim=1) - tgt_energy 132 | 133 | def decode(self, input, mask=None, leading_symbolic=0): 134 | """ 135 | 136 | Args: 137 | input: Tensor 138 | the input tensor with shape = [batch, length, model_dim] 139 | mask: Tensor or None 140 | the mask tensor with shape = [batch, length] 141 | leading_symbolic: nt 142 | number of symbolic labels leading in type alphabets (set it to 0 if you are not sure) 143 | 144 | Returns: Tensor 145 | decoding results in shape [batch, length] 146 | 147 | """ 148 | 149 | energy = self(input, mask=mask) 150 | 151 | # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels) 152 | # For convenience, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels) 153 | energy_transpose = energy.transpose(0, 1) 154 | 155 | # the last row and column is the tag for pad symbol. reduce these two dimensions by 1 to remove that. 156 | # also remove the first #symbolic rows and columns. 157 | # now the shape of energies_shuffled is [n_time_steps, b_batch, t, t] where t = num_labels - #symbolic - 1. 158 | energy_transpose = energy_transpose[:, :, leading_symbolic:-1, leading_symbolic:-1] 159 | 160 | length, batch_size, num_label, _ = energy_transpose.size() 161 | 162 | batch_index = torch.arange(0, batch_size).type_as(input).long() 163 | pi = input.new_zeros([length, batch_size, num_label]) 164 | pointer = batch_index.new_zeros(length, batch_size, num_label) 165 | back_pointer = batch_index.new_zeros(length, batch_size) 166 | 167 | pi[0] = energy[:, 0, -1, leading_symbolic:-1] 168 | pointer[0] = -1 169 | for t in range(1, length): 170 | pi_prev = pi[t - 1] 171 | pi[t], pointer[t] = torch.max(energy_transpose[t] + pi_prev.unsqueeze(2), dim=1) 172 | 173 | _, back_pointer[-1] = torch.max(pi[-1], dim=1) 174 | for t in reversed(range(length - 1)): 175 | pointer_last = pointer[t + 1] 176 | back_pointer[t] = pointer_last[batch_index, back_pointer[t + 1]] 177 | 178 | return back_pointer.transpose(0, 1) + leading_symbolic 179 | 180 | 181 | class TreeCRF(nn.Module): 182 | ''' 183 | Tree CRF layer. 184 | ''' 185 | def __init__(self, model_dim): 186 | """ 187 | 188 | Args: 189 | model_dim: int 190 | the dimension of the input. 191 | 192 | """ 193 | super(TreeCRF, self).__init__() 194 | self.model_dim = model_dim 195 | self.energy = BiAffine(model_dim, model_dim) 196 | 197 | def forward(self, heads, children, mask=None): 198 | ''' 199 | 200 | Args: 201 | heads: Tensor 202 | the head input tensor with shape = [batch, length, model_dim] 203 | children: Tensor 204 | the child input tensor with shape = [batch, length, model_dim] 205 | mask: Tensor or None 206 | the mask tensor with shape = [batch, length] 207 | lengths: Tensor or None 208 | the length tensor with shape = [batch] 209 | 210 | Returns: Tensor 211 | the energy tensor with shape = [batch, length, length] 212 | 213 | ''' 214 | batch, length, _ = heads.size() 215 | # [batch, length, length] 216 | output = self.energy(heads, children, mask_query=mask, mask_key=mask) 217 | return output 218 | 219 | def loss(self, heads, children, target_heads, mask=None): 220 | ''' 221 | 222 | Args: 223 | heads: Tensor 224 | the head input tensor with shape = [batch, length, model_dim] 225 | children: Tensor 226 | the child input tensor with shape = [batch, length, model_dim] 227 | target_heads: Tensor 228 | the tensor of target labels with shape [batch, length] 229 | mask:Tensor or None 230 | the mask tensor with shape = [batch, length] 231 | 232 | Returns: Tensor 233 | A 1D tensor for minus log likelihood loss 234 | ''' 235 | batch, length, _ = heads.size() 236 | # [batch, length, length] 237 | energy = self(heads, children, mask=mask).double() 238 | A = torch.exp(energy) 239 | # mask out invalid positions 240 | if mask is not None: 241 | mask = mask.double() 242 | A = A * mask.unsqueeze(2) * mask.unsqueeze(1) 243 | 244 | # set diagonal elements to 0 245 | diag_mask = 1.0 - torch.eye(length).unsqueeze(0).type_as(energy) 246 | A = A * diag_mask 247 | energy = energy * diag_mask 248 | 249 | # get D [batch, length] 250 | D = A.sum(dim=1) 251 | rtol = 1e-4 252 | atol = 1e-6 253 | D += atol 254 | if mask is not None: 255 | D = D * mask 256 | 257 | # [batch, length, length] 258 | D = torch.diag_embed(D) 259 | 260 | # compute laplacian matrix 261 | # [batch, length, length] 262 | L = D - A 263 | 264 | if mask is not None: 265 | L = L + torch.diag_embed(1. - mask) 266 | 267 | # compute partition Z(x) [batch] 268 | L = L[:, 1:, 1:] 269 | z = torch.logdet(L) 270 | 271 | # first create index matrix [length, batch] 272 | index = torch.arange(0, length).view(length, 1).expand(length, batch) 273 | index = index.type_as(energy).long() 274 | batch_index = torch.arange(0, batch).type_as(index) 275 | # compute target energy [length-1, batch] 276 | tgt_energy = energy[batch_index, target_heads.t(), index][1:] 277 | # sum over dim=0 shape = [batch] 278 | tgt_energy = tgt_energy.sum(dim=0) 279 | 280 | return (z - tgt_energy).float() 281 | -------------------------------------------------------------------------------- /neuronlp2/nn/init.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import torch 4 | 5 | 6 | def assign_tensor(tensor, val): 7 | """ 8 | copy val to tensor 9 | Args: 10 | tensor: an n-dimensional torch.Tensor or autograd.Variable 11 | val: an n-dimensional torch.Tensor to fill the tensor with 12 | 13 | Returns: 14 | 15 | """ 16 | with torch.no_grad(): 17 | return tensor.copy_(val) 18 | -------------------------------------------------------------------------------- /neuronlp2/nn/modules.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from overrides import overrides 4 | from collections import OrderedDict 5 | import math 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn.parameter import Parameter 11 | 12 | 13 | class BiLinear(nn.Module): 14 | """ 15 | Bi-linear layer 16 | """ 17 | def __init__(self, left_features, right_features, out_features, bias=True): 18 | """ 19 | 20 | Args: 21 | left_features: size of left input 22 | right_features: size of right input 23 | out_features: size of output 24 | bias: If set to False, the layer will not learn an additive bias. 25 | Default: True 26 | """ 27 | super(BiLinear, self).__init__() 28 | self.left_features = left_features 29 | self.right_features = right_features 30 | self.out_features = out_features 31 | 32 | self.U = Parameter(torch.Tensor(self.out_features, self.left_features, self.right_features)) 33 | self.weight_left = Parameter(torch.Tensor(self.out_features, self.left_features)) 34 | self.weight_right = Parameter(torch.Tensor(self.out_features, self.right_features)) 35 | 36 | if bias: 37 | self.bias = Parameter(torch.Tensor(out_features)) 38 | else: 39 | self.register_parameter('bias', None) 40 | 41 | self.reset_parameters() 42 | 43 | def reset_parameters(self): 44 | nn.init.xavier_uniform_(self.weight_left) 45 | nn.init.xavier_uniform_(self.weight_right) 46 | nn.init.constant_(self.bias, 0.) 47 | nn.init.xavier_uniform_(self.U) 48 | 49 | def forward(self, input_left, input_right): 50 | """ 51 | 52 | Args: 53 | input_left: Tensor 54 | the left input tensor with shape = [batch1, batch2, ..., left_features] 55 | input_right: Tensor 56 | the right input tensor with shape = [batch1, batch2, ..., right_features] 57 | 58 | Returns: 59 | 60 | """ 61 | 62 | batch_size = input_left.size()[:-1] 63 | batch = int(np.prod(batch_size)) 64 | 65 | # convert left and right input to matrices [batch, left_features], [batch, right_features] 66 | input_left = input_left.view(batch, self.left_features) 67 | input_right = input_right.view(batch, self.right_features) 68 | 69 | # output [batch, out_features] 70 | output = F.bilinear(input_left, input_right, self.U, self.bias) 71 | output = output + F.linear(input_left, self.weight_left, None) + F.linear(input_right, self.weight_right, None) 72 | # convert back to [batch1, batch2, ..., out_features] 73 | return output.view(batch_size + (self.out_features, )) 74 | 75 | def __repr__(self): 76 | return self.__class__.__name__ + ' (' \ 77 | + 'left_features=' + str(self.left_features) \ 78 | + ', right_features=' + str(self.right_features) \ 79 | + ', out_features=' + str(self.out_features) + ')' 80 | 81 | 82 | class BiAffine(nn.Module): 83 | ''' 84 | Bi-Affine energy layer. 85 | ''' 86 | 87 | def __init__(self, key_dim, query_dim): 88 | ''' 89 | 90 | Args: 91 | key_dim: int 92 | the dimension of the key. 93 | query_dim: int 94 | the dimension of the query. 95 | 96 | ''' 97 | super(BiAffine, self).__init__() 98 | self.key_dim = key_dim 99 | self.query_dim = query_dim 100 | 101 | self.q_weight = Parameter(torch.Tensor(self.query_dim)) 102 | self.key_weight = Parameter(torch.Tensor(self.key_dim)) 103 | self.b = Parameter(torch.Tensor(1)) 104 | self.U = Parameter(torch.Tensor(self.query_dim, self.key_dim)) 105 | self.reset_parameters() 106 | 107 | def reset_parameters(self): 108 | bound = 1 / math.sqrt(self.query_dim) 109 | nn.init.uniform_(self.q_weight, -bound, bound) 110 | bound = 1 / math.sqrt(self.key_dim) 111 | nn.init.uniform_(self.key_weight, -bound, bound) 112 | nn.init.constant_(self.b, 0.) 113 | nn.init.xavier_uniform_(self.U) 114 | 115 | def forward(self, query, key, mask_query=None, mask_key=None): 116 | """ 117 | 118 | Args: 119 | query: Tensor 120 | the decoder input tensor with shape = [batch, length_query, query_dim] 121 | key: Tensor 122 | the child input tensor with shape = [batch, length_key, key_dim] 123 | mask_query: Tensor or None 124 | the mask tensor for decoder with shape = [batch, length_query] 125 | mask_key: Tensor or None 126 | the mask tensor for encoder with shape = [batch, length_key] 127 | 128 | Returns: Tensor 129 | the energy tensor with shape = [batch, length_query, length_key] 130 | 131 | """ 132 | # output shape [batch, length_query, length_key] 133 | # compute bi-affine part 134 | # [batch, length_query, query_dim] * [query_dim, key_dim] 135 | # output shape [batch, length_query, key_dim] 136 | output = torch.matmul(query, self.U) 137 | # [batch, length_query, key_dim] * [batch, key_dim, length_key] 138 | # output shape [batch, length_query, length_key] 139 | output = torch.matmul(output, key.transpose(1, 2)) 140 | 141 | # compute query part: [query_dim] * [batch, query_dim, length_query] 142 | # the output shape is [batch, length_query, 1] 143 | out_q = torch.matmul(self.q_weight, query.transpose(1, 2)).unsqueeze(2) 144 | # compute decoder part: [key_dim] * [batch, key_dim, length_key] 145 | # the output shape is [batch, 1, length_key] 146 | out_k = torch.matmul(self.key_weight, key.transpose(1, 2)).unsqueeze(1) 147 | 148 | output = output + out_q + out_k + self.b 149 | 150 | if mask_query is not None: 151 | output = output * mask_query.unsqueeze(2) 152 | if mask_key is not None: 153 | output = output * mask_key.unsqueeze(1) 154 | return output 155 | 156 | @overrides 157 | def extra_repr(self): 158 | s = '{key_dim}, {query_dim}' 159 | return s.format(**self.__dict__) 160 | 161 | 162 | class CharCNN(nn.Module): 163 | """ 164 | CNN layers for characters 165 | """ 166 | def __init__(self, num_layers, in_channels, out_channels, hidden_channels=None, activation='elu'): 167 | super(CharCNN, self).__init__() 168 | assert activation in ['elu', 'tanh'] 169 | if activation == 'elu': 170 | ACT = nn.ELU 171 | else: 172 | ACT = nn.Tanh 173 | layers = list() 174 | for i in range(num_layers - 1): 175 | layers.append(('conv{}'.format(i), nn.Conv1d(in_channels, hidden_channels, kernel_size=3, padding=1))) 176 | layers.append(('act{}'.format(i), ACT())) 177 | in_channels = hidden_channels 178 | layers.append(('conv_top', nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1))) 179 | layers.append(('act_top', ACT())) 180 | self.act = ACT 181 | self.net = nn.Sequential(OrderedDict(layers)) 182 | 183 | self.reset_parameters() 184 | 185 | def reset_parameters(self): 186 | for layer in self.net: 187 | if isinstance(layer, nn.Conv1d): 188 | nn.init.xavier_uniform_(layer.weight) 189 | nn.init.constant_(layer.bias, 0.) 190 | else: 191 | assert isinstance(layer, self.act) 192 | 193 | def forward(self, char): 194 | """ 195 | 196 | Args: 197 | char: Tensor 198 | the input tensor of character [batch, sent_length, char_length, in_channels] 199 | 200 | Returns: Tensor 201 | output character encoding with shape [batch, sent_length, in_channels] 202 | 203 | """ 204 | # [batch, sent_length, char_length, in_channels] 205 | char_size = char.size() 206 | # first transform to [batch * sent_length, char_length, in_channels] 207 | # then transpose to [batch * sent_length, in_channels, char_length] 208 | char = char.view(-1, char_size[2], char_size[3]).transpose(1, 2) 209 | # [batch * sent_length, out_channels, char_length] 210 | char = self.net(char).max(dim=2)[0] 211 | # [batch, sent_length, out_channels] 212 | return char.view(char_size[0], char_size[1], -1) 213 | -------------------------------------------------------------------------------- /neuronlp2/nn/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from itertools import repeat 3 | import torch 4 | import torch.nn as nn 5 | from torch._six import inf 6 | 7 | 8 | def _ntuple(n): 9 | def parse(x): 10 | if isinstance(x, collections.Iterable): 11 | return x 12 | return tuple(repeat(x, n)) 13 | return parse 14 | 15 | _single = _ntuple(1) 16 | _pair = _ntuple(2) 17 | _triple = _ntuple(3) 18 | _quadruple = _ntuple(4) 19 | 20 | 21 | def freeze_embedding(embedding): 22 | assert isinstance(embedding, nn.Embedding), "input should be an Embedding module." 23 | embedding.weight.detach_() 24 | 25 | def total_grad_norm(parameters, norm_type=2): 26 | if isinstance(parameters, torch.Tensor): 27 | parameters = [parameters] 28 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 29 | norm_type = float(norm_type) 30 | if norm_type == inf: 31 | total_norm = max(p.grad.data.abs().max() for p in parameters) 32 | else: 33 | total_norm = 0 34 | for p in parameters: 35 | param_norm = p.grad.data.norm(norm_type) 36 | total_norm += param_norm.item() ** norm_type 37 | total_norm = total_norm ** (1. / norm_type) 38 | return total_norm 39 | -------------------------------------------------------------------------------- /neuronlp2/optim/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.optim.lr_scheduler import InverseSquareRootScheduler, ExponentialScheduler 4 | -------------------------------------------------------------------------------- /neuronlp2/optim/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from collections import defaultdict 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class _LRScheduler(object): 8 | def __init__(self, optimizer, last_epoch=-1): 9 | if not isinstance(optimizer, Optimizer): 10 | raise TypeError('{} is not an Optimizer'.format( 11 | type(optimizer).__name__)) 12 | self.optimizer = optimizer 13 | if last_epoch == -1: 14 | for group in optimizer.param_groups: 15 | group.setdefault('initial_lr', group['lr']) 16 | last_epoch = 0 17 | else: 18 | for i, group in enumerate(optimizer.param_groups): 19 | if 'initial_lr' not in group: 20 | raise KeyError("param 'initial_lr' is not specified " 21 | "in param_groups[{}] when resuming an optimizer".format(i)) 22 | self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups)) 23 | 24 | def state_dict(self): 25 | """Returns the state of the scheduler as a :class:`dict`. 26 | 27 | It contains an entry for every variable in self.__dict__ which 28 | is not the optimizer. 29 | """ 30 | return {key: value for key, value in self.__dict__.items() if key != 'optimizer'} 31 | 32 | def load_state_dict(self, state_dict): 33 | """Loads the schedulers state. 34 | 35 | Arguments: 36 | state_dict (dict): scheduler state. Should be an object returned 37 | from a call to :meth:`state_dict`. 38 | """ 39 | self.__dict__.update(state_dict) 40 | 41 | def get_lr(self): 42 | raise NotImplementedError 43 | 44 | def step(self, epoch=None): 45 | if epoch is None: 46 | epoch = self.last_epoch + 1 47 | self.last_epoch = epoch 48 | for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): 49 | param_group['lr'] = lr 50 | 51 | def reset_state(self): 52 | self.optimizer.state.clear() 53 | 54 | 55 | class InverseSquareRootScheduler(_LRScheduler): 56 | """ 57 | Decay the LR based on the inverse square root of the update number. 58 | We also support a warmup phase where we linearly increase the learning rate 59 | from zero until the configured learning rate (``--lr``). 60 | Thereafter we decay proportional to the number of 61 | updates, with a decay factor set to align with the configured learning rate. 62 | During warmup:: 63 | lrs = torch.linspace(0, args.lr, args.warmup_updates) 64 | lr = lrs[update_num] 65 | After warmup:: 66 | decay_factor = args.lr * sqrt(args.warmup_updates) 67 | lr = decay_factor / sqrt(update_num) 68 | """ 69 | def __init__(self, optimizer, warmup_steps, init_lr, last_epoch=-1): 70 | assert warmup_steps > 0, 'warmup steps should be larger than 0.' 71 | super(InverseSquareRootScheduler, self).__init__(optimizer, last_epoch) 72 | self.warmup_steps = float(warmup_steps) 73 | self.init_lr = init_lr 74 | self.lr_steps = [(base_lr - init_lr) / warmup_steps for base_lr in self.base_lrs] 75 | self.decay_factor = self.warmup_steps ** 0.5 76 | if last_epoch == -1: 77 | last_epoch = 0 78 | self.step(last_epoch) 79 | 80 | def get_lr(self): 81 | if self.last_epoch < self.warmup_steps: 82 | return [self.init_lr + lr_step * self.last_epoch for lr_step in self.lr_steps] 83 | else: 84 | lr_factor = self.decay_factor * self.last_epoch**-0.5 85 | return [base_lr * lr_factor for base_lr in self.base_lrs] 86 | 87 | 88 | class ExponentialScheduler(_LRScheduler): 89 | """Set the learning rate of each parameter group to the initial lr decayed 90 | by gamma every epoch. When last_epoch=-1, sets initial lr as lr. 91 | We also support a warmup phase where we linearly increase the learning rate 92 | from zero until the configured learning rate (``--lr``). 93 | Args: 94 | optimizer (Optimizer): Wrapped optimizer. 95 | gamma (float): Multiplicative factor of learning rate decay. 96 | warmup_steps (int): Warmup steps.. 97 | last_epoch (int): The index of last epoch. Default: -1. 98 | """ 99 | 100 | def __init__(self, optimizer, gamma, warmup_steps, init_lr, last_epoch=-1): 101 | super(ExponentialScheduler, self).__init__(optimizer, last_epoch) 102 | self.gamma = gamma 103 | # handle warmup <= 0 104 | self.warmup_steps = max(1, warmup_steps) 105 | self.init_lr = init_lr 106 | self.lr_steps = [(base_lr - init_lr) / self.warmup_steps for base_lr in self.base_lrs] 107 | if last_epoch == -1: 108 | last_epoch = 0 109 | self.step(last_epoch) 110 | 111 | def get_lr(self): 112 | if self.last_epoch < self.warmup_steps: 113 | return [self.init_lr + lr_step * self.last_epoch for lr_step in self.lr_steps] 114 | else: 115 | lr_factor = self.gamma ** (self.last_epoch - self.warmup_steps) 116 | return [base_lr * lr_factor for base_lr in self.base_lrs] 117 | -------------------------------------------------------------------------------- /neuronlp2/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from neuronlp2.tasks.parser import * 4 | -------------------------------------------------------------------------------- /neuronlp2/tasks/parser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | import re 4 | import numpy as np 5 | 6 | def is_uni_punctuation(word): 7 | match = re.match("^[^\w\s]+$]", word, flags=re.UNICODE) 8 | return match is not None 9 | 10 | 11 | def is_punctuation(word, pos, punct_set=None): 12 | if punct_set is None: 13 | return is_uni_punctuation(word) 14 | else: 15 | return pos in punct_set 16 | 17 | 18 | def eval(words, postags, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths, 19 | punct_set=None, symbolic_root=False, symbolic_end=False): 20 | batch_size, _ = words.shape 21 | ucorr = 0. 22 | lcorr = 0. 23 | total = 0. 24 | ucomplete_match = 0. 25 | lcomplete_match = 0. 26 | 27 | ucorr_nopunc = 0. 28 | lcorr_nopunc = 0. 29 | total_nopunc = 0. 30 | ucomplete_match_nopunc = 0. 31 | lcomplete_match_nopunc = 0. 32 | 33 | corr_root = 0. 34 | total_root = 0. 35 | start = 1 if symbolic_root else 0 36 | end = 1 if symbolic_end else 0 37 | for i in range(batch_size): 38 | ucm = 1. 39 | lcm = 1. 40 | ucm_nopunc = 1. 41 | lcm_nopunc = 1. 42 | for j in range(start, lengths[i] - end): 43 | word = word_alphabet.get_instance(words[i, j]) 44 | pos = pos_alphabet.get_instance(postags[i, j]) 45 | 46 | total += 1 47 | if heads[i, j] == heads_pred[i, j]: 48 | ucorr += 1 49 | if types[i, j] == types_pred[i, j]: 50 | lcorr += 1 51 | else: 52 | lcm = 0 53 | else: 54 | ucm = 0 55 | lcm = 0 56 | 57 | if not is_punctuation(word, pos, punct_set): 58 | total_nopunc += 1 59 | if heads[i, j] == heads_pred[i, j]: 60 | ucorr_nopunc += 1 61 | if types[i, j] == types_pred[i, j]: 62 | lcorr_nopunc += 1 63 | else: 64 | lcm_nopunc = 0 65 | else: 66 | ucm_nopunc = 0 67 | lcm_nopunc = 0 68 | 69 | if heads[i, j] == 0: 70 | total_root += 1 71 | corr_root += 1 if heads_pred[i, j] == 0 else 0 72 | 73 | ucomplete_match += ucm 74 | lcomplete_match += lcm 75 | ucomplete_match_nopunc += ucm_nopunc 76 | lcomplete_match_nopunc += lcm_nopunc 77 | 78 | return (ucorr, lcorr, total, ucomplete_match, lcomplete_match), \ 79 | (ucorr_nopunc, lcorr_nopunc, total_nopunc, ucomplete_match_nopunc, lcomplete_match_nopunc), \ 80 | (corr_root, total_root), batch_size 81 | 82 | 83 | def decode_MST(energies, lengths, leading_symbolic=0, labeled=True): 84 | """ 85 | decode best parsing tree with MST algorithm. 86 | :param energies: energies: numpy 4D tensor 87 | energies of each edge. the shape is [batch_size, num_labels, n_steps, n_steps], 88 | where the summy root is at index 0. 89 | :param masks: numpy 2D tensor 90 | masks in the shape [batch_size, n_steps]. 91 | :param leading_symbolic: int 92 | number of symbolic dependency types leading in type alphabets) 93 | :return: 94 | """ 95 | 96 | def find_cycle(par): 97 | added = np.zeros([length], np.bool) 98 | added[0] = True 99 | cycle = set() 100 | findcycle = False 101 | for i in range(1, length): 102 | if findcycle: 103 | break 104 | 105 | if added[i] or not curr_nodes[i]: 106 | continue 107 | 108 | # init cycle 109 | tmp_cycle = set() 110 | tmp_cycle.add(i) 111 | added[i] = True 112 | findcycle = True 113 | l = i 114 | 115 | while par[l] not in tmp_cycle: 116 | l = par[l] 117 | if added[l]: 118 | findcycle = False 119 | break 120 | added[l] = True 121 | tmp_cycle.add(l) 122 | 123 | if findcycle: 124 | lorg = l 125 | cycle.add(lorg) 126 | l = par[lorg] 127 | while l != lorg: 128 | cycle.add(l) 129 | l = par[l] 130 | break 131 | 132 | return findcycle, cycle 133 | 134 | def chuLiuEdmonds(): 135 | par = np.zeros([length], dtype=np.int32) 136 | # create best graph 137 | par[0] = -1 138 | for i in range(1, length): 139 | # only interested at current nodes 140 | if curr_nodes[i]: 141 | max_score = score_matrix[0, i] 142 | par[i] = 0 143 | for j in range(1, length): 144 | if j == i or not curr_nodes[j]: 145 | continue 146 | 147 | new_score = score_matrix[j, i] 148 | if new_score > max_score: 149 | max_score = new_score 150 | par[i] = j 151 | 152 | # find a cycle 153 | findcycle, cycle = find_cycle(par) 154 | # no cycles, get all edges and return them. 155 | if not findcycle: 156 | final_edges[0] = -1 157 | for i in range(1, length): 158 | if not curr_nodes[i]: 159 | continue 160 | 161 | pr = oldI[par[i], i] 162 | ch = oldO[par[i], i] 163 | final_edges[ch] = pr 164 | return 165 | 166 | cyc_len = len(cycle) 167 | cyc_weight = 0.0 168 | cyc_nodes = np.zeros([cyc_len], dtype=np.int32) 169 | for id, cyc_node in enumerate(cycle): 170 | cyc_nodes[id] = cyc_node 171 | cyc_weight += score_matrix[par[cyc_node], cyc_node] 172 | 173 | rep = cyc_nodes[0] 174 | for i in range(length): 175 | if not curr_nodes[i] or i in cycle: 176 | continue 177 | 178 | max1 = float("-inf") 179 | wh1 = -1 180 | max2 = float("-inf") 181 | wh2 = -1 182 | 183 | for j in cyc_nodes: 184 | if score_matrix[j, i] > max1: 185 | max1 = score_matrix[j, i] 186 | wh1 = j 187 | 188 | scr = cyc_weight + score_matrix[i, j] - score_matrix[par[j], j] 189 | 190 | if scr > max2: 191 | max2 = scr 192 | wh2 = j 193 | 194 | score_matrix[rep, i] = max1 195 | oldI[rep, i] = oldI[wh1, i] 196 | oldO[rep, i] = oldO[wh1, i] 197 | score_matrix[i, rep] = max2 198 | oldO[i, rep] = oldO[i, wh2] 199 | oldI[i, rep] = oldI[i, wh2] 200 | 201 | rep_cons = [] 202 | for i in range(cyc_len): 203 | rep_cons.append(set()) 204 | cyc_node = cyc_nodes[i] 205 | for cc in reps[cyc_node]: 206 | rep_cons[i].add(cc) 207 | 208 | for cyc_node in cyc_nodes[1:]: 209 | curr_nodes[cyc_node] = False 210 | for cc in reps[cyc_node]: 211 | reps[rep].add(cc) 212 | 213 | chuLiuEdmonds() 214 | 215 | # check each node in cycle, if one of its representatives is a key in the final_edges, it is the one. 216 | found = False 217 | wh = -1 218 | for i in range(cyc_len): 219 | for repc in rep_cons[i]: 220 | if repc in final_edges: 221 | wh = cyc_nodes[i] 222 | found = True 223 | break 224 | if found: 225 | break 226 | 227 | l = par[wh] 228 | while l != wh: 229 | ch = oldO[par[l], l] 230 | pr = oldI[par[l], l] 231 | final_edges[ch] = pr 232 | l = par[l] 233 | 234 | if labeled: 235 | assert energies.ndim == 4, 'dimension of energies is not equal to 4' 236 | else: 237 | assert energies.ndim == 3, 'dimension of energies is not equal to 3' 238 | input_shape = energies.shape 239 | batch_size = input_shape[0] 240 | max_length = input_shape[2] 241 | 242 | pars = np.zeros([batch_size, max_length], dtype=np.int32) 243 | types = np.zeros([batch_size, max_length], dtype=np.int32) if labeled else None 244 | for i in range(batch_size): 245 | energy = energies[i] 246 | 247 | # calc the realy length of this instance 248 | length = lengths[i] 249 | 250 | # calc real energy matrix shape = [length, length, num_labels - #symbolic] (remove the label for symbolic types). 251 | if labeled: 252 | energy = energy[leading_symbolic:, :length, :length] 253 | energy = energy - energy.min() + 1e-6 254 | # get best label for each edge. 255 | label_id_matrix = energy.argmax(axis=0) + leading_symbolic 256 | energy = energy.max(axis=0) 257 | else: 258 | energy = energy[:length, :length] 259 | energy = energy - energy.min() + 1e-6 260 | label_id_matrix = None 261 | # get original score matrix 262 | orig_score_matrix = energy 263 | # initialize score matrix to original score matrix 264 | score_matrix = np.array(orig_score_matrix, copy=True) 265 | 266 | oldI = np.zeros([length, length], dtype=np.int32) 267 | oldO = np.zeros([length, length], dtype=np.int32) 268 | curr_nodes = np.zeros([length], dtype=np.bool) 269 | reps = [] 270 | 271 | for s in range(length): 272 | orig_score_matrix[s, s] = 0.0 273 | score_matrix[s, s] = 0.0 274 | curr_nodes[s] = True 275 | reps.append(set()) 276 | reps[s].add(s) 277 | for t in range(s + 1, length): 278 | oldI[s, t] = s 279 | oldO[s, t] = t 280 | 281 | oldI[t, s] = t 282 | oldO[t, s] = s 283 | 284 | final_edges = dict() 285 | chuLiuEdmonds() 286 | par = np.zeros([max_length], np.int32) 287 | if labeled: 288 | type = np.ones([max_length], np.int32) 289 | type[0] = 0 290 | else: 291 | type = None 292 | 293 | for ch, pr in final_edges.items(): 294 | par[ch] = pr 295 | if labeled and ch != 0: 296 | type[ch] = label_id_matrix[pr, ch] 297 | 298 | par[0] = 0 299 | pars[i] = par 300 | if labeled: 301 | types[i] = type 302 | 303 | return pars, types 304 | -------------------------------------------------------------------------------- /neuronlp2/utils.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | 3 | from collections import OrderedDict 4 | import pickle 5 | import numpy as np 6 | from gensim.models.word2vec import Word2Vec 7 | import gzip 8 | 9 | from neuronlp2.io.common import DIGIT_RE 10 | 11 | 12 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True): 13 | """ 14 | load word embeddings from file 15 | :param embedding: 16 | :param embedding_path: 17 | :return: embedding dict, embedding dimention, caseless 18 | """ 19 | print("loading embedding: %s from %s" % (embedding, embedding_path)) 20 | if embedding == 'word2vec': 21 | # loading word2vec 22 | word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True) 23 | embedd_dim = word2vec.vector_size 24 | return word2vec, embedd_dim 25 | elif embedding == 'glove': 26 | # loading GloVe 27 | embedd_dim = -1 28 | embedd_dict = OrderedDict() 29 | with gzip.open(embedding_path, 'rt') as file: 30 | for line in file: 31 | line = line.strip() 32 | if len(line) == 0: 33 | continue 34 | 35 | tokens = line.split() 36 | if embedd_dim < 0: 37 | embedd_dim = len(tokens) - 1 38 | else: 39 | assert (embedd_dim + 1 == len(tokens)) 40 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 41 | embedd[:] = tokens[1:] 42 | word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0] 43 | embedd_dict[word] = embedd 44 | return embedd_dict, embedd_dim 45 | elif embedding == 'senna': 46 | # loading Senna 47 | embedd_dim = -1 48 | embedd_dict = OrderedDict() 49 | with gzip.open(embedding_path, 'rt') as file: 50 | for line in file: 51 | line = line.strip() 52 | if len(line) == 0: 53 | continue 54 | 55 | tokens = line.split() 56 | if embedd_dim < 0: 57 | embedd_dim = len(tokens) - 1 58 | else: 59 | assert (embedd_dim + 1 == len(tokens)) 60 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 61 | embedd[:] = tokens[1:] 62 | word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0] 63 | embedd_dict[word] = embedd 64 | return embedd_dict, embedd_dim 65 | elif embedding == 'sskip': 66 | embedd_dim = -1 67 | embedd_dict = OrderedDict() 68 | with gzip.open(embedding_path, 'rt') as file: 69 | # skip the first line 70 | file.readline() 71 | for line in file: 72 | line = line.strip() 73 | try: 74 | if len(line) == 0: 75 | continue 76 | 77 | tokens = line.split() 78 | if len(tokens) < embedd_dim: 79 | continue 80 | 81 | if embedd_dim < 0: 82 | embedd_dim = len(tokens) - 1 83 | 84 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 85 | start = len(tokens) - embedd_dim 86 | word = ' '.join(tokens[0:start]) 87 | embedd[:] = tokens[start:] 88 | word = DIGIT_RE.sub("0", word) if normalize_digits else word 89 | embedd_dict[word] = embedd 90 | except UnicodeDecodeError: 91 | continue 92 | return embedd_dict, embedd_dim 93 | elif embedding == 'polyglot': 94 | words, embeddings = pickle.load(open(embedding_path, 'rb'), encoding='latin1') 95 | _, embedd_dim = embeddings.shape 96 | embedd_dict = OrderedDict() 97 | for i, word in enumerate(words): 98 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 99 | embedd[:] = embeddings[i, :] 100 | word = DIGIT_RE.sub("0", word) if normalize_digits else word 101 | embedd_dict[word] = embedd 102 | return embedd_dict, embedd_dim 103 | 104 | else: 105 | raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]") 106 | --------------------------------------------------------------------------------