├── .gitignore
├── LICENSE
├── README.md
├── SECURITY.md
├── conlleval.pl
├── data_prep
│   ├── bio_dataset.py
│   └── multi_lingual_amazon.py
├── data_processing_scripts
│   └── amazon
│       ├── pickle_dataset.py
│       └── process_dataset.py
├── layers.py
├── models.py
├── options.py
├── scripts
│   ├── get_overall_perf_amazon.py
│   ├── get_overall_perf_ner.py
│   ├── train_amazon_3to1.sh
│   └── train_conll_ner_3to1.sh
├── train_cls_man_moe.py
├── train_tagging_man_moe.py
├── utils.py
└── vocab.py
/.gitignore: -------------------------------------------------------------------------------- 1 | save/ 2 | data/ 3 | logs/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zero-Resource Multilingual Model Transfer 2 | 3 | This repo contains the source code for our ACL 2019 paper: 4 | 5 | [**Multi-Source Cross-Lingual Model Transfer: Learning What to Share**](https://www.aclweb.org/anthology/P19-1299) 6 |
7 | [Xilun Chen](http://www.cs.cornell.edu/~xlchen/), 8 | [Ahmed Hassan Awadallah](https://www.microsoft.com/en-us/research/people/hassanam/), 9 | [Hany Hassan](https://www.microsoft.com/en-us/research/people/hanyh/), 10 | [Wei Wang](https://www.microsoft.com/en-us/research/people/wawe/), 11 | [Claire Cardie](http://www.cs.cornell.edu/home/cardie/) 12 |
13 | The 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019) 14 |
15 | [paper](https://www.aclweb.org/anthology/P19-1299), 16 | [arXiv](https://arxiv.org/abs/1810.03552), 17 | [bibtex](https://www.aclweb.org/anthology/P19-1299.bib) 18 | 19 | ## Introduction 20 | Modern NLP applications have enjoyed a great boost from neural network models. Such deep neural models, however, are not applicable to most human languages due to the lack of annotated training data for various NLP tasks. Cross-lingual transfer learning (CLTL) is a viable method for building NLP models for a low-resource target language by leveraging labeled data from other (source) languages. In this work, we focus on the multilingual transfer setting where training data in multiple source languages is leveraged to further boost target language performance. 21 | 22 | Unlike most existing methods that rely only on language-invariant features for CLTL, our approach coherently utilizes both **language-invariant** and **language-specific** features at the instance level. Our model leverages adversarial networks to learn language-invariant features, and mixture-of-experts models to dynamically exploit the similarity between the target language and each individual source language. This enables our model to learn effectively what to share between various languages in the multilingual setup. Moreover, when coupled with unsupervised multilingual embeddings, our model can operate in a **zero-resource** setting where neither **target language training data** nor **cross-lingual resources** (e.g. parallel corpora or Machine Translation systems) are available. Our model achieves significant performance gains over prior art, as shown in an extensive set of experiments over multiple text classification and sequence tagging tasks including a large-scale industry dataset. 23 |
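To make the mixture-of-experts component concrete, here is a minimal sketch of instance-level expert mixing (illustrative only: the class name, the linear experts, and the sizes are made up; the actual modules live in `models.py`):

```python
import torch
from torch import nn

class MoESketch(nn.Module):
    """Toy instance-level mixture-of-experts: one expert per source
    language, mixed by a gate conditioned on each input example."""
    def __init__(self, input_size, hidden_size, num_experts):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Linear(input_size, hidden_size) for _ in range(num_experts)])
        self.gate = nn.Linear(input_size, num_experts)

    def forward(self, x):
        # x: (batch, input_size) shared, language-invariant features
        gate_weights = torch.softmax(self.gate(x), dim=-1)              # (batch, E)
        expert_outs = torch.stack([e(x) for e in self.experts], dim=1)  # (batch, E, H)
        # per-instance weighted combination over the experts -> (batch, H)
        return (gate_weights.unsqueeze(-1) * expert_outs).sum(dim=1)
```

The gate plays the role described above: for each instance it decides how much weight to place on each source-language expert, rather than sharing one fixed combination across all inputs.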
24 | ## Requirements 25 | - Python 3.6 26 | - PyTorch 0.4 27 | - PyTorchNet (for confusion matrix) 28 | - tqdm (for progress bar) 29 | 30 | 31 | ## File Structure 32 | ``` 33 | . 34 | ├── LICENSE 35 | ├── README.md 36 | ├── conlleval.pl (official CoNLL evaluation script) 37 | ├── data_prep (data processing scripts) 38 | │   ├── bio_dataset.py (processing the CoNLL dataset) 39 | │   └── multi_lingual_amazon.py (processing the Amazon Review dataset) 40 | ├── data_processing_scripts (auxiliary scripts for dataset pre-processing) 41 | │   └── amazon 42 | │       ├── pickle_dataset.py 43 | │       └── process_dataset.py 44 | ├── layers.py (lower-level helper modules) 45 | ├── models.py (higher-level modules) 46 | ├── options.py (hyper-parameters, a.k.a. all the knobs you may want to turn) 47 | ├── scripts (scripts for training and evaluating the models) 48 | │   ├── get_overall_perf_amazon.py (evaluation script for Amazon Reviews) 49 | │   ├── get_overall_perf_ner.py (evaluation script for CoNLL NER) 50 | │   ├── train_amazon_3to1.sh (training script for Amazon Reviews) 51 | │   └── train_conll_ner_3to1.sh (training script for CoNLL NER) 52 | ├── train_cls_man_moe.py (training code for text classification) 53 | ├── train_tagging_man_moe.py (training code for sequence tagging) 54 | ├── utils.py (helper functions) 55 | └── vocab.py (building the vocabulary) 56 | ``` 57 | 58 | 59 | ## Dataset 60 | The CoNLL [2002](https://www.clips.uantwerpen.be/conll2002/ner/), [2003](https://www.clips.uantwerpen.be/conll2003/ner/) and [Amazon](https://webis.de/data/webis-cls-10.html) datasets, as well as the multilingual word embeddings ([MUSE](https://github.com/facebookresearch/MUSE), [VecMap](https://github.com/artetxem/vecmap), [UMWE](https://github.com/ccsasuke/umwe)), are all publicly available online. 61 | 62 | ## Run Experiments 63 | 64 | ### CoNLL Named Entity Recognition 65 | ```bash 66 | ./scripts/train_conll_ner_3to1.sh {exp_name} 67 | ``` 68 | 69 | The following script prints out compiled dev/test F1 scores for all languages (a concrete end-to-end example is given at the end of this README): 70 | 71 | ```bash 72 | python scripts/get_overall_perf_ner.py save {exp_name} 73 | ``` 74 | 75 | ### Multilingual Amazon Reviews 76 | ```bash 77 | ./scripts/train_amazon_3to1.sh {exp_name} 78 | ``` 79 | 80 | The following script prints out compiled dev/test F1 scores for all languages: 81 | 82 | ```bash 83 | python scripts/get_overall_perf_amazon.py save {exp_name} 84 | ``` 85 | 86 | ## Citation 87 | 88 | If you find this project useful for your research, please kindly cite our ACL 2019 paper: 89 | 90 | ```bibtex 91 | @InProceedings{chen-etal-acl2019-multi-source, 92 | author = {Chen, Xilun and Hassan Awadallah, Ahmed and Hassan, Hany and Wang, Wei and Cardie, Claire}, 93 | title = {Multi-Source Cross-Lingual Model Transfer: Learning What to Share}, 94 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", 95 | month = jul, 96 | year = "2019", 97 | address = "Florence, Italy", 98 | publisher = "Association for Computational Linguistics", 99 | } 100 | ``` 101 | 102 | ## Contributing 103 | 104 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 105 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 106 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 107 | 108 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 109 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 110 | provided by the bot. You will only need to do this once across all repos using our CLA. 111 | 112 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 113 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 114 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
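For reference, a complete NER run looks like this (`myexp` is an arbitrary experiment name; `save` is, per the commands above, the output directory scanned by the evaluation script):

```bash
./scripts/train_conll_ner_3to1.sh myexp
python scripts/get_overall_perf_ner.py save myexp

# Tagged predictions can also be scored directly with the bundled evaluator,
# which reads lines whose final two fields are the gold tag and the predicted
# tag (predictions.txt is a placeholder name):
perl conlleval.pl < predictions.txt
```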
115 | 116 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /conlleval.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | 
die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while (<STDIN>) { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /data_prep/bio_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
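# Input format sketch (inferred from the parsing code below, which reads
# split()[0] as the word and split()[-1] as the tag, following the CoNLL
# convention of blank lines between sentences): one token per line with
# whitespace-separated fields, any middle columns ignored, e.g.
#
#   EU NNP B-ORG
#   rejects VBZ O
#   German JJ B-MISC
#
#   Peter NNP B-PER
#   Blackburn NNP I-PER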
3 | 4 | import copy 5 | import os 6 | import random 7 | 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | from options import opt 12 | from utils import read_bio_samples 13 | 14 | class BioDataset(Dataset): 15 | """ 16 | tag_vocab is updated 17 | vocab is updated if update_vocab is True 18 | self.raw_X, self.raw_Y: list of lists of strings 19 | self.X: list of dict {'words': list of int, 'chars': list of list of int} 20 | self.Y: list of lists of integers 21 | """ 22 | def __init__(self, input_file, vocab, char_vocab, tag_vocab, 23 | update_vocab, encoding='utf-8', remove_empty=False): 24 | self.raw_X = [] 25 | self.raw_Y = [] 26 | with open(input_file, encoding=encoding) as inf: 27 | for lines in read_bio_samples(inf): 28 | x = [l.split()[0] for l in lines] 29 | y = [l.split()[-1] for l in lines] 30 | if remove_empty: 31 | if all([l=='O' for l in y]): 32 | continue 33 | self.raw_X.append(x) 34 | self.raw_Y.append(y) 35 | if vocab and update_vocab: 36 | for w in x: 37 | vocab.add_word(w) 38 | if char_vocab and update_vocab: 39 | for w in x: 40 | for ch in w: 41 | char_vocab.add_word(ch, normalize=opt.lowercase_char) 42 | tag_vocab.add_tags(y) 43 | self.X = [] 44 | self.Y = [] 45 | for xs, ys in zip(self.raw_X, self.raw_Y): 46 | x = {} 47 | if vocab: 48 | x['words'] = [vocab.lookup(w) for w in xs] 49 | if char_vocab: 50 | x['chars'] = [[char_vocab.lookup(ch, normalize=opt.lowercase_char) for ch in w] for w in xs] 51 | self.X.append(x) 52 | self.Y.append([tag_vocab.lookup(y) for y in ys]) 53 | assert len(self.X) == len(self.Y) 54 | 55 | def __len__(self): 56 | return len(self.Y) 57 | 58 | def __getitem__(self, idx): 59 | return (self.X[idx], self.Y[idx]) 60 | 61 | def get_subset(self, num_samples, shuffle=True): 62 | subset = copy.copy(self) 63 | if shuffle: 64 | idx = random.sample(range(len(self)), num_samples) 65 | else: 66 | idx = list(range(min(len(self), num_samples))) 67 | subset.raw_X = [subset.raw_X[i] for i in idx] 68 | subset.raw_Y = [subset.raw_Y[i] for i in idx] 69 | subset.X = [subset.X[i] for i in idx] 70 | subset.Y = [subset.Y[i] for i in idx] 71 | return subset 72 | 73 | class ConllDataset(BioDataset): 74 | def __init__(self, input_file, vocab, char_vocab, tag_vocab, 75 | update_vocab, remove_empty=False): 76 | if (input_file[17:20] == 'esp') or (input_file[17:20] == 'ned'): 77 | encoding = 'ISO-8859-1' 78 | else: 79 | encoding = 'utf-8' 80 | super().__init__(input_file, vocab, char_vocab, tag_vocab, 81 | update_vocab, encoding, remove_empty=remove_empty) 82 | 83 | 84 | def get_conll_ner_datasets(vocab, char_vocab, tag_vocab, data_dir, lang): 85 | print(f'Loading CoNLL NER data for {lang} Language..') 86 | train_set = ConllDataset(os.path.join(data_dir, f'{lang}.train'), 87 | vocab, char_vocab, tag_vocab, update_vocab=True, remove_empty=opt.remove_empty_samples) 88 | dev_set = ConllDataset(os.path.join(data_dir, f'{lang}.dev'), 89 | vocab, char_vocab, tag_vocab, update_vocab=True) 90 | test_set = ConllDataset(os.path.join(data_dir, f'{lang}.test'), 91 | vocab, char_vocab, tag_vocab, update_vocab=True) 92 | return train_set, dev_set, test_set, train_set 93 | 94 | 95 | def get_train_on_translation_conll_ner_datasets(vocab, char_vocab, tag_vocab, data_dir, lang): 96 | print(f'Loading Train-on-Translation CoNLL NER data for {lang} Language..') 97 | train_set = ConllDataset(os.path.join(data_dir, f'eng2{lang}_{lang}.train'), 98 | vocab, char_vocab, tag_vocab, update_vocab=True) 99 | dev_set = ConllDataset(os.path.join(data_dir, f'{lang}.dev'), 100 | vocab, char_vocab,
tag_vocab, update_vocab=True) 101 | test_set = ConllDataset(os.path.join(data_dir, f'{lang}.test'), 102 | vocab, char_vocab, tag_vocab, update_vocab=False) 103 | return train_set, dev_set, test_set, train_set 104 | 105 | 106 | def get_test_on_translation_conll_ner_datasets(vocab, char_vocab, tag_vocab, data_dir, lang): 107 | print(f'Loading Test-on-Translation CoNLL NER data for {lang} Language..') 108 | train_set = ConllDataset(os.path.join(data_dir, f'eng.train'), 109 | vocab, char_vocab, tag_vocab, update_vocab=True) 110 | dev_set = ConllDataset(os.path.join(data_dir, f'eng2{lang}_eng.dev'), 111 | vocab, char_vocab, tag_vocab, update_vocab=True) 112 | test_set = ConllDataset(os.path.join(data_dir, f'eng2{lang}_eng.test'), 113 | vocab, char_vocab, tag_vocab, update_vocab=False) 114 | return train_set, dev_set, test_set, train_set 115 | -------------------------------------------------------------------------------- /data_prep/multi_lingual_amazon.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 3 | 4 | import glob 5 | import os 6 | import pickle 7 | import random 8 | 9 | import torch 10 | from torch.utils.data import Dataset 11 | 12 | from options import opt 13 | 14 | 15 | class MultiLangAmazonDataset(Dataset): 16 | """ 17 | vocab is updated if update_vocab is True 18 | self.raw_X: list of lists of strings 19 | self.X, self.Y: list of lists of integers 20 | """ 21 | num_labels = 2 22 | def __init__(self, input_file=None, vocab=None, char_vocab=None, max_seq_len=0, num_train_lines=0, update_vocab=False): 23 | self.raw_X = [] 24 | self.X = [] 25 | self.Y = [] 26 | if input_file is None: 27 | return 28 | if input_file.endswith('.pkl'): 29 | with open(input_file, 'rb') as inf: 30 | reviews = pickle.load(inf) 31 | self.raw_X = reviews['X'] 32 | self.Y = reviews['Y'] 33 | if num_train_lines > 0: 34 | self.raw_X = self.raw_X[:num_train_lines] 35 | self.Y = self.Y[:num_train_lines] 36 | for x in self.raw_X: 37 | if max_seq_len > 0: 38 | x = x[:max_seq_len] 39 | if update_vocab: 40 | for w in x: 41 | vocab.add_word(w) 42 | if char_vocab and update_vocab: 43 | for w in x: 44 | for ch in w: 45 | char_vocab.add_word(ch, normalize=opt.lowercase_char) 46 | sample = {} 47 | if vocab: 48 | sample['words'] = [vocab.lookup(w) for w in x] 49 | if char_vocab: 50 | sample['chars'] = [[char_vocab.lookup(ch, 51 | normalize=opt.lowercase_char) for ch in w] for w in x] 52 | self.X.append(sample) 53 | else: 54 | with open(input_file) as inf: 55 | for cnt, line in enumerate(inf): 56 | parts = line.split('\t') 57 | assert len(parts) == 2, f"Incorrect format {line}" 58 | x = parts[1].rstrip().split(' ') 59 | self.raw_X.append(x) 60 | if max_seq_len > 0: 61 | x = x[:max_seq_len] 62 | if update_vocab: 63 | for w in x: 64 | vocab.add_word(w) 65 | if char_vocab and update_vocab: 66 | for w in x: 67 | for ch in w: 68 | char_vocab.add_word(ch, normalize=opt.lowercase_char) 69 | sample = {} 70 | if vocab: 71 | sample['words'] = [vocab.lookup(w) for w in x] 72 | if char_vocab: 73 | sample['chars'] = [[char_vocab.lookup(ch, 74 | normalize=opt.lowercase_char) for ch in w] for w in x] 75 | self.X.append(sample) 76 | self.Y.append(int(parts[0])) 77 | if num_train_lines > 0 and cnt+1 >= num_train_lines: 78 | break 79 | print(f"Loaded {cnt+1} samples") 80 | assert len(self.X) == len(self.Y) 81 | 82 | def __len__(self): 83 | return len(self.Y) 84 | 85 | def __getitem__(self, idx): 86 | return 
(self.X[idx], self.Y[idx]) 87 | 88 | def get_subset(self, num_samples, shuffle=True): 89 | subset = MultiLangAmazonDataset() 90 | if shuffle: 91 | idx = random.sample(range(len(self)), num_samples) 92 | else: 93 | idx = list(range(min(len(self), num_samples))) 94 | subset.raw_X = [self.raw_X[i] for i in idx] 95 | # (labels live only in self.Y; this class has no raw_Y attribute) 96 | subset.X = [self.X[i] for i in idx] 97 | subset.Y = [self.Y[i] for i in idx] 98 | return subset 99 | 100 | def train_dev_split(self, dev_samples, shuffle=True): 101 | devset = MultiLangAmazonDataset() 102 | idx = list(range(len(self))) 103 | if shuffle: 104 | random.shuffle(idx) 105 | # dev set 106 | dev_idx = idx[:dev_samples] 107 | devset.raw_X = [self.raw_X[i] for i in dev_idx] 108 | devset.X = [self.X[i] for i in dev_idx] 109 | devset.Y = [self.Y[i] for i in dev_idx] 110 | # update train set 111 | train_idx = idx[dev_samples:] 112 | self.raw_X = [self.raw_X[i] for i in train_idx] 113 | self.X = [self.X[i] for i in train_idx] 114 | self.Y = [self.Y[i] for i in train_idx] 115 | self.max_seq_len = None 116 | return devset 117 | 118 | def get_max_seq_len(self): 119 | if not hasattr(self, 'max_seq_len') or self.max_seq_len is None: 120 | self.max_seq_len = max([len(x) for x in self.X]) 121 | return self.max_seq_len 122 | 123 | 124 | def get_multi_lingual_amazon_datasets(vocab, char_vocab, root_dir, domain, lang, max_seq_len): 125 | print(f'Loading Multi-Lingual Amazon Review data for {domain} Domain and {lang} Language..') 126 | data_dir = os.path.join(root_dir, lang, domain) 127 | 128 | train_set = MultiLangAmazonDataset(os.path.join(data_dir, 'train.pkl'), 129 | vocab, char_vocab, max_seq_len, 0, update_vocab=True) 130 | # train-dev split 131 | dev_set = train_set.train_dev_split(400, shuffle=True) 132 | test_set = MultiLangAmazonDataset(os.path.join(data_dir, 'test.pkl'), 133 | vocab, char_vocab, max_seq_len, 0, update_vocab=True) 134 | unlabeled_set = MultiLangAmazonDataset(os.path.join(data_dir, 'unlabeled.pkl'), 135 | vocab, char_vocab, max_seq_len, 50000, update_vocab=True) 136 | return train_set, dev_set, test_set, unlabeled_set 137 | -------------------------------------------------------------------------------- /data_processing_scripts/amazon/pickle_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | 6 | def process_all_data(root_dir, out_dir):  # NB: reads {split}.tok.txt from out_dir, writes {split}.pkl under root_dir 7 | for lang in ['en', 'de', 'fr', 'ja']: 8 | for domain in ['books', 'dvd', 'music']: 9 | for split in ['train', 'test', 'unlabeled']: 10 | fn = os.path.join(out_dir, lang, domain, f'{split}.tok.txt') 11 | ofn = os.path.join(root_dir, lang, domain, f'{split}.pkl') 12 | with open(fn) as inf, open(ofn, 'wb') as ouf: 13 | reviews = {'X': [], 'Y': []} 14 | print(f"Processing file: {fn}") 15 | for line in inf: 16 | parts = line.split('\t') 17 | x = parts[1].rstrip().split(' ') 18 | y = int(parts[0]) 19 | reviews['X'].append(x) 20 | reviews['Y'].append(y) 21 | pickle.dump(reviews, ouf) 22 | 23 | 24 | if __name__ == '__main__': 25 | process_all_data(sys.argv[1], sys.argv[2]) 26 | -------------------------------------------------------------------------------- /data_processing_scripts/amazon/process_dataset.py: -------------------------------------------------------------------------------- 1 | # Usage: python process_dataset.py dataset_dir output_dir 2 | 3 | import sys 4 | import os 5 | import gzip 6 | from xml.etree.cElementTree import iterparse,tostring 7 | import traceback 8 |
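# Dependency note (assumptions about the environment, not part of the script
# itself): `stanfordcorenlp` is the Python wrapper around a local Stanford
# CoreNLP distribution (CORENLP_DIR below) and requires a Java runtime;
# `tinysegmenter` is a pure-Python segmenter used for Japanese, which the
# CoreNLP setup here does not tokenize.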
from stanfordcorenlp import StanfordCoreNLP as corenlp 10 | import tinysegmenter 11 | 12 | 13 | # CoreNLP for tokenizing the translation output 14 | CORENLP_DIR = "./stanford-corenlp-full-2018-02-27" 15 | nlp = {} 16 | for lang in ['en', 'de', 'fr']: 17 | nlp[lang] = corenlp(CORENLP_DIR, lang=lang) 18 | 19 | 20 | def parse(itemfile): 21 | for event, elem in iterparse(itemfile): 22 | if elem.tag == "item": 23 | yield processItem(elem) 24 | elem.clear() 25 | 26 | 27 | def processItem(item): 28 | """ Process a review. 29 | Implement custom code here. Use 'item.find('tagname').text' to access the properties of a review. 30 | """ 31 | review = {} 32 | # review.category = item.find("category").text 33 | review['rating'] = int(float(item.find("rating").text)) 34 | # review.asin = item.find("asin").text 35 | # review.date = item.find("date").text 36 | review['text'] = item.find("text").text 37 | # review.summary = item.find("summary").text 38 | return review 39 | 40 | 41 | def process_all_data(root_dir, out_dir): 42 | for lang in ['en', 'de', 'fr', 'ja']: 43 | for domain in ['books', 'dvd', 'music']: 44 | for split in ['train', 'test', 'unlabeled']: 45 | fn = os.path.join(root_dir, lang, domain, f'{split}.review') 46 | ofn = os.path.join(out_dir, lang, domain, f'{split}.tok.txt') 47 | with open(fn) as inf, open(ofn, 'w') as ouf: 48 | print(f"Processing file: {fn}") 49 | for review in parse(inf): 50 | # binarize label 51 | label = 1 if review['rating'] > 3 else 0 52 | try: 53 | # remove line breaks 54 | raw_text = review['text'].replace('\n', ' ').replace('\t', ' ') 55 | if lang == 'ja': 56 | tok_text = tinysegmenter.tokenize(raw_text) 57 | else: 58 | tok_text = nlp[lang].word_tokenize(raw_text) 59 | except: 60 | print("Exception tokenizing", review) 61 | continue 62 | print(f"{label}\t{' '.join(tok_text)}", file=ouf) 63 | 64 | 65 | if __name__ == "__main__": 66 | process_all_data(sys.argv[1], sys.argv[2]) 67 | 68 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT License. 
3 | 4 | import torch 5 | from torch import autograd, nn 6 | import torch.nn.functional as functional 7 | 8 | import utils 9 | 10 | class AveragingLayer(nn.Module): 11 | def __init__(self, word_emb): 12 | super(AveragingLayer, self).__init__() 13 | self.word_emb = word_emb 14 | 15 | def forward(self, input): 16 | """ 17 | input: (data, lengths): (IntTensor(batch_size, max_sent_len), IntTensor(batch_size)) 18 | """ 19 | data, lengths = input 20 | data = autograd.Variable(data) 21 | lengths = autograd.Variable(lengths) 22 | embeds = self.word_emb(data) 23 | X = embeds.sum(1).squeeze(1) 24 | lengths = lengths.view(-1, 1).expand_as(X) 25 | return torch.div(X, lengths.float()) 26 | 27 | 28 | class SummingLayer(nn.Module): 29 | def __init__(self, word_emb): 30 | super(SummingLayer, self).__init__() 31 | self.word_emb = word_emb 32 | 33 | def forward(self, input): 34 | """ 35 | input: (data, lengths): (IntTensor(batch_size, max_sent_len), IntTensor(batch_size)) 36 | """ 37 | data, _ = input 38 | data = autograd.Variable(data) 39 | embeds = self.word_emb(data) 40 | X = embeds.sum(1).squeeze() 41 | return X 42 | 43 | 44 | class DotAttentionLayer(nn.Module): 45 | def __init__(self, hidden_size): 46 | super(DotAttentionLayer, self).__init__() 47 | self.hidden_size = hidden_size 48 | self.W = nn.Linear(hidden_size, 1, bias=False) 49 | 50 | def forward(self, input): 51 | """ 52 | input: (unpacked_padded_output: batch_size x seq_len x hidden_size, lengths: batch_size) 53 | """ 54 | inputs, lengths = input 55 | batch_size, max_len, _ = inputs.size() 56 | flat_input = inputs.contiguous().view(-1, self.hidden_size) 57 | logits = self.W(flat_input).view(batch_size, max_len) 58 | alphas = functional.softmax(logits, dim=-1) 59 | 60 | # computing mask 61 | idxes = torch.arange(0, max_len, out=torch.LongTensor(max_len)) 62 | mask = (idxes
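The listing breaks off inside `DotAttentionLayer.forward`, right where the padding mask is built from `idxes` and `lengths`. For completeness, here is a self-contained sketch of that masking idiom (the standard pattern for attention pooling over padded batches; not necessarily the file's exact continuation):

```python
import torch
import torch.nn.functional as F
from torch import nn

class MaskedDotAttention(nn.Module):
    """Attention pooling over padded sequences: score each timestep,
    mask out positions past each sequence's true length, and return
    the attention-weighted sum of the hidden states."""
    def __init__(self, hidden_size):
        super().__init__()
        self.W = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, inputs, lengths):
        # inputs: (batch, max_len, hidden); lengths: (batch,)
        max_len = inputs.size(1)
        logits = self.W(inputs).squeeze(-1)                    # (batch, max_len)
        idxes = torch.arange(max_len, device=inputs.device)    # (max_len,)
        mask = idxes.unsqueeze(0) < lengths.unsqueeze(1)       # (batch, max_len)
        logits = logits.masked_fill(mask == 0, float('-inf'))  # ignore padding
        alphas = F.softmax(logits, dim=-1)
        # weighted sum over time -> (batch, hidden)
        return torch.bmm(alphas.unsqueeze(1), inputs).squeeze(1)
```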