├── CONTRIBUTING.md ├── DUC ├── eval.sh ├── make_DUC.py ├── make_rouge.py └── setup.sh ├── LICENSE ├── PATENTS ├── README.md ├── construct_data.sh ├── dataset ├── filter.py ├── make_dict.py ├── process_agiga.py ├── pull.py ├── small_train.splits ├── test.splits ├── train.splits └── valid.splits ├── prep_torch_data.sh ├── summary ├── beam_search.lua ├── build.lua ├── build_dict.lua ├── data.lua ├── encoder.lua ├── features.lua ├── nnlm.lua ├── run.lua ├── train.lua └── util.lua ├── test_model.sh ├── train_model.sh └── tuning ├── SDecoder_cfg.txt ├── SDecoder_cmd.tpl ├── SDecoder_test.py ├── ZMERT_cfg.txt └── params.txt /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Neural Attention Model for Abstractive Summarization software 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | ... (in particular how this is synced with internal changes to the project) 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `master`. 12 | 2. If you've added code that should be tested, add tests 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 2 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * ... 36 | 37 | ## License 38 | By contributing to Neural Attention Model for Abstractive Summarization, you agree that your contributions will be licensed 39 | under its BSD license. 40 | -------------------------------------------------------------------------------- /DUC/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $1 4 | rm -fr $1/tmp_GOLD 5 | rm -fr $1/tmp_SYSTEM 6 | rm -fr $1/tmp_OUTPUT 7 | mkdir -p $1/tmp_GOLD 8 | mkdir -p $1/tmp_SYSTEM 9 | 10 | python $ABS/DUC/make_rouge.py --base $1 --gold tmp_GOLD --system tmp_SYSTEM --input input.txt 11 | perl $ABS/DUC/prepare4rouge-simple.pl tmp_SYSTEM tmp_GOLD tmp_OUTPUT 12 | 13 | cd tmp_OUTPUT 14 | export PERL5LIB=/data/users/sashar/summary/duc/RELEASE-1.5.5/ 15 | 16 | echo "FULL LENGTH" 17 | perl $ROUGE/ROUGE-1.5.5.pl -m -n 2 -w 1.2 -e $ROUGE -a settings.xml 18 | 19 | 20 | echo "LIMITED LENGTH" 21 | perl $ROUGE/ROUGE-1.5.5.pl -m -b 75 -n 2 -w 1.2 -e $ROUGE -a settings.xml 22 | -------------------------------------------------------------------------------- /DUC/make_DUC.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """Construct the DUC test set. """ 14 | 15 | import sys 16 | import argparse 17 | import glob 18 | import re 19 | import nltk.data 20 | from nltk.tokenize.treebank import TreebankWordTokenizer 21 | #@lint-avoid-python-3-compatibility-imports 22 | 23 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 24 | tokenizer = TreebankWordTokenizer() 25 | def main(arguments): 26 | 27 | parser = argparse.ArgumentParser(description=__doc__, 28 | formatter_class= 29 | argparse.RawDescriptionHelpFormatter) 30 | parser.add_argument('--sum_docs', help="Article directory.", type=str) 31 | parser.add_argument('--year', help="DUC year to process.", type=str) 32 | parser.add_argument('--result_docs', help="Reference directory.", type=str) 33 | parser.add_argument('--ref_dir', 34 | help="Directory to output the references.", type=str) 35 | parser.add_argument('--sys_dir', 36 | help="Directory to output the references.", type=str) 37 | parser.add_argument('--article_file', 38 | help="File to output the article sentences..", type=str) 39 | args = parser.parse_args(arguments) 40 | 41 | refs = [open("{0}/task1_ref{1}.txt".format(args.ref_dir, i), "w") 42 | for i in range(4)] 43 | article = open(args.article_file, "w") 44 | prefix = open(args.sys_dir + "/task1_prefix.txt", "w") 45 | if args.year == "2003": 46 | files = glob.glob("{0}/*/*".format(args.sum_docs)) 47 | else: 48 | files = glob.glob("{0}/*/*".format(args.sum_docs)) 49 | files.sort() 50 | for f in files: 51 | docset = f.split("/")[-2][:-1].upper() 52 | name = f.split("/")[-1].upper() 53 | 54 | # Find references. 55 | if args.year == "2003": 56 | matches = list(glob.glob("{0}/{1}*.10.*{2}*".format( 57 | args.result_docs, docset, name))) 58 | else: 59 | matches = list(glob.glob("{0}/{1}*{2}*".format( 60 | args.result_docs, docset, name))) 61 | matches.sort() 62 | assert len(matches) == 4, matches 63 | for i, m in enumerate(matches): 64 | print >>refs[i], open(m).read().strip() 65 | 66 | # Make input. 67 | mode = 0 68 | text = "" 69 | for l in open(f): 70 | if l.strip() in ["
<P>", "</P>"]: 71 | continue 72 | if mode == 1 and l.strip() != "</TEXT>
": 73 | text += l.strip() + " " 74 | if l.strip() == "": 75 | mode = 1 76 | text = " ".join([w for w in text.split() if w[0] != "&"]) 77 | 78 | sents = sent_detector.tokenize(text) 79 | if len(sents) == 0: 80 | print >>article 81 | print >>prefix 82 | continue 83 | first = sents[0] 84 | 85 | # If the sentence is too short, add the second as well. 86 | if len(sents[0]) < 130 and len(sents) > 1: 87 | first = first.strip()[:-1] + " , " + sents[1] 88 | 89 | first = " ".join(tokenizer.tokenize(first.lower())) 90 | if ")" in first or ("_" in first and args.year == "2003"): 91 | first = re.split(" ((--)|-|_) ", first, 1)[-1] 92 | first = first.replace("(", "-lrb-") \ 93 | .replace(")", "-rrb-").replace("_", ",") 94 | print >>article, first 95 | print >>prefix, first[:75] 96 | if __name__ == '__main__': 97 | sys.exit(main(sys.argv[1:])) 98 | -------------------------------------------------------------------------------- /DUC/make_rouge.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """Prep ROUGE eval. """ 14 | 15 | import sys 16 | import glob 17 | import os 18 | import argparse 19 | import itertools 20 | #@lint-avoid-python-3-compatibility-imports 21 | 22 | parser = argparse.ArgumentParser(description=__doc__, 23 | formatter_class= 24 | argparse.RawDescriptionHelpFormatter) 25 | parser.add_argument('--base', help="Base directory.", type=str) 26 | parser.add_argument('--gold', help="Base directory.", type=str) 27 | parser.add_argument('--system', help="Base directory.", type=str) 28 | parser.add_argument('--input', help="Input text.", type=str) 29 | 30 | args = parser.parse_args(sys.argv[1:]) 31 | 32 | for f in glob.glob("{0}/references/*".format(args.base)): 33 | task, ref = f.split("/")[-1].split("_") 34 | ref = int(ref.split(".")[0][-1]) 35 | 36 | for i, l in enumerate(open(f)): 37 | os.system("mkdir -p %s/%s%04d"%(args.gold, task, i)) 38 | with open("%s/%s%04d/%s%04d.%04d.gold" % (args.gold, task, i, task, i, ref), "w") as out: 39 | print >>out, l.strip() 40 | 41 | 42 | for f in glob.glob("{0}/system/*".format(args.base)): 43 | task, ref = f.split("/")[-1].split("_", 1) 44 | #if ref.startswith("ducsystem"): continue 45 | system = ref.split(".")[0] 46 | os.system("mkdir -p %s/%s"%(args.system, system)) 47 | for i, (l, input_line) in enumerate(itertools.izip(open(f), open(args.input))): 48 | words = [] 49 | numbers = dict([(len(w), w) for w in input_line.strip().split() if w[0].isdigit()]) 50 | for w in l.strip().split(): 51 | # Replace # with numbers from the input. 
52 | if w[0] == "#" and len(w) in numbers: 53 | words.append(numbers[len(w)]) 54 | elif w == "": 55 | continue 56 | else: 57 | words.append(w) 58 | 59 | with open("%s/%s/%s%04d.%s.system" % (args.system, system, task, i, system),"w") as out: 60 | if words: 61 | print >>out, " ".join(words) 62 | else: 63 | print >>out, "fail" 64 | -------------------------------------------------------------------------------- /DUC/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Step 1: Extracting DUC files" 4 | cd $1 5 | tar xvf DUC2003_Summarization_Documents.tgz 6 | tar xvf DUC2004_Summarization_Documents.tgz 7 | tar xvf duc2004_results.tgz 8 | tar xvf detagged.duc2003.abstracts.tar.gz 9 | 10 | cd duc2004_results/ROUGE/; tar xvf duc2004.task1.ROUGE.models.tar.gz 11 | cd $1 12 | cd DUC2003_Summarization_Documents/duc2003_testdata/task1/; tar xvf task1.docs.tar.gz 13 | 14 | 15 | echo "Step 2: Make reference files." 16 | cd $1 17 | mkdir $1/clean_2004/ 18 | mkdir $1/clean_2004/references 19 | mkdir $1/clean_2004/system 20 | python $ABS/DUC/make_DUC.py --result_docs duc2004_results/ROUGE/eval/models/1/ \ 21 | --sum_docs DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs/ \ 22 | --ref_dir clean_2004/references --year 2004 --article_file clean_2004/input.txt \ 23 | --sys_dir clean_2004/system 24 | 25 | mkdir $1/clean_2003/ 26 | mkdir $1/clean_2003/references 27 | mkdir $1/clean_2003/system 28 | python $ABS/DUC/make_DUC.py --result_docs detagged.duc2003.abstracts/models/ \ 29 | --sum_docs DUC2003_Summarization_Documents/duc2003_testdata/task1/docs.without.headlines/ \ 30 | --ref_dir clean_2003/references --year 2003 --article_file clean_2003/input.txt \ 31 | --sys_dir clean_2003/system 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Neural Attention Model for Abstractive Summarization software 4 | 5 | Copyright (c) 2015-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the Neural Attention Model for Abstractive Summarization software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Attention-Based Summarization 2 | 3 | This project contains the Abs. neural abstractive summarization system from the paper 4 | 5 | A Neural Attention Model for Abstractive Summarization. 6 | Alexander M. Rush, Sumit Chopra, Jason Weston. 
7 | 8 | The release includes code for: 9 | 10 | * Extracting the summarization data set 11 | * Training the neural summarization model 12 | * Constructing evaluation sets with ROUGE 13 | * Tuning extractive features 14 | 15 | ## Setup 16 | 17 | To run the system, you will need to have [Torch7](http://torch.ch/)) 18 | and [fbcunn](https://github.com/facebook/fbcunn) (Facebook's deep 19 | learning library) installed. You will also need Python 2.7, NLTK, and 20 | GNU Parallel to run the data processing scripts. Additionally the 21 | code currently requires a CUDA GPU for training and decoding. 22 | 23 | Finally the scripts require that you set the $ABS environment variable. 24 | 25 | > export ABS=$PWD 26 | > export LUA_PATH="$LUA_PATH;$ABS/?.lua" 27 | 28 | ## Constructing the Data Set 29 | 30 | The model is trained to perform title generation from the first line 31 | of newspaper articles. Since the system is completely data-driven it 32 | requires a large set of aligned input-title pairs for training. 33 | 34 | To provide these pairs we use the [Annotated Gigaword 35 | corpus](https://catalog.ldc.upenn.edu/LDC2012T21) as our main data 36 | set. The corpus is available on LDC, but it requires membership. Once 37 | the annotated gigaword is obtained, you can simply run the provided 38 | script to extract the data set in text format. 39 | 40 | ### Generating the data 41 | 42 | To construct the data set run the following script to produce `working_dir/`, 43 | where `working_dir/' is the path to the directory where you want to store the 44 | processed data. The script 'construct_data.sh' makes use of the 'parallel' 45 | utility, so please make sure that it is in your path. 46 | WARNING: This may take a couple hours to run. 47 | 48 | > ./construct_data.sh agiga/ working_dir/ 49 | 50 | ### Format of the data files 51 | 52 | The above command builds aligned files of the form split.type.txt where split 53 | is train/valid/test and type is title/article. 54 | 55 | The output of the script is several aligned plain-text files. 56 | Each has one title or article per line. 57 | 58 | > head train.title.txt 59 | australian current account deficit narrows sharply 60 | at least two dead in southern philippines blast 61 | australian stocks close down #.# percent 62 | envoy urges north korea to restart nuclear disablement 63 | skorea announces tax cuts to stimulate economy 64 | 65 | These files can be used to train the ABS system or be used by other baseline models. 66 | 67 | ## Training the Model 68 | 69 | Once the data set has been constructed, we provide a simple script to train 70 | the model. 71 | 72 | > ./train_model.sh working_dir/ model.th 73 | 74 | 75 | The training process consists of two stages. First we convert the text 76 | files into generic input-title matrices and then we train a 77 | conditional NNLM on this representation. 78 | 79 | Once the model has been fully trained (this may require 3-4 days), 80 | you can use the test script to produce summaries of any plain text file.w 81 | 82 | > ./test_model.sh working_dir/valid.article.filter.txt model.th length_of_summary 83 | 84 | 85 | ### Training options 86 | 87 | These scripts utilize the Torch code available in `$ABS/summary/` 88 | 89 | There are two main torch entry points. One for training the model 90 | from data matrices and the other for evaluating the model on plain-text. 91 | 92 | > th summary/train.lua -help 93 | 94 | Train a summarization model. 95 | 96 | -articleDir Directory containing article training matrices. 
[] 97 | -titleDir Directory containing title training matrices. [] 98 | -validArticleDir Directory containing article matricess for validation. [] 99 | -validTitleDir Directory containing title matrices for validation. [] 100 | -auxModel The encoder model to use. [bow] 101 | -bowDim Article embedding size. [50] 102 | -attenPool Attention model pooling size. [5] 103 | -hiddenUnits Conv net encoder hidden units. [1000] 104 | -kernelWidth Conv net encoder kernel width. [5] 105 | -epochs Number of epochs to train. [5] 106 | -miniBatchSize Size of training minibatch. [64] 107 | -printEvery How often to print during training. [1000] 108 | -modelFilename File for saving loading/model. [] 109 | -window Size of NNLM window. [5] 110 | -embeddingDim Size of NNLM embeddings. [50] 111 | -hiddenSize Size of NNLM hidden layer. [100] 112 | -learningRate SGD learning rate. [0.1] 113 | 114 | 115 | 116 | ### Testing options 117 | 118 | 119 | The run script is used for beam-search decoding with a trained 120 | model. See the paper for a description of the extractive 121 | features used at decoding time. 122 | 123 | > th summary/run.lua -help 124 | 125 | -blockRepeatWords Disallow generating a repeated word. [false] 126 | -allowUNK Allow generating . [false] 127 | -fixedLength Produce exactly -length words. [true] 128 | -lmWeight Weight for main model. [1] 129 | -beamSize Size of the beam. [100] 130 | -extractive Force fully extractive summary. [false] 131 | -lmWeight Feature weight for the neural model. [1] 132 | -unigramBonus Feature weight for unigram extraction. [0] 133 | -bigramBonus Feature weight for bigram extraction. [0] 134 | -trigramBonus Feature weight for trigram extraction. [0] 135 | -lengthBonus Feature weight for length. [0] 136 | -unorderBonus Feature weight for out-of-order extraction. [0] 137 | -modelFilename Model to test. [] 138 | -inputf Input article files. [] 139 | -nbest Write out the nbest list in ZMert format. [false] 140 | -length Maximum length of summary.. [5] 141 | 142 | 143 | 144 | ## Evaluation Data Sets 145 | 146 | We evaluate the ABS model using the shared task from the Document Understanding Conference (DUC). 147 | 148 | This release also includes code for interactive with the DUC shared 149 | task on headline generation. The scripts for processing and evaluating 150 | on this data set are in the DUC/ directory. 151 | 152 | The [DUC data set](http://duc.nist.gov/duc2004/tasks.html) is 153 | available online, unfortunately you must manually fill out a form to 154 | request the data from NIST. Send the request to 155 | [Angela Ellis](mailto:angela.ellis@nist.gov). 156 | 157 | ### Processing DUC 158 | 159 | After receiving credentials you should obtain a series of 160 | tar files containing the data used as part of this shared task. 161 | 162 | 1. Make a directory DUC_data/ which should contain the given files 163 | 164 | 165 | >DUC2003\_Summarization\_Documents.tgz 166 | >DUC2004\_Summarization\_Documents.tgz 167 | >duc2004\_results.tgz 168 | >detagged.duc2003.abstracts.tar.gz 169 | 170 | 2. Run the setup script (this requires python and NLTK for tokenization) 171 | 172 | 173 | > ./DUC/setup.sh DUC_data/ 174 | 175 | 176 | After running the scripts there should be directories 177 | 178 | DUC_data/clean_2003/ 179 | DUC_data/clean_2004/ 180 | 181 | 182 | Each contains a file input.txt where each line is a tokenized first line of an article. 
183 | 184 | 185 | > head DUC_data/clean_2003/input.txt 186 | schizophrenia patients whose medication could n't stop the imaginary voices in their heads gained some relief after researchers repeatedly sent a magnetic field into a small area of their brains . 187 | scientists trying to fathom the mystery of schizophrenia say they have found the strongest evidence to date that the disabling psychiatric disorder is caused by gene abnormalities , according to a researcher at two state universities . 188 | a yale school of medicine study is expanding upon what scientists know about the link between schizophrenia and nicotine addiction . 189 | exploring chaos in a search for order , scientists who study the reality-shattering mental disease schizophrenia are becoming fascinated by the chemical environment of areas of the brain where perception is regulated . 190 | 191 | As well as a set of references: 192 | 193 | 194 | > head DUC_data/clean_2003/references/task1_ref0.txt 195 | Magnetic treatment may ease or lessen occurrence of schizophrenic voices. 196 | Evidence shows schizophrenia caused by gene abnormalities of Chromosome 1. 197 | Researchers examining evidence of link between schizophrenia and nicotine addiction. 198 | Scientists focusing on chemical environment of brain to understand schizophrenia. 199 | Schizophrenia study shows disparity between what's known and what's provided to patients. 200 | 201 | System output should be added to the directory system/task1_{name}.txt. For instance the script includes a baseline PREFIX system. 202 | 203 | 204 | DUC_data/clean_2003/references/task1_prefix.txt 205 | 206 | 207 | ### ROUGE for Eval 208 | 209 | To evaluate the summaries you will need the [ROUGE eval system](http://research.microsoft.com/~cyl/download/ROUGE-1.5.5.tgz). 210 | 211 | The ROUGE script requires output in a very complex HTML form. 212 | To simplify this process we include a script to convert the 213 | simple output to one that ROUGE can handle. 214 | 215 | Export the ROUGE directory `export ROUGE={path_to_rouge}` and then run the eval scripts 216 | 217 | 218 | > ./DUC/eval.sh DUC_data/clean_2003/ 219 | FULL LENGTH 220 | --------------------------------------------- 221 | prefix ROUGE-1 Average_R: 0.17831 (95%-conf.int. 0.16916 - 0.18736) 222 | prefix ROUGE-1 Average_P: 0.15445 (95%-conf.int. 0.14683 - 0.16220) 223 | prefix ROUGE-1 Average_F: 0.16482 (95%-conf.int. 0.15662 - 0.17318) 224 | --------------------------------------------- 225 | prefix ROUGE-2 Average_R: 0.04936 (95%-conf.int. 0.04420 - 0.05452) 226 | prefix ROUGE-2 Average_P: 0.04257 (95%-conf.int. 0.03794 - 0.04710) 227 | prefix ROUGE-2 Average_F: 0.04550 (95%-conf.int. 0.04060 - 0.05026) 228 | 229 | 230 | ## Tuning Feature Weights 231 | 232 | For our system ABS+ we additionally tune extractive features on the DUC 233 | summarization data. The final features we obtained our distributed with the 234 | system as `tuning/params.best.txt`. 235 | 236 | The MERT tuning code itself is located in the `tuning/` directory. Our setup 237 | uses [ZMert](http://cs.jhu.edu/~ozaidan/zmert/) for this process. 238 | 239 | It should be straightforward to tune the system on any developments 240 | summarization data. Take the following steps to run tuning on the 241 | DUC-2003 data set described above. 242 | 243 | First copy over reference files to the tuning directoy. 
For instance to tune on DUC-2003: 244 | 245 | ln -s DUC_data/clean_2003/references/task1_ref0.txt tuning/ref.0 246 | ln -s DUC_data/clean_2003/references/task1_ref1.txt tuning/ref.1 247 | ln -s DUC_data/clean_2003/references/task1_ref2.txt tuning/ref.2 248 | ln -s DUC_data/clean_2003/references/task1_ref3.txt tuning/ref.3 249 | 250 | Next copy the SDecoder template, `cp SDecoder_cmd.tpl SDecoder_cmd.py` 251 | and modify the `SDecoder_cmd.py` to point to the model and input text. 252 | 253 | {"model" : "model.th", 254 | "src" : "/data/users/sashar/DUC_data/clean_2003/input.txt", 255 | "title_len" : 14} 256 | 257 | 258 | Now you should be able to run Z-MERT and let it do its thing. 259 | 260 | > cd tuning/; java -cp zmert/lib/zmert.jar ZMERT ZMERT_cfg.txt 261 | 262 | When Z-MERT has finished you can run on new data using command: 263 | 264 | > python SDecoder_test.py input.txt model.th 265 | -------------------------------------------------------------------------------- /construct_data.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | export AGIGA=$1 4 | export WORK=$2 5 | export THREADS=30 6 | export SCRIPTS=$ABS/dataset 7 | export SPLITS=$ABS/dataset 8 | export UNK=5 9 | 10 | echo "Step 1: Construct the title-article pairs from gigaword" 11 | mkdir -p $WORK 12 | find $AGIGA/???/*.xml.gz | parallel --gnu --progress -j $THREADS python2.7 $SCRIPTS/process_agiga.py \{\} $WORK 13 | 14 | 15 | echo "Step 2: Compile the data into train/dev/test." 16 | cd $WORK 17 | cat $SPLITS/train.splits | xargs cat > train.data.txt 18 | cat $SPLITS/valid.splits | xargs cat > valid.data.txt 19 | cat $SPLITS/test.splits | xargs cat > test.data.txt 20 | 21 | 22 | echo "Step 3: Basic filtering on train/dev." 23 | python2.7 $SCRIPTS/filter.py train.data.txt > train.data.filter.txt 24 | python2.7 $SCRIPTS/filter.py valid.data.txt > valid.data.filter.txt 25 | 26 | 27 | echo "Step 4: Compile dictionary." 28 | python2.7 $SCRIPTS/make_dict.py $WORK/train.data.filter.txt $WORK/train $UNK 29 | 30 | 31 | echo "Step 5: Construct title-article files." 32 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/train.data.filter.txt > $WORK/train.title.txt 33 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/train.data.filter.txt > $WORK/train.article.txt 34 | 35 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.txt > $WORK/valid.title.txt 36 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.txt > $WORK/valid.article.txt 37 | 38 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.filter.txt > $WORK/valid.title.filter.txt 39 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.filter.txt > $WORK/valid.article.filter.txt 40 | 41 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/test.data.txt > $WORK/test.title.txt 42 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/test.data.txt > $WORK/test.article.txt 43 | 44 | 45 | echo "Step 6: Constructing torch data files." 46 | bash $ABS/prep_torch_data.sh $WORK 47 | -------------------------------------------------------------------------------- /dataset/filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import sys 14 | #@lint-avoid-python-3-compatibility-imports 15 | 16 | def get_words(parse): 17 | return [w.strip(")") 18 | for w in parse.split() 19 | if w[-1] == ')'] 20 | 21 | for l in open(sys.argv[1]): 22 | splits = l.strip().split("\t") 23 | if len(splits) != 4: 24 | continue 25 | title_parse, article_parse, title, article = splits 26 | title_words = title.split() 27 | article_words = article.split() 28 | 29 | # No blanks. 30 | if any((word == "" for word in title_words)): 31 | continue 32 | 33 | if any((word == "" for word in article_words)): 34 | continue 35 | 36 | if not any((word == "." for word in article_words)): 37 | continue 38 | 39 | # Spurious words to blacklist. 40 | # First set is words that never appear in input and output 41 | # Second set is punctuation and non-title words. 42 | bad_words = ['update#', 'update', 'recasts', 'undated', 'grafs', 'corrects', 43 | 'retransmitting', 'updates', 'dateline', 'writethru', 44 | 'recaps', 'inserts', 'incorporates', 'adv##', 45 | 'ld-writethru', 'djlfx', 'edits', 'byline', 46 | 'repetition', 'background', 'thruout', 'quotes', 47 | 'attention', 'ny###', 'overline', 'embargoed', 'ap', 'gmt', 48 | 'adds', 'embargo', 49 | 'urgent', '?', ' i ', ' : ', ' - ', ' by ', '-lrb-', '-rrb-'] 50 | if any((bad in title.lower() 51 | for bad in bad_words)): 52 | continue 53 | 54 | # Reasonable lengths 55 | if not (10 < len(article_words) < 100 and 56 | 3 < len(title_words) < 50): 57 | continue 58 | 59 | # Some word match. 60 | matches = len(set([w.lower() for w in title_words if len(w) > 3]) & 61 | set([w.lower() for w in article_words if len(w) > 3])) 62 | if matches < 1: 63 | continue 64 | 65 | # Okay, print. 66 | print(l.strip()) 67 | -------------------------------------------------------------------------------- /dataset/make_dict.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 
8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import sys 14 | from collections import Counter 15 | #@lint-avoid-python-3-compatibility-imports 16 | 17 | title_words = Counter() 18 | article_words = Counter() 19 | limit = int(sys.argv[3]) 20 | 21 | for l in open(sys.argv[1]): 22 | splits = l.strip().split("\t") 23 | if len(splits) != 4: 24 | continue 25 | title_parse, article_parse, title, article = l.strip().split("\t") 26 | title_words.update(title.lower().split()) 27 | article_words.update(article.lower().split()) 28 | 29 | with open(sys.argv[2] + ".article.dict", "w") as f: 30 | print >>f, "", 1e5 31 | print >>f, "", 1e5 32 | print >>f, "", 1e5 33 | for word, count in article_words.most_common(): 34 | if count < limit: 35 | break 36 | print >>f, word, count 37 | 38 | with open(sys.argv[2] + ".title.dict", "w") as f: 39 | print >>f, "", 1e5 40 | print >>f, "", 1e5 41 | print >>f, "", 1e5 42 | for word, count in title_words.most_common(): 43 | if count < limit: 44 | break 45 | print >>f, word, count 46 | -------------------------------------------------------------------------------- /dataset/process_agiga.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | #/usr/bin/env python 14 | 15 | import sys 16 | import os 17 | import re 18 | import gzip 19 | #@lint-avoid-python-3-compatibility-imports 20 | 21 | # Make directory for output if it doesn't exist 22 | 23 | try: 24 | os.mkdir(sys.argv[2] + "/" + sys.argv[1].split("/")[-2]) 25 | except OSError: 26 | pass 27 | 28 | # Strip off .gz ending 29 | end = "/".join(sys.argv[1].split("/")[-2:])[:-len(".xml.gz")] + ".txt" 30 | 31 | out = open(sys.argv[2] + end, "w") 32 | 33 | # Parse and print titles and articles 34 | NONE, HEAD, NEXT, TEXT = 0, 1, 2, 3 35 | MODE = NONE 36 | title_parse = "" 37 | article_parse = [] 38 | 39 | # FIX: Some parses are mis-parenthesized. 40 | def fix_paren(parse): 41 | if len(parse) < 2: 42 | return parse 43 | if parse[0] == "(" and parse[1] == " ": 44 | return parse[2:-1] 45 | return parse 46 | 47 | def get_words(parse): 48 | words = [] 49 | for w in parse.split(): 50 | if w[-1] == ')': 51 | words.append(w.strip(")")) 52 | if words[-1] == ".": 53 | break 54 | return words 55 | 56 | def remove_digits(parse): 57 | return re.sub(r'\d', '#', parse) 58 | 59 | for l in gzip.open(sys.argv[1]): 60 | if MODE == HEAD: 61 | title_parse = remove_digits(fix_paren(l.strip())) 62 | MODE = NEXT 63 | 64 | if MODE == TEXT: 65 | article_parse.append(remove_digits(fix_paren(l.strip()))) 66 | 67 | if MODE == NONE and l.strip() == "": 68 | MODE = HEAD 69 | 70 | if MODE == NEXT and l.strip() == "
<TEXT>": 71 | MODE = TEXT 72 | 73 | if MODE == TEXT and l.strip() == "</TEXT>
": 74 | articles = [] 75 | # Annotated gigaword has a poor sentence segmenter. 76 | # Ensure there is a least a period. 77 | 78 | for i in range(len(article_parse)): 79 | articles.append(article_parse[i]) 80 | if "(. .)" in article_parse[i]: 81 | break 82 | 83 | article_parse = "(TOP " + " ".join(articles) + ")" 84 | 85 | # title_parse \t article_parse \t title \t article 86 | print >>out, "\t".join([title_parse, article_parse, 87 | " ".join(get_words(title_parse)), 88 | " ".join(get_words(article_parse))]) 89 | article_parse = [] 90 | MODE = NONE 91 | -------------------------------------------------------------------------------- /dataset/pull.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """ 14 | Pull out elements of the title-article file. 15 | """ 16 | import sys 17 | #@lint-avoid-python-3-compatibility-imports 18 | 19 | words_dict = set([l.split()[0] 20 | for l in open(sys.argv[2])]) 21 | 22 | for l in sys.stdin: 23 | splits = l.strip().split("\t") 24 | if len(splits) != 4: 25 | continue 26 | title_parse, article_parse, title, article = l.strip().split("\t") 27 | if sys.argv[1] == "src": 28 | print(article) 29 | elif sys.argv[1] == "trg": 30 | print(title) 31 | elif sys.argv[1] == "src_lc": 32 | words = [w if w in words_dict else "" 33 | for w in article.lower().split()] 34 | print(" ".join(words)) 35 | elif sys.argv[1] == "trg_lc": 36 | t = title.lower() 37 | words = [w if w in words_dict else "" 38 | for w in t.split() 39 | if w not in ['"', "'", "''", "!", "=", "-", 40 | "--", ",", "?", ".", 41 | "``", "`", "-rrb-", "-llb-", "\\/"]] 42 | print(" ".join(words)) 43 | elif sys.argv[1] == "srctree": 44 | print(article_parse) 45 | elif sys.argv[1] == "interleave": 46 | # Format needed for T3 47 | print(article_parse) 48 | print(title_parse) 49 | -------------------------------------------------------------------------------- /dataset/small_train.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_201004.txt 2 | AFP/afp_eng_200212.txt 3 | AFP/afp_eng_200401.txt 4 | AFP/afp_eng_199508.txt 5 | AFP/afp_eng_200610.txt 6 | AFP/afp_eng_201007.txt 7 | APW/apw_eng_200105.txt 8 | APW/apw_eng_200408.txt 9 | APW/apw_eng_201001.txt 10 | APW/apw_eng_200906.txt 11 | APW/apw_eng_200606.txt 12 | APW/apw_eng_200211.txt 13 | APW/apw_eng_200512.txt 14 | APW/apw_eng_200505.txt 15 | CNA/cna_eng_199910.txt 16 | CNA/cna_eng_199905.txt 17 | CNA/cna_eng_200905.txt 18 | CNA/cna_eng_200101.txt 19 | CNA/cna_eng_200105.txt 20 | CNA/cna_eng_200201.txt 21 | LTW/ltw_eng_199806.txt 22 | LTW/ltw_eng_200702.txt 23 | LTW/ltw_eng_200607.txt 24 | LTW/ltw_eng_200708.txt 25 | LTW/ltw_eng_200501.txt 26 | NYT/nyt_eng_200807.txt 27 | NYT/nyt_eng_200612.txt 28 | NYT/nyt_eng_199608.txt 29 | NYT/nyt_eng_200106.txt 30 | NYT/nyt_eng_200311.txt 31 | NYT/nyt_eng_200702.txt 32 | NYT/nyt_eng_201007.txt 33 | NYT/nyt_eng_200212.txt 34 | XIN/xin_eng_199506.txt 35 | XIN/xin_eng_200311.txt 36 | XIN/xin_eng_199703.txt 37 | XIN/xin_eng_200305.txt 38 | XIN/xin_eng_199808.txt 39 | XIN/xin_eng_199609.txt 40 | XIN/xin_eng_200109.txt 41 | XIN/xin_eng_200706.txt 
42 | -------------------------------------------------------------------------------- /dataset/test.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_199511.txt 2 | AFP/afp_eng_200606.txt 3 | AFP/afp_eng_199703.txt 4 | AFP/afp_eng_200811.txt 5 | AFP/afp_eng_199604.txt 6 | AFP/afp_eng_200704.txt 7 | AFP/afp_eng_200701.txt 8 | APW/apw_eng_200412.txt 9 | APW/apw_eng_200908.txt 10 | APW/apw_eng_199605.txt 11 | APW/apw_eng_200305.txt 12 | APW/apw_eng_200506.txt 13 | APW/apw_eng_199608.txt 14 | APW/apw_eng_199808.txt 15 | APW/apw_eng_200708.txt 16 | APW/apw_eng_199707.txt 17 | CNA/cna_eng_200910.txt 18 | CNA/cna_eng_200103.txt 19 | CNA/cna_eng_200308.txt 20 | CNA/cna_eng_200904.txt 21 | CNA/cna_eng_201012.txt 22 | CNA/cna_eng_201007.txt 23 | CNA/cna_eng_200112.txt 24 | LTW/ltw_eng_200605.txt 25 | LTW/ltw_eng_200608.txt 26 | LTW/ltw_eng_200312.txt 27 | LTW/ltw_eng_200906.txt 28 | LTW/ltw_eng_200606.txt 29 | LTW/ltw_eng_200805.txt 30 | NYT/nyt_eng_201005.txt 31 | NYT/nyt_eng_200305.txt 32 | NYT/nyt_eng_200505.txt 33 | NYT/nyt_eng_199612.txt 34 | NYT/nyt_eng_199910.txt 35 | NYT/nyt_eng_199809.txt 36 | NYT/nyt_eng_201004.txt 37 | NYT/nyt_eng_200808.txt 38 | NYT/nyt_eng_200601.txt 39 | XIN/xin_eng_199704.txt 40 | XIN/xin_eng_200310.txt 41 | XIN/xin_eng_200711.txt 42 | XIN/xin_eng_200804.txt 43 | XIN/xin_eng_200902.txt 44 | XIN/xin_eng_200106.txt 45 | XIN/xin_eng_199802.txt 46 | XIN/xin_eng_200411.txt 47 | XIN/xin_eng_200511.txt 48 | -------------------------------------------------------------------------------- /dataset/train.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_200809.txt 2 | AFP/afp_eng_199412.txt 3 | AFP/afp_eng_200311.txt 4 | AFP/afp_eng_199512.txt 5 | AFP/afp_eng_200203.txt 6 | AFP/afp_eng_200204.txt 7 | AFP/afp_eng_200608.txt 8 | AFP/afp_eng_200509.txt 9 | AFP/afp_eng_200410.txt 10 | AFP/afp_eng_200405.txt 11 | AFP/afp_eng_200211.txt 12 | AFP/afp_eng_200205.txt 13 | AFP/afp_eng_199405.txt 14 | AFP/afp_eng_199510.txt 15 | AFP/afp_eng_199611.txt 16 | AFP/afp_eng_199612.txt 17 | AFP/afp_eng_200907.txt 18 | AFP/afp_eng_200412.txt 19 | AFP/afp_eng_201002.txt 20 | AFP/afp_eng_200910.txt 21 | AFP/afp_eng_199504.txt 22 | AFP/afp_eng_200207.txt 23 | AFP/afp_eng_199501.txt 24 | AFP/afp_eng_200812.txt 25 | AFP/afp_eng_200307.txt 26 | AFP/afp_eng_199608.txt 27 | AFP/afp_eng_200303.txt 28 | AFP/afp_eng_200304.txt 29 | AFP/afp_eng_199409.txt 30 | AFP/afp_eng_200202.txt 31 | AFP/afp_eng_199610.txt 32 | AFP/afp_eng_199503.txt 33 | AFP/afp_eng_200904.txt 34 | AFP/afp_eng_200212.txt 35 | AFP/afp_eng_201010.txt 36 | AFP/afp_eng_200901.txt 37 | AFP/afp_eng_200702.txt 38 | AFP/afp_eng_199609.txt 39 | AFP/afp_eng_200806.txt 40 | AFP/afp_eng_200805.txt 41 | AFP/afp_eng_200408.txt 42 | AFP/afp_eng_200611.txt 43 | AFP/afp_eng_201012.txt 44 | AFP/afp_eng_200501.txt 45 | AFP/afp_eng_200706.txt 46 | AFP/afp_eng_200505.txt 47 | AFP/afp_eng_199602.txt 48 | AFP/afp_eng_199601.txt 49 | AFP/afp_eng_200607.txt 50 | AFP/afp_eng_200404.txt 51 | AFP/afp_eng_200406.txt 52 | AFP/afp_eng_200912.txt 53 | AFP/afp_eng_200306.txt 54 | AFP/afp_eng_200312.txt 55 | AFP/afp_eng_199506.txt 56 | AFP/afp_eng_199701.txt 57 | AFP/afp_eng_199505.txt 58 | AFP/afp_eng_199606.txt 59 | AFP/afp_eng_200512.txt 60 | AFP/afp_eng_200711.txt 61 | AFP/afp_eng_200603.txt 62 | AFP/afp_eng_200504.txt 63 | AFP/afp_eng_200310.txt 64 | AFP/afp_eng_200209.txt 65 | AFP/afp_eng_199411.txt 66 | AFP/afp_eng_199509.txt 67 | AFP/afp_eng_200903.txt 
68 | AFP/afp_eng_200707.txt 69 | AFP/afp_eng_200705.txt 70 | AFP/afp_eng_199603.txt 71 | AFP/afp_eng_200112.txt 72 | AFP/afp_eng_200502.txt 73 | AFP/afp_eng_200508.txt 74 | AFP/afp_eng_200403.txt 75 | AFP/afp_eng_199705.txt 76 | AFP/afp_eng_200908.txt 77 | AFP/afp_eng_200206.txt 78 | AFP/afp_eng_200906.txt 79 | AFP/afp_eng_199507.txt 80 | AFP/afp_eng_201001.txt 81 | AFP/afp_eng_199407.txt 82 | AFP/afp_eng_201004.txt 83 | AFP/afp_eng_200208.txt 84 | AFP/afp_eng_200902.txt 85 | AFP/afp_eng_200710.txt 86 | AFP/afp_eng_200503.txt 87 | AFP/afp_eng_200905.txt 88 | AFP/afp_eng_200712.txt 89 | AFP/afp_eng_200402.txt 90 | AFP/afp_eng_200807.txt 91 | AFP/afp_eng_200804.txt 92 | AFP/afp_eng_201006.txt 93 | AFP/afp_eng_200511.txt 94 | AFP/afp_eng_200802.txt 95 | AFP/afp_eng_201008.txt 96 | AFP/afp_eng_200309.txt 97 | AFP/afp_eng_200301.txt 98 | AFP/afp_eng_200612.txt 99 | AFP/afp_eng_199704.txt 100 | AFP/afp_eng_200604.txt 101 | AFP/afp_eng_199410.txt 102 | AFP/afp_eng_200911.txt 103 | AFP/afp_eng_200510.txt 104 | AFP/afp_eng_200803.txt 105 | AFP/afp_eng_201009.txt 106 | AFP/afp_eng_200810.txt 107 | AFP/afp_eng_200610.txt 108 | AFP/afp_eng_200507.txt 109 | AFP/afp_eng_200708.txt 110 | AFP/afp_eng_200201.txt 111 | AFP/afp_eng_200801.txt 112 | AFP/afp_eng_200407.txt 113 | AFP/afp_eng_200305.txt 114 | AFP/afp_eng_199408.txt 115 | AFP/afp_eng_200210.txt 116 | AFP/afp_eng_199607.txt 117 | AFP/afp_eng_201003.txt 118 | AFP/afp_eng_200605.txt 119 | AFP/afp_eng_201011.txt 120 | AFP/afp_eng_201007.txt 121 | AFP/afp_eng_200401.txt 122 | AFP/afp_eng_200602.txt 123 | AFP/afp_eng_201005.txt 124 | AFP/afp_eng_200709.txt 125 | AFP/afp_eng_200302.txt 126 | AFP/afp_eng_200909.txt 127 | AFP/afp_eng_200609.txt 128 | AFP/afp_eng_200808.txt 129 | AFP/afp_eng_200411.txt 130 | AFP/afp_eng_199508.txt 131 | AFP/afp_eng_199605.txt 132 | AFP/afp_eng_200409.txt 133 | APW/apw_eng_201001.txt 134 | APW/apw_eng_199501.txt 135 | APW/apw_eng_200307.txt 136 | APW/apw_eng_200902.txt 137 | APW/apw_eng_200303.txt 138 | APW/apw_eng_200304.txt 139 | APW/apw_eng_200503.txt 140 | APW/apw_eng_200905.txt 141 | APW/apw_eng_200111.txt 142 | APW/apw_eng_200301.txt 143 | APW/apw_eng_199712.txt 144 | APW/apw_eng_199612.txt 145 | APW/apw_eng_200011.txt 146 | APW/apw_eng_199503.txt 147 | APW/apw_eng_200106.txt 148 | APW/apw_eng_200802.txt 149 | APW/apw_eng_200007.txt 150 | APW/apw_eng_199905.txt 151 | APW/apw_eng_201009.txt 152 | APW/apw_eng_200109.txt 153 | APW/apw_eng_200612.txt 154 | APW/apw_eng_200702.txt 155 | APW/apw_eng_199609.txt 156 | APW/apw_eng_199909.txt 157 | APW/apw_eng_199702.txt 158 | APW/apw_eng_200805.txt 159 | APW/apw_eng_199902.txt 160 | APW/apw_eng_201011.txt 161 | APW/apw_eng_200107.txt 162 | APW/apw_eng_200611.txt 163 | APW/apw_eng_200904.txt 164 | APW/apw_eng_200006.txt 165 | APW/apw_eng_200505.txt 166 | APW/apw_eng_200810.txt 167 | APW/apw_eng_199801.txt 168 | APW/apw_eng_200808.txt 169 | APW/apw_eng_200607.txt 170 | APW/apw_eng_200404.txt 171 | APW/apw_eng_199803.txt 172 | APW/apw_eng_199611.txt 173 | APW/apw_eng_200406.txt 174 | APW/apw_eng_200211.txt 175 | APW/apw_eng_199911.txt 176 | APW/apw_eng_200912.txt 177 | APW/apw_eng_200809.txt 178 | APW/apw_eng_199710.txt 179 | APW/apw_eng_199907.txt 180 | APW/apw_eng_199607.txt 181 | APW/apw_eng_199506.txt 182 | APW/apw_eng_200605.txt 183 | APW/apw_eng_199502.txt 184 | APW/apw_eng_199505.txt 185 | APW/apw_eng_200811.txt 186 | APW/apw_eng_200401.txt 187 | APW/apw_eng_200602.txt 188 | APW/apw_eng_200512.txt 189 | APW/apw_eng_200711.txt 190 | APW/apw_eng_200909.txt 191 | 
APW/apw_eng_200201.txt 192 | APW/apw_eng_200202.txt 193 | APW/apw_eng_200103.txt 194 | APW/apw_eng_199604.txt 195 | APW/apw_eng_199508.txt 196 | APW/apw_eng_199711.txt 197 | APW/apw_eng_200310.txt 198 | APW/apw_eng_200209.txt 199 | APW/apw_eng_199809.txt 200 | APW/apw_eng_199411.txt 201 | APW/apw_eng_200003.txt 202 | APW/apw_eng_200903.txt 203 | APW/apw_eng_199903.txt 204 | APW/apw_eng_199512.txt 205 | APW/apw_eng_200104.txt 206 | APW/apw_eng_201006.txt 207 | APW/apw_eng_200005.txt 208 | APW/apw_eng_200405.txt 209 | APW/apw_eng_199906.txt 210 | APW/apw_eng_199904.txt 211 | APW/apw_eng_199510.txt 212 | APW/apw_eng_200112.txt 213 | APW/apw_eng_200508.txt 214 | APW/apw_eng_200108.txt 215 | APW/apw_eng_200403.txt 216 | APW/apw_eng_201010.txt 217 | APW/apw_eng_200906.txt 218 | APW/apw_eng_201002.txt 219 | APW/apw_eng_200910.txt 220 | APW/apw_eng_199806.txt 221 | APW/apw_eng_200806.txt 222 | APW/apw_eng_199504.txt 223 | APW/apw_eng_200207.txt 224 | APW/apw_eng_201004.txt 225 | APW/apw_eng_200208.txt 226 | APW/apw_eng_199709.txt 227 | APW/apw_eng_200812.txt 228 | APW/apw_eng_200710.txt 229 | APW/apw_eng_200410.txt 230 | APW/apw_eng_200712.txt 231 | APW/apw_eng_200001.txt 232 | APW/apw_eng_201012.txt 233 | APW/apw_eng_200402.txt 234 | APW/apw_eng_200804.txt 235 | APW/apw_eng_199610.txt 236 | APW/apw_eng_200009.txt 237 | APW/apw_eng_200511.txt 238 | APW/apw_eng_199602.txt 239 | APW/apw_eng_199601.txt 240 | APW/apw_eng_200901.txt 241 | APW/apw_eng_199704.txt 242 | APW/apw_eng_200308.txt 243 | APW/apw_eng_200604.txt 244 | APW/apw_eng_200701.txt 245 | APW/apw_eng_200704.txt 246 | APW/apw_eng_199603.txt 247 | APW/apw_eng_200408.txt 248 | APW/apw_eng_200911.txt 249 | APW/apw_eng_199511.txt 250 | APW/apw_eng_200510.txt 251 | APW/apw_eng_200803.txt 252 | APW/apw_eng_199802.txt 253 | APW/apw_eng_200501.txt 254 | APW/apw_eng_200706.txt 255 | APW/apw_eng_200610.txt 256 | APW/apw_eng_199804.txt 257 | APW/apw_eng_200507.txt 258 | APW/apw_eng_200801.txt 259 | APW/apw_eng_199908.txt 260 | APW/apw_eng_201007.txt 261 | APW/apw_eng_200601.txt 262 | APW/apw_eng_200306.txt 263 | APW/apw_eng_200407.txt 264 | APW/apw_eng_200212.txt 265 | APW/apw_eng_199910.txt 266 | APW/apw_eng_200004.txt 267 | APW/apw_eng_200312.txt 268 | APW/apw_eng_201003.txt 269 | APW/apw_eng_199701.txt 270 | APW/apw_eng_200008.txt 271 | APW/apw_eng_200012.txt 272 | APW/apw_eng_201005.txt 273 | APW/apw_eng_200709.txt 274 | APW/apw_eng_200105.txt 275 | APW/apw_eng_200302.txt 276 | APW/apw_eng_200101.txt 277 | APW/apw_eng_200609.txt 278 | APW/apw_eng_200603.txt 279 | APW/apw_eng_199901.txt 280 | APW/apw_eng_200002.txt 281 | APW/apw_eng_200504.txt 282 | APW/apw_eng_200606.txt 283 | APW/apw_eng_200409.txt 284 | APW/apw_eng_199509.txt 285 | APW/apw_eng_199412.txt 286 | APW/apw_eng_200311.txt 287 | APW/apw_eng_200203.txt 288 | APW/apw_eng_200703.txt 289 | APW/apw_eng_200707.txt 290 | APW/apw_eng_200509.txt 291 | APW/apw_eng_200102.txt 292 | APW/apw_eng_200705.txt 293 | APW/apw_eng_201008.txt 294 | APW/apw_eng_200807.txt 295 | APW/apw_eng_200502.txt 296 | APW/apw_eng_200110.txt 297 | APW/apw_eng_200010.txt 298 | APW/apw_eng_199705.txt 299 | APW/apw_eng_199706.txt 300 | APW/apw_eng_200206.txt 301 | APW/apw_eng_199703.txt 302 | APW/apw_eng_199805.txt 303 | APW/apw_eng_200411.txt 304 | APW/apw_eng_199507.txt 305 | CNA/cna_eng_200608.txt 306 | CNA/cna_eng_200906.txt 307 | CNA/cna_eng_200110.txt 308 | CNA/cna_eng_199712.txt 309 | CNA/cna_eng_200609.txt 310 | CNA/cna_eng_199903.txt 311 | CNA/cna_eng_200111.txt 312 | CNA/cna_eng_200712.txt 313 | 
CNA/cna_eng_200808.txt 314 | CNA/cna_eng_200006.txt 315 | CNA/cna_eng_199803.txt 316 | CNA/cna_eng_200811.txt 317 | CNA/cna_eng_200004.txt 318 | CNA/cna_eng_199906.txt 319 | CNA/cna_eng_200009.txt 320 | CNA/cna_eng_200401.txt 321 | CNA/cna_eng_200602.txt 322 | CNA/cna_eng_200802.txt 323 | CNA/cna_eng_200108.txt 324 | CNA/cna_eng_200501.txt 325 | CNA/cna_eng_200106.txt 326 | CNA/cna_eng_200203.txt 327 | CNA/cna_eng_200903.txt 328 | CNA/cna_eng_200812.txt 329 | CNA/cna_eng_200911.txt 330 | CNA/cna_eng_200505.txt 331 | CNA/cna_eng_199710.txt 332 | CNA/cna_eng_200806.txt 333 | CNA/cna_eng_200311.txt 334 | CNA/cna_eng_200507.txt 335 | CNA/cna_eng_200809.txt 336 | CNA/cna_eng_200010.txt 337 | CNA/cna_eng_200312.txt 338 | CNA/cna_eng_199802.txt 339 | CNA/cna_eng_200807.txt 340 | CNA/cna_eng_199908.txt 341 | CNA/cna_eng_200202.txt 342 | CNA/cna_eng_201002.txt 343 | CNA/cna_eng_200512.txt 344 | CNA/cna_eng_200309.txt 345 | CNA/cna_eng_200607.txt 346 | CNA/cna_eng_199711.txt 347 | CNA/cna_eng_199809.txt 348 | CNA/cna_eng_200805.txt 349 | CNA/cna_eng_200610.txt 350 | CNA/cna_eng_200109.txt 351 | CNA/cna_eng_200007.txt 352 | CNA/cna_eng_200703.txt 353 | CNA/cna_eng_200201.txt 354 | CNA/cna_eng_199904.txt 355 | CNA/cna_eng_199806.txt 356 | CNA/cna_eng_200410.txt 357 | CNA/cna_eng_200001.txt 358 | CNA/cna_eng_200709.txt 359 | CNA/cna_eng_200408.txt 360 | CNA/cna_eng_200711.txt 361 | CNA/cna_eng_200101.txt 362 | CNA/cna_eng_201003.txt 363 | CNA/cna_eng_199805.txt 364 | CNA/cna_eng_200012.txt 365 | CNA/cna_eng_199804.txt 366 | CNA/cna_eng_200907.txt 367 | CNA/cna_eng_200502.txt 368 | CNA/cna_eng_200603.txt 369 | CNA/cna_eng_199911.txt 370 | CNA/cna_eng_200902.txt 371 | CNA/cna_eng_200605.txt 372 | CNA/cna_eng_200107.txt 373 | CNA/cna_eng_200611.txt 374 | CNA/cna_eng_201008.txt 375 | CNA/cna_eng_200409.txt 376 | CNA/cna_eng_200412.txt 377 | CNA/cna_eng_200503.txt 378 | CNA/cna_eng_200005.txt 379 | CNA/cna_eng_200905.txt 380 | CNA/cna_eng_200105.txt 381 | CNA/cna_eng_199905.txt 382 | CNA/cna_eng_200511.txt 383 | CNA/cna_eng_199902.txt 384 | CNA/cna_eng_200704.txt 385 | CNA/cna_eng_200901.txt 386 | CNA/cna_eng_199808.txt 387 | CNA/cna_eng_201009.txt 388 | CNA/cna_eng_200810.txt 389 | CNA/cna_eng_201011.txt 390 | CNA/cna_eng_200708.txt 391 | CNA/cna_eng_200402.txt 392 | CNA/cna_eng_200604.txt 393 | CNA/cna_eng_201006.txt 394 | CNA/cna_eng_200008.txt 395 | CNA/cna_eng_201001.txt 396 | CNA/cna_eng_200509.txt 397 | CNA/cna_eng_200510.txt 398 | CNA/cna_eng_200405.txt 399 | CNA/cna_eng_200801.txt 400 | CNA/cna_eng_199912.txt 401 | CNA/cna_eng_200104.txt 402 | CNA/cna_eng_200307.txt 403 | CNA/cna_eng_201010.txt 404 | CNA/cna_eng_200506.txt 405 | CNA/cna_eng_200612.txt 406 | CNA/cna_eng_200706.txt 407 | CNA/cna_eng_200701.txt 408 | CNA/cna_eng_200804.txt 409 | CNA/cna_eng_199709.txt 410 | CNA/cna_eng_200411.txt 411 | CNA/cna_eng_199901.txt 412 | CNA/cna_eng_200002.txt 413 | CNA/cna_eng_200508.txt 414 | CNA/cna_eng_200310.txt 415 | CNA/cna_eng_200908.txt 416 | CNA/cna_eng_199907.txt 417 | CNA/cna_eng_200606.txt 418 | CNA/cna_eng_200601.txt 419 | CNA/cna_eng_200702.txt 420 | CNA/cna_eng_200909.txt 421 | CNA/cna_eng_199807.txt 422 | CNA/cna_eng_199909.txt 423 | CNA/cna_eng_200404.txt 424 | CNA/cna_eng_200403.txt 425 | CNA/cna_eng_200406.txt 426 | CNA/cna_eng_200707.txt 427 | CNA/cna_eng_199910.txt 428 | CNA/cna_eng_200705.txt 429 | CNA/cna_eng_200011.txt 430 | CNA/cna_eng_201004.txt 431 | CNA/cna_eng_199801.txt 432 | LTW/ltw_eng_200405.txt 433 | LTW/ltw_eng_199710.txt 434 | LTW/ltw_eng_200311.txt 435 | 
LTW/ltw_eng_200507.txt 436 | LTW/ltw_eng_200809.txt 437 | LTW/ltw_eng_199801.txt 438 | LTW/ltw_eng_199406.txt 439 | LTW/ltw_eng_200506.txt 440 | LTW/ltw_eng_199704.txt 441 | LTW/ltw_eng_199508.txt 442 | LTW/ltw_eng_200409.txt 443 | LTW/ltw_eng_200412.txt 444 | LTW/ltw_eng_200710.txt 445 | LTW/ltw_eng_200904.txt 446 | LTW/ltw_eng_199603.txt 447 | LTW/ltw_eng_199512.txt 448 | LTW/ltw_eng_200411.txt 449 | LTW/ltw_eng_200603.txt 450 | LTW/ltw_eng_200810.txt 451 | LTW/ltw_eng_200401.txt 452 | LTW/ltw_eng_200410.txt 453 | LTW/ltw_eng_199411.txt 454 | LTW/ltw_eng_200404.txt 455 | LTW/ltw_eng_199705.txt 456 | LTW/ltw_eng_200510.txt 457 | LTW/ltw_eng_199804.txt 458 | LTW/ltw_eng_200705.txt 459 | LTW/ltw_eng_200812.txt 460 | LTW/ltw_eng_200911.txt 461 | LTW/ltw_eng_200502.txt 462 | LTW/ltw_eng_199501.txt 463 | LTW/ltw_eng_199506.txt 464 | LTW/ltw_eng_200611.txt 465 | LTW/ltw_eng_200804.txt 466 | LTW/ltw_eng_199701.txt 467 | LTW/ltw_eng_199711.txt 468 | LTW/ltw_eng_199601.txt 469 | LTW/ltw_eng_199606.txt 470 | LTW/ltw_eng_200704.txt 471 | LTW/ltw_eng_199702.txt 472 | LTW/ltw_eng_200703.txt 473 | LTW/ltw_eng_200308.txt 474 | LTW/ltw_eng_200602.txt 475 | LTW/ltw_eng_199703.txt 476 | LTW/ltw_eng_200708.txt 477 | LTW/ltw_eng_200604.txt 478 | LTW/ltw_eng_200711.txt 479 | LTW/ltw_eng_200909.txt 480 | LTW/ltw_eng_200509.txt 481 | LTW/ltw_eng_200406.txt 482 | LTW/ltw_eng_199612.txt 483 | LTW/ltw_eng_199608.txt 484 | LTW/ltw_eng_200505.txt 485 | LTW/ltw_eng_200912.txt 486 | LTW/ltw_eng_199412.txt 487 | LTW/ltw_eng_200709.txt 488 | LTW/ltw_eng_200910.txt 489 | LTW/ltw_eng_200612.txt 490 | LTW/ltw_eng_199405.txt 491 | LTW/ltw_eng_199510.txt 492 | LTW/ltw_eng_199407.txt 493 | LTW/ltw_eng_200803.txt 494 | LTW/ltw_eng_200607.txt 495 | LTW/ltw_eng_199712.txt 496 | LTW/ltw_eng_199611.txt 497 | LTW/ltw_eng_200609.txt 498 | LTW/ltw_eng_200503.txt 499 | LTW/ltw_eng_199605.txt 500 | LTW/ltw_eng_199709.txt 501 | LTW/ltw_eng_200808.txt 502 | LTW/ltw_eng_200907.txt 503 | LTW/ltw_eng_200902.txt 504 | LTW/ltw_eng_199707.txt 505 | LTW/ltw_eng_200811.txt 506 | LTW/ltw_eng_199409.txt 507 | LTW/ltw_eng_199410.txt 508 | LTW/ltw_eng_200908.txt 509 | LTW/ltw_eng_199609.txt 510 | LTW/ltw_eng_199408.txt 511 | LTW/ltw_eng_200601.txt 512 | LTW/ltw_eng_200402.txt 513 | LTW/ltw_eng_200501.txt 514 | LTW/ltw_eng_199504.txt 515 | LTW/ltw_eng_199805.txt 516 | LTW/ltw_eng_199511.txt 517 | LTW/ltw_eng_199505.txt 518 | LTW/ltw_eng_199610.txt 519 | LTW/ltw_eng_200801.txt 520 | LTW/ltw_eng_200806.txt 521 | LTW/ltw_eng_199802.txt 522 | LTW/ltw_eng_200807.txt 523 | LTW/ltw_eng_199507.txt 524 | LTW/ltw_eng_200309.txt 525 | LTW/ltw_eng_200706.txt 526 | LTW/ltw_eng_200701.txt 527 | LTW/ltw_eng_199708.txt 528 | LTW/ltw_eng_199502.txt 529 | LTW/ltw_eng_200712.txt 530 | LTW/ltw_eng_200511.txt 531 | LTW/ltw_eng_200610.txt 532 | LTW/ltw_eng_200905.txt 533 | LTW/ltw_eng_200901.txt 534 | LTW/ltw_eng_200903.txt 535 | LTW/ltw_eng_199806.txt 536 | LTW/ltw_eng_200508.txt 537 | LTW/ltw_eng_200802.txt 538 | LTW/ltw_eng_200702.txt 539 | LTW/ltw_eng_200408.txt 540 | LTW/ltw_eng_199604.txt 541 | LTW/ltw_eng_200403.txt 542 | LTW/ltw_eng_199607.txt 543 | LTW/ltw_eng_199602.txt 544 | LTW/ltw_eng_200504.txt 545 | LTW/ltw_eng_200707.txt 546 | LTW/ltw_eng_199706.txt 547 | NYT/nyt_eng_200110.txt 548 | NYT/nyt_eng_200904.txt 549 | NYT/nyt_eng_200903.txt 550 | NYT/nyt_eng_200707.txt 551 | NYT/nyt_eng_199505.txt 552 | NYT/nyt_eng_200703.txt 553 | NYT/nyt_eng_200704.txt 554 | NYT/nyt_eng_200103.txt 555 | NYT/nyt_eng_199701.txt 556 | NYT/nyt_eng_199502.txt 557 | 
NYT/nyt_eng_200511.txt 558 | NYT/nyt_eng_200701.txt 559 | NYT/nyt_eng_200602.txt 560 | NYT/nyt_eng_200902.txt 561 | NYT/nyt_eng_200411.txt 562 | NYT/nyt_eng_199411.txt 563 | NYT/nyt_eng_200506.txt 564 | NYT/nyt_eng_201007.txt 565 | NYT/nyt_eng_199711.txt 566 | NYT/nyt_eng_200407.txt 567 | NYT/nyt_eng_200612.txt 568 | NYT/nyt_eng_200709.txt 569 | NYT/nyt_eng_199806.txt 570 | NYT/nyt_eng_201009.txt 571 | NYT/nyt_eng_200509.txt 572 | NYT/nyt_eng_200212.txt 573 | NYT/nyt_eng_200302.txt 574 | NYT/nyt_eng_200909.txt 575 | NYT/nyt_eng_200804.txt 576 | NYT/nyt_eng_200803.txt 577 | NYT/nyt_eng_200812.txt 578 | NYT/nyt_eng_200507.txt 579 | NYT/nyt_eng_200211.txt 580 | NYT/nyt_eng_199705.txt 581 | NYT/nyt_eng_200905.txt 582 | NYT/nyt_eng_200911.txt 583 | NYT/nyt_eng_200907.txt 584 | NYT/nyt_eng_200105.txt 585 | NYT/nyt_eng_199608.txt 586 | NYT/nyt_eng_199808.txt 587 | NYT/nyt_eng_200207.txt 588 | NYT/nyt_eng_200004.txt 589 | NYT/nyt_eng_199703.txt 590 | NYT/nyt_eng_200006.txt 591 | NYT/nyt_eng_199905.txt 592 | NYT/nyt_eng_201006.txt 593 | NYT/nyt_eng_199802.txt 594 | NYT/nyt_eng_199903.txt 595 | NYT/nyt_eng_200705.txt 596 | NYT/nyt_eng_201012.txt 597 | NYT/nyt_eng_200610.txt 598 | NYT/nyt_eng_199801.txt 599 | NYT/nyt_eng_199410.txt 600 | NYT/nyt_eng_200001.txt 601 | NYT/nyt_eng_200202.txt 602 | NYT/nyt_eng_199412.txt 603 | NYT/nyt_eng_199702.txt 604 | NYT/nyt_eng_200112.txt 605 | NYT/nyt_eng_200311.txt 606 | NYT/nyt_eng_199611.txt 607 | NYT/nyt_eng_199912.txt 608 | NYT/nyt_eng_200011.txt 609 | NYT/nyt_eng_200002.txt 610 | NYT/nyt_eng_200710.txt 611 | NYT/nyt_eng_200609.txt 612 | NYT/nyt_eng_201002.txt 613 | NYT/nyt_eng_200403.txt 614 | NYT/nyt_eng_199504.txt 615 | NYT/nyt_eng_200809.txt 616 | NYT/nyt_eng_200504.txt 617 | NYT/nyt_eng_199708.txt 618 | NYT/nyt_eng_201001.txt 619 | NYT/nyt_eng_199610.txt 620 | NYT/nyt_eng_200405.txt 621 | NYT/nyt_eng_200005.txt 622 | NYT/nyt_eng_200611.txt 623 | NYT/nyt_eng_200605.txt 624 | NYT/nyt_eng_199907.txt 625 | NYT/nyt_eng_199601.txt 626 | NYT/nyt_eng_200512.txt 627 | NYT/nyt_eng_199510.txt 628 | NYT/nyt_eng_199901.txt 629 | NYT/nyt_eng_199607.txt 630 | NYT/nyt_eng_200508.txt 631 | NYT/nyt_eng_200908.txt 632 | NYT/nyt_eng_200810.txt 633 | NYT/nyt_eng_199902.txt 634 | NYT/nyt_eng_199501.txt 635 | NYT/nyt_eng_199707.txt 636 | NYT/nyt_eng_200607.txt 637 | NYT/nyt_eng_200608.txt 638 | NYT/nyt_eng_199804.txt 639 | NYT/nyt_eng_200109.txt 640 | NYT/nyt_eng_199908.txt 641 | NYT/nyt_eng_200805.txt 642 | NYT/nyt_eng_200310.txt 643 | NYT/nyt_eng_200502.txt 644 | NYT/nyt_eng_199606.txt 645 | NYT/nyt_eng_200312.txt 646 | NYT/nyt_eng_200401.txt 647 | NYT/nyt_eng_199409.txt 648 | NYT/nyt_eng_199909.txt 649 | NYT/nyt_eng_200409.txt 650 | NYT/nyt_eng_199509.txt 651 | NYT/nyt_eng_199503.txt 652 | NYT/nyt_eng_199604.txt 653 | NYT/nyt_eng_200901.txt 654 | NYT/nyt_eng_199506.txt 655 | NYT/nyt_eng_200708.txt 656 | NYT/nyt_eng_200204.txt 657 | NYT/nyt_eng_200301.txt 658 | NYT/nyt_eng_200304.txt 659 | NYT/nyt_eng_200910.txt 660 | NYT/nyt_eng_200008.txt 661 | NYT/nyt_eng_199407.txt 662 | NYT/nyt_eng_199508.txt 663 | NYT/nyt_eng_199609.txt 664 | NYT/nyt_eng_199710.txt 665 | NYT/nyt_eng_200101.txt 666 | NYT/nyt_eng_199602.txt 667 | NYT/nyt_eng_200210.txt 668 | NYT/nyt_eng_200107.txt 669 | NYT/nyt_eng_200108.txt 670 | NYT/nyt_eng_200308.txt 671 | NYT/nyt_eng_200801.txt 672 | NYT/nyt_eng_199712.txt 673 | NYT/nyt_eng_200802.txt 674 | NYT/nyt_eng_200912.txt 675 | NYT/nyt_eng_200807.txt 676 | NYT/nyt_eng_200201.txt 677 | NYT/nyt_eng_200706.txt 678 | NYT/nyt_eng_200007.txt 679 | 
NYT/nyt_eng_200404.txt 680 | NYT/nyt_eng_199803.txt 681 | NYT/nyt_eng_200712.txt 682 | NYT/nyt_eng_199408.txt 683 | NYT/nyt_eng_200408.txt 684 | NYT/nyt_eng_199603.txt 685 | NYT/nyt_eng_200412.txt 686 | NYT/nyt_eng_200106.txt 687 | NYT/nyt_eng_200309.txt 688 | NYT/nyt_eng_201010.txt 689 | NYT/nyt_eng_200811.txt 690 | NYT/nyt_eng_200702.txt 691 | NYT/nyt_eng_200501.txt 692 | NYT/nyt_eng_200209.txt 693 | NYT/nyt_eng_200906.txt 694 | NYT/nyt_eng_200402.txt 695 | NYT/nyt_eng_200104.txt 696 | NYT/nyt_eng_199911.txt 697 | NYT/nyt_eng_200206.txt 698 | NYT/nyt_eng_199805.txt 699 | NYT/nyt_eng_200009.txt 700 | NYT/nyt_eng_200711.txt 701 | NYT/nyt_eng_200806.txt 702 | NYT/nyt_eng_200603.txt 703 | NYT/nyt_eng_201003.txt 704 | NYT/nyt_eng_200604.txt 705 | NYT/nyt_eng_200303.txt 706 | NYT/nyt_eng_200208.txt 707 | NYT/nyt_eng_199511.txt 708 | NYT/nyt_eng_200010.txt 709 | NYT/nyt_eng_199605.txt 710 | NYT/nyt_eng_200102.txt 711 | NYT/nyt_eng_199904.txt 712 | NYT/nyt_eng_199807.txt 713 | NYT/nyt_eng_200510.txt 714 | NYT/nyt_eng_199507.txt 715 | NYT/nyt_eng_200410.txt 716 | NYT/nyt_eng_199906.txt 717 | NYT/nyt_eng_199706.txt 718 | NYT/nyt_eng_200012.txt 719 | NYT/nyt_eng_200111.txt 720 | NYT/nyt_eng_201008.txt 721 | NYT/nyt_eng_200606.txt 722 | NYT/nyt_eng_200503.txt 723 | WPB/wpb_eng_201012.txt 724 | WPB/wpb_eng_201007.txt 725 | WPB/wpb_eng_201008.txt 726 | WPB/wpb_eng_201003.txt 727 | WPB/wpb_eng_201004.txt 728 | WPB/wpb_eng_201010.txt 729 | WPB/wpb_eng_201001.txt 730 | WPB/wpb_eng_201006.txt 731 | WPB/wpb_eng_201009.txt 732 | WPB/wpb_eng_201002.txt 733 | WPB/wpb_eng_201005.txt 734 | WPB/wpb_eng_201011.txt 735 | XIN/xin_eng_199708.txt 736 | XIN/xin_eng_200303.txt 737 | XIN/xin_eng_199701.txt 738 | XIN/xin_eng_200305.txt 739 | XIN/xin_eng_200208.txt 740 | XIN/xin_eng_200203.txt 741 | XIN/xin_eng_199807.txt 742 | XIN/xin_eng_199912.txt 743 | XIN/xin_eng_200302.txt 744 | XIN/xin_eng_201010.txt 745 | XIN/xin_eng_200612.txt 746 | XIN/xin_eng_199706.txt 747 | XIN/xin_eng_200104.txt 748 | XIN/xin_eng_200912.txt 749 | XIN/xin_eng_200412.txt 750 | XIN/xin_eng_201005.txt 751 | XIN/xin_eng_200507.txt 752 | XIN/xin_eng_199609.txt 753 | XIN/xin_eng_199910.txt 754 | XIN/xin_eng_200506.txt 755 | XIN/xin_eng_200404.txt 756 | XIN/xin_eng_200712.txt 757 | XIN/xin_eng_200401.txt 758 | XIN/xin_eng_200110.txt 759 | XIN/xin_eng_199502.txt 760 | XIN/xin_eng_200312.txt 761 | XIN/xin_eng_200005.txt 762 | XIN/xin_eng_200602.txt 763 | XIN/xin_eng_200002.txt 764 | XIN/xin_eng_199907.txt 765 | XIN/xin_eng_199608.txt 766 | XIN/xin_eng_199711.txt 767 | XIN/xin_eng_200207.txt 768 | XIN/xin_eng_201006.txt 769 | XIN/xin_eng_200710.txt 770 | XIN/xin_eng_199506.txt 771 | XIN/xin_eng_200201.txt 772 | XIN/xin_eng_200706.txt 773 | XIN/xin_eng_200909.txt 774 | XIN/xin_eng_199504.txt 775 | XIN/xin_eng_200705.txt 776 | XIN/xin_eng_200806.txt 777 | XIN/xin_eng_201003.txt 778 | XIN/xin_eng_200604.txt 779 | XIN/xin_eng_200109.txt 780 | XIN/xin_eng_199606.txt 781 | XIN/xin_eng_200410.txt 782 | XIN/xin_eng_200905.txt 783 | XIN/xin_eng_200101.txt 784 | XIN/xin_eng_199909.txt 785 | XIN/xin_eng_200105.txt 786 | XIN/xin_eng_200102.txt 787 | XIN/xin_eng_199503.txt 788 | XIN/xin_eng_200408.txt 789 | XIN/xin_eng_200107.txt 790 | XIN/xin_eng_200004.txt 791 | XIN/xin_eng_199604.txt 792 | XIN/xin_eng_199610.txt 793 | XIN/xin_eng_200606.txt 794 | XIN/xin_eng_200409.txt 795 | XIN/xin_eng_200403.txt 796 | XIN/xin_eng_200301.txt 797 | XIN/xin_eng_200608.txt 798 | XIN/xin_eng_200903.txt 799 | XIN/xin_eng_199801.txt 800 | XIN/xin_eng_199508.txt 801 | 
XIN/xin_eng_200502.txt 802 | XIN/xin_eng_200701.txt 803 | XIN/xin_eng_199705.txt 804 | XIN/xin_eng_199702.txt 805 | XIN/xin_eng_200111.txt 806 | XIN/xin_eng_201012.txt 807 | XIN/xin_eng_199808.txt 808 | XIN/xin_eng_199507.txt 809 | XIN/xin_eng_200509.txt 810 | XIN/xin_eng_199911.txt 811 | XIN/xin_eng_200802.txt 812 | XIN/xin_eng_200901.txt 813 | XIN/xin_eng_201009.txt 814 | XIN/xin_eng_199501.txt 815 | XIN/xin_eng_199805.txt 816 | XIN/xin_eng_200007.txt 817 | XIN/xin_eng_200309.txt 818 | XIN/xin_eng_199804.txt 819 | XIN/xin_eng_200209.txt 820 | XIN/xin_eng_200205.txt 821 | XIN/xin_eng_201001.txt 822 | XIN/xin_eng_201002.txt 823 | XIN/xin_eng_200103.txt 824 | XIN/xin_eng_199511.txt 825 | XIN/xin_eng_200210.txt 826 | XIN/xin_eng_200611.txt 827 | XIN/xin_eng_199601.txt 828 | XIN/xin_eng_199605.txt 829 | XIN/xin_eng_199602.txt 830 | XIN/xin_eng_201008.txt 831 | XIN/xin_eng_199607.txt 832 | XIN/xin_eng_199906.txt 833 | XIN/xin_eng_200508.txt 834 | XIN/xin_eng_199902.txt 835 | XIN/xin_eng_199806.txt 836 | XIN/xin_eng_200609.txt 837 | XIN/xin_eng_200009.txt 838 | XIN/xin_eng_200211.txt 839 | XIN/xin_eng_200603.txt 840 | XIN/xin_eng_199803.txt 841 | XIN/xin_eng_201004.txt 842 | XIN/xin_eng_200703.txt 843 | XIN/xin_eng_200704.txt 844 | XIN/xin_eng_200405.txt 845 | XIN/xin_eng_200010.txt 846 | XIN/xin_eng_200911.txt 847 | XIN/xin_eng_201011.txt 848 | XIN/xin_eng_199612.txt 849 | XIN/xin_eng_200501.txt 850 | XIN/xin_eng_199509.txt 851 | XIN/xin_eng_201007.txt 852 | XIN/xin_eng_200503.txt 853 | XIN/xin_eng_200003.txt 854 | XIN/xin_eng_200908.txt 855 | XIN/xin_eng_200601.txt 856 | XIN/xin_eng_200402.txt 857 | XIN/xin_eng_200012.txt 858 | XIN/xin_eng_200808.txt 859 | XIN/xin_eng_199707.txt 860 | XIN/xin_eng_199903.txt 861 | XIN/xin_eng_200803.txt 862 | XIN/xin_eng_200512.txt 863 | XIN/xin_eng_200904.txt 864 | XIN/xin_eng_200008.txt 865 | XIN/xin_eng_199505.txt 866 | XIN/xin_eng_200805.txt 867 | XIN/xin_eng_200307.txt 868 | XIN/xin_eng_199603.txt 869 | XIN/xin_eng_200001.txt 870 | XIN/xin_eng_200907.txt 871 | XIN/xin_eng_200311.txt 872 | XIN/xin_eng_200510.txt 873 | XIN/xin_eng_200906.txt 874 | XIN/xin_eng_200006.txt 875 | XIN/xin_eng_199905.txt 876 | XIN/xin_eng_199809.txt 877 | XIN/xin_eng_199512.txt 878 | XIN/xin_eng_199709.txt 879 | XIN/xin_eng_200809.txt 880 | XIN/xin_eng_200304.txt 881 | XIN/xin_eng_200308.txt 882 | XIN/xin_eng_200812.txt 883 | XIN/xin_eng_200504.txt 884 | XIN/xin_eng_200707.txt 885 | XIN/xin_eng_200810.txt 886 | XIN/xin_eng_200202.txt 887 | XIN/xin_eng_199710.txt 888 | XIN/xin_eng_200607.txt 889 | XIN/xin_eng_200605.txt 890 | XIN/xin_eng_200811.txt 891 | XIN/xin_eng_200108.txt 892 | XIN/xin_eng_200011.txt 893 | XIN/xin_eng_200708.txt 894 | XIN/xin_eng_199703.txt 895 | XIN/xin_eng_200801.txt 896 | XIN/xin_eng_200505.txt 897 | XIN/xin_eng_200709.txt 898 | XIN/xin_eng_199712.txt 899 | XIN/xin_eng_200807.txt 900 | XIN/xin_eng_200206.txt 901 | XIN/xin_eng_200204.txt 902 | XIN/xin_eng_200610.txt 903 | XIN/xin_eng_200910.txt 904 | XIN/xin_eng_199611.txt 905 | -------------------------------------------------------------------------------- /dataset/valid.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_200601.txt 2 | AFP/afp_eng_199702.txt 3 | AFP/afp_eng_200506.txt 4 | AFP/afp_eng_200308.txt 5 | AFP/afp_eng_200703.txt 6 | AFP/afp_eng_199502.txt 7 | AFP/afp_eng_199406.txt 8 | APW/apw_eng_199606.txt 9 | APW/apw_eng_199807.txt 10 | APW/apw_eng_200204.txt 11 | APW/apw_eng_199708.txt 12 | APW/apw_eng_200907.txt 13 | 
APW/apw_eng_200309.txt 14 | APW/apw_eng_200205.txt 15 | APW/apw_eng_200210.txt 16 | APW/apw_eng_200608.txt 17 | CNA/cna_eng_200710.txt 18 | CNA/cna_eng_200912.txt 19 | CNA/cna_eng_200102.txt 20 | CNA/cna_eng_200803.txt 21 | CNA/cna_eng_200504.txt 22 | CNA/cna_eng_200003.txt 23 | CNA/cna_eng_201005.txt 24 | LTW/ltw_eng_200310.txt 25 | LTW/ltw_eng_200512.txt 26 | LTW/ltw_eng_199503.txt 27 | LTW/ltw_eng_200407.txt 28 | LTW/ltw_eng_199803.txt 29 | LTW/ltw_eng_199509.txt 30 | NYT/nyt_eng_200205.txt 31 | NYT/nyt_eng_200203.txt 32 | NYT/nyt_eng_199512.txt 33 | NYT/nyt_eng_200307.txt 34 | NYT/nyt_eng_199709.txt 35 | NYT/nyt_eng_199704.txt 36 | NYT/nyt_eng_201011.txt 37 | NYT/nyt_eng_200003.txt 38 | NYT/nyt_eng_200306.txt 39 | XIN/xin_eng_199901.txt 40 | XIN/xin_eng_200702.txt 41 | XIN/xin_eng_200407.txt 42 | XIN/xin_eng_199904.txt 43 | XIN/xin_eng_200406.txt 44 | XIN/xin_eng_200306.txt 45 | XIN/xin_eng_199510.txt 46 | XIN/xin_eng_199908.txt 47 | XIN/xin_eng_200212.txt 48 | -------------------------------------------------------------------------------- /prep_torch_data.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | COUNT=5 4 | WINDOW=5 5 | 6 | DATA_DIR=$1 7 | OUT_DIR=$1/processed 8 | SCRIPTS=$ABS/summary 9 | 10 | export LUA_PATH="$LUA_PATH;$ABS/?.lua" 11 | 12 | mkdir -p $OUT_DIR 13 | 14 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.article.dict -outf $OUT_DIR/train.article.dict.torch 15 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.title.dict -outf $OUT_DIR/train.title.dict.torch 16 | 17 | echo "-- Creating data directories." 18 | mkdir -p $OUT_DIR/train/title 19 | mkdir -p $OUT_DIR/train/article 20 | 21 | mkdir -p $OUT_DIR/valid.filter/title 22 | mkdir -p $OUT_DIR/valid.filter/article 23 | 24 | cp $OUT_DIR/train.title.dict.torch $OUT_DIR/train/title/dict 25 | cp $OUT_DIR/train.article.dict.torch $OUT_DIR/train/article/dict 26 | 27 | 28 | echo "-- Build the matrices" 29 | 30 | # Share the dictionary. 31 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/valid.title.filter.txt -outTitleDirectory $OUT_DIR/valid.filter/title/ -inArticleFile $DATA_DIR/valid.article.filter.txt -outArticleDirectory $OUT_DIR/valid.filter/article/ -window $WINDOW 32 | 33 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/train.title.txt -outTitleDirectory $OUT_DIR/train/title/ -inArticleFile $DATA_DIR/train.article.txt -outArticleDirectory $OUT_DIR/train/article/ -window $WINDOW 34 | -------------------------------------------------------------------------------- /summary/beam_search.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- A beam search decoder 14 | local data = require('summary.data') 15 | local features = require('summary.features') 16 | local util = require('summary.util') 17 | 18 | local beam = {} 19 | local INF = 1e9 20 | 21 | function beam.addOpts(cmd) 22 | cmd:option('-allowUNK', false, "Allow generating .") 23 | cmd:option('-fixedLength', true, "Produce exactly -length words.") 24 | cmd:option('-blockRepeatWords', false, "Disallow generating a word twice.") 25 | cmd:option('-lmWeight', 1.0, "Weight for main model.") 26 | cmd:option('-beamSize', 100, "Size of the beam.") 27 | cmd:option('-extractive', false, "Force fully extractive summary.") 28 | cmd:option('-abstractive', false, "Force fully abstractive summary.") 29 | cmd:option('-recombine', false, "Used hypothesis recombination.") 30 | features.addOpts(cmd) 31 | end 32 | 33 | function beam.init(opt, mlp, aux_model, article_to_title, dict) 34 | local new_beam = {} 35 | setmetatable(new_beam, { __index = beam }) 36 | new_beam.opt = opt 37 | new_beam.K = opt.beamSize 38 | new_beam.mlp = mlp 39 | new_beam.aux_model = aux_model 40 | new_beam.article_to_title = article_to_title 41 | new_beam.dict = dict 42 | 43 | -- Special Symbols. 44 | new_beam.UNK = dict.symbol_to_index[""] 45 | new_beam.START = dict.symbol_to_index[""] 46 | new_beam.END = dict.symbol_to_index[""] 47 | 48 | return new_beam 49 | end 50 | 51 | -- Helper: convert flat index to matrix. 52 | local function flat_to_rc(v, indices, flat_index) 53 | local row = math.floor((flat_index - 1) / v:size(2)) + 1 54 | return row, indices[row][(flat_index - 1) % v:size(2) + 1] 55 | end 56 | 57 | -- Helper: find kmax of vector. 58 | local function find_k_max(pool, mat) 59 | local v = pool:forward(mat:t()):t() 60 | local orig_indices = pool.indices:t():add(1) 61 | return v:contiguous(), orig_indices 62 | end 63 | 64 | -- Use beam search to generate a summary of 65 | -- the article of length <= len. 66 | function beam:generate(article, len) 67 | local n = len 68 | local K = self.K 69 | local W = self.opt.window 70 | 71 | -- Initialize the extractive features. 72 | local feat_gen = features.init(self.opt, self.article_to_title) 73 | feat_gen:match_words(self.START, article) 74 | local F = feat_gen.num_features 75 | local FINAL_VAL = 1000 76 | 77 | -- Initilize the charts. 78 | -- scores[i][k] is the log prob of the k'th hyp of i words. 79 | -- hyps[i][k] contains the words in k'th hyp at 80 | -- i word (left padded with W ) tokens. 81 | -- feats[i][k][f] contains the feature count of 82 | -- the f features for the k'th hyp at word i. 83 | local result = {} 84 | local scores = torch.zeros(n+1, K):float() 85 | local hyps = torch.zeros(n+1, K, W+n+1):long() 86 | local feats = torch.zeros(n+1, K, F):float() 87 | hyps:fill(self.START) 88 | 89 | -- Initilialize used word set. 90 | -- words_used[i][k] is a set of the words used in the i,k hyp. 91 | local words_used = {} 92 | if self.opt.blockRepeatWords then 93 | for i = 1, n + 1 do 94 | words_used[i] = {} 95 | for k = 1, K do 96 | words_used[i][k] = {} 97 | end 98 | end 99 | end 100 | 101 | -- Find k-max columns of a matrix. 102 | -- Use 2*k in case some are invalid. 103 | local pool = nn.TemporalKMaxPooling(2*K) 104 | 105 | -- Main loop of beam search. 106 | for i = 1, n do 107 | local cur_beam = hyps[i]:narrow(2, i+1, W) 108 | local cur_K = K 109 | 110 | -- (1) Score all next words for each context in the beam. 
111 | -- log p(y_{i+1} | y_c, x) for all y_c 112 | local input = data.make_input(article, cur_beam, cur_K) 113 | local model_scores = self.mlp:forward(input) 114 | 115 | local out = model_scores:clone():double() 116 | out:mul(self.opt.lmWeight) 117 | 118 | -- If length limit is reached, next word must be end. 119 | local finalized = (i == n) and self.opt.fixedLength 120 | if finalized then 121 | out[{{}, self.END}]:add(FINAL_VAL) 122 | else 123 | -- Apply hard constraints. 124 | out[{{}, self.START}] = -INF 125 | if not self.opt.allowUNK then 126 | out[{{}, self.UNK}] = -INF 127 | end 128 | if self.opt.fixedLength then 129 | out[{{}, self.END}] = -INF 130 | end 131 | 132 | -- Add additional extractive features. 133 | feat_gen:add_features(out, cur_beam) 134 | end 135 | 136 | -- Only take first row when starting out. 137 | if i == 1 then 138 | cur_K = 1 139 | out = out:narrow(1, 1, 1) 140 | model_scores = model_scores:narrow(1, 1, 1) 141 | end 142 | 143 | -- Prob of summary is log p + log p(y_{i+1} | y_c, x) 144 | for k = 1, cur_K do 145 | out[k]:add(scores[i][k]) 146 | end 147 | 148 | -- (2) Retain the K-best words for each hypothesis using GPU. 149 | -- This leaves a KxK matrix which we flatten to a K^2 vector. 150 | local max_scores, mat_indices = find_k_max(pool, out:cuda()) 151 | local flat = max_scores:view(max_scores:size(1) 152 | * max_scores:size(2)):float() 153 | 154 | -- 3) Construct the next hypotheses by taking the next k-best. 155 | local seen_ngram = {} 156 | for k = 1, K do 157 | for _ = 1, 100 do 158 | 159 | -- (3a) Pull the score, index, rank, and word of the 160 | -- current best in the table, and then zero it out. 161 | local score, index = flat:max(1) 162 | if finalized then 163 | score[1] = score[1] - FINAL_VAL 164 | end 165 | scores[i+1][k] = score[1] 166 | local prev_k, y_i1 = flat_to_rc(max_scores, mat_indices, index[1]) 167 | flat[index[1]] = -INF 168 | 169 | -- (3b) Is this a valid next word? 170 | local blocked = (self.opt.blockRepeatWords and 171 | words_used[i][prev_k][y_i1]) 172 | 173 | blocked = blocked or 174 | (self.opt.extractive and not feat_gen:has_ngram({y_i1})) 175 | blocked = blocked or 176 | (self.opt.abstractive and feat_gen:has_ngram({y_i1})) 177 | 178 | -- Hypothesis recombination. 179 | local new_context = {} 180 | if self.opt.recombine then 181 | for j = i+2, i+W do 182 | table.insert(new_context, hyps[i][prev_k][j]) 183 | end 184 | table.insert(new_context, y_i1) 185 | blocked = blocked or util.has(seen_ngram, new_context) 186 | end 187 | 188 | -- (3c) Add the word, its score, and its features to the 189 | -- beam. 190 | if not blocked then 191 | -- Update tables with new hypothesis. 192 | for j = 1, i+W do 193 | local pword = hyps[i][prev_k][j] 194 | hyps[i+1][k][j] = pword 195 | words_used[i+1][k][pword] = true 196 | end 197 | hyps[i+1][k][i+W+1] = y_i1 198 | words_used[i+1][k][y_i1] = true 199 | 200 | -- Keep track of hypotheses seen. 201 | if self.opt.recombine then 202 | util.add(seen_ngram, new_context) 203 | end 204 | 205 | -- Keep track of features used (For MERT) 206 | feats[i+1][k]:copy(feats[i][prev_k]) 207 | feat_gen:compute(feats[i+1][k], hyps[i+1][k], 208 | model_scores[prev_k][y_i1], y_i1, i) 209 | 210 | -- If we have produced an END symbol, push to stack. 211 | if y_i1 == self.END then 212 | table.insert(result, {i+1, scores[i+1][k], 213 | hyps[i+1][k]:clone(), 214 | feats[i+1][k]:clone()}) 215 | scores[i+1][k] = -INF 216 | end 217 | break 218 | end 219 | end 220 | end 221 | end 222 | 223 | -- Sort by score. 
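-- Each entry of `result` is {length, log-score, hypothesis tensor, feature
-- counts}; the sort below simply keeps the highest-scoring finished
-- summaries first. A minimal usage sketch (the local names `sbeam`,
-- `article` and the length 15 are illustrative, not part of this file):
--
--   local sbeam = beam.init(opt, mlp, aux_model, article_to_title, dict)
--   local results = sbeam:generate(article, 15)
--   local best_len, best_score, best_hyp = unpack(results[1])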
224 | table.sort(result, function (a, b) return a[2] > b[2] end) 225 | 226 | -- Return the scores and hypotheses at the final stage. 227 | return result 228 | end 229 | 230 | 231 | return beam 232 | -------------------------------------------------------------------------------- /summary/build.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Script to build the dataset 14 | require('torch') 15 | local utils = require('summary/util') 16 | 17 | torch.setdefaulttensortype('torch.LongTensor') 18 | 19 | cmd = torch.CmdLine() 20 | cmd:text() 21 | cmd:text() 22 | cmd:text('Build torch serialized version of a summarization problem.') 23 | cmd:text() 24 | 25 | cmd:option('-window', 5, 'The ngram window to use.') 26 | 27 | cmd:option('-inTitleFile', '', 'The input file.') 28 | cmd:option('-inTitleDictionary', '', 'The input dictionary.') 29 | cmd:option('-outTitleDirectory', '', 'The output directory.') 30 | cmd:option('-inArticleFile', '', 'The input file.') 31 | cmd:option('-inArticleDictionary', '', 'The input dictionary.') 32 | cmd:option('-outArticleDirectory', '', 'The output directory.') 33 | 34 | opt = cmd:parse(arg) 35 | 36 | local function count(file, aligned_lengths, pad) 37 | -- Count up properties of the input file. 38 | local f = io.open(file, 'r') 39 | local counter = { 40 | nsents = 0, 41 | max_length = 0, 42 | aligned_lengths = {}, 43 | line_lengths = {}, 44 | bucket_words = {}} 45 | local nline = 1 46 | for l in f:lines() do 47 | local true_l = l 48 | if pad then 49 | true_l = " " .. l .. " " 50 | end 51 | local line = utils.string_split(true_l, " ") 52 | counter.line_lengths[#line] = (counter.line_lengths[#line] or 0) + 1 53 | counter.nsents = counter.nsents + 1 54 | counter.aligned_lengths[nline] = #line 55 | if aligned_lengths ~= nil then 56 | -- Add extra for implicit . 57 | counter.bucket_words[aligned_lengths[nline]] = 58 | (counter.bucket_words[aligned_lengths[nline]] or 0) 59 | + #line + 1 60 | end 61 | nline = nline + 1 62 | end 63 | return counter 64 | end 65 | 66 | 67 | local function build_article_matrices(dict, file, nsents, line_lengths) 68 | -- For each length bucket, construct a #sentence x length matrix 69 | -- of word forms. 70 | local f = io.open(file, 'r') 71 | 72 | -- One matrix for each length. 73 | local mat = {} 74 | 75 | -- Number of sentences seen of this length. 76 | local of_length = {} 77 | 78 | for length, count in pairs(line_lengths) do 79 | mat[length] = torch.zeros(count, length):long() 80 | of_length[length] = 1 81 | end 82 | 83 | -- For each sentence. 84 | -- Col 1 is its length bin. 85 | -- Col 2 is its position in bin. 86 | local pos = torch.zeros(nsents, 2):long() 87 | 88 | local nsent = 1 89 | for l in f:lines() do 90 | local true_l = " " .. l .. 
" " 91 | local line = utils.string_split(true_l, " ") 92 | local length = #line 93 | local nbin = of_length[length] 94 | for j = 1, #line do 95 | local index = dict.symbol_to_index[line[j]] or 1 96 | --assert(index ~= nil) 97 | mat[length][nbin][j] = index 98 | end 99 | pos[nsent][1] = length 100 | pos[nsent][2] = nbin 101 | of_length[length] = nbin + 1 102 | nsent = nsent + 1 103 | end 104 | return mat, pos 105 | end 106 | 107 | 108 | local function build_title_matrices(dict, file, aligned_lengths, 109 | bucket_sizes, window) 110 | -- For each article length bucket construct a num-words x 1 flat vector 111 | -- of word forms and a corresponding num-words x window matrix of 112 | -- context forms. 113 | local nsent = 1 114 | local pos = {} 115 | 116 | -- One matrix for each length. 117 | local mat = {} 118 | local ngram = {} 119 | 120 | -- Number of sentences seen of this length. 121 | local sent_of_length = {} 122 | local words_of_length = {} 123 | 124 | -- Initialize. 125 | for length, count in pairs(bucket_sizes) do 126 | mat[length] = torch.zeros(count, 3):long() 127 | sent_of_length[length] = 1 128 | words_of_length[length] = 1 129 | ngram[length] = torch.zeros(count, window):long() 130 | end 131 | 132 | -- Columns are the preceding window. 133 | local nline = 1 134 | local f = io.open(file, 'r') 135 | for l in f:lines() do 136 | -- Add implicit . 137 | local true_l = l .. " " 138 | local line = utils.string_split(true_l, " ") 139 | 140 | local last = {} 141 | -- Initialize window as START symbol. 142 | for w = 1, window do 143 | table.insert(last, dict.symbol_to_index[""]) 144 | end 145 | 146 | local aligned_length = aligned_lengths[nline] 147 | for j = 1, #line do 148 | local nword = words_of_length[aligned_length] 149 | local index = dict.symbol_to_index[line[j]] or 1 150 | 151 | mat[aligned_length][nword][1] = index 152 | mat[aligned_length][nword][2] = sent_of_length[aligned_length] 153 | mat[aligned_length][nword][3] = j 154 | 155 | -- Move the window forward. 156 | for w = 1, window-1 do 157 | ngram[aligned_length][nword][w] = last[w] 158 | last[w] = last[w+1] 159 | end 160 | ngram[aligned_length][nword][window] = last[window] 161 | last[window] = index 162 | words_of_length[aligned_length] = words_of_length[aligned_length] + 1 163 | end 164 | sent_of_length[aligned_length] = sent_of_length[aligned_length] + 1 165 | nsent = nsent + 1 166 | 167 | -- Debug logging. 168 | if nsent % 100000 == 1 then 169 | print(nsent) 170 | end 171 | nline = nline + 1 172 | end 173 | return mat, pos, ngram 174 | end 175 | 176 | local function main() 177 | local counter = count(opt.inArticleFile, nil, true) 178 | local dict = torch.load(opt.inArticleDictionary) 179 | 180 | -- Construct a rectangular word matrix. 181 | local word_mat, offset_mat = 182 | build_article_matrices(dict, opt.inArticleFile, 183 | counter.nsents, counter.line_lengths) 184 | torch.save(opt.outArticleDirectory .. '/word.mat.torch', word_mat) 185 | torch.save(opt.outArticleDirectory .. '/offset.mat.torch', offset_mat) 186 | 187 | local title_counter = count(opt.inTitleFile, counter.aligned_lengths, false) 188 | local title_dict = torch.load(opt.inTitleDictionary) 189 | 190 | -- Construct a 1d word matrix. 191 | local word_mat, offset_mat, ngram_mat = 192 | build_title_matrices(title_dict, 193 | opt.inTitleFile, 194 | counter.aligned_lengths, 195 | title_counter.bucket_words, 196 | opt.window) 197 | torch.save(opt.outTitleDirectory .. '/word.mat.torch', word_mat) 198 | torch.save(opt.outTitleDirectory .. 
'/offset.mat.torch', offset_mat) 199 | torch.save(opt.outTitleDirectory .. '/ngram.mat.torch', ngram_mat) 200 | end 201 | 202 | main() 203 | -------------------------------------------------------------------------------- /summary/build_dict.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Script to build the dictionary 14 | local utils = require('summary/util') 15 | 16 | cmd = torch.CmdLine() 17 | cmd:text() 18 | cmd:text() 19 | cmd:text('Build torch serialized version of a dictionary file.') 20 | cmd:text() 21 | cmd:text('Options') 22 | cmd:option('-inf', '', 'The input dictionary.') 23 | cmd:option('-outf', '', 'The output directory.') 24 | cmd:text() 25 | 26 | opt = cmd:parse(arg) 27 | 28 | local f = io.open(opt.inf, 'r') 29 | local word_id = 0 30 | local dict = {symbol_to_index = {}, 31 | index_to_symbol = {}} 32 | for l in f:lines() do 33 | word_id = word_id + 1 34 | local word = utils.string_split(l)[1] 35 | dict.symbol_to_index[word] = word_id 36 | dict.index_to_symbol[word_id] = word 37 | end 38 | torch.save(opt.outf, dict) 39 | -------------------------------------------------------------------------------- /summary/data.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Load data for summary experiments. 
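-- Title and article matrices are stored in per-length buckets; data.init
-- pairs the two sides and next_batch serves one bucket at a time. A hedged
-- loading sketch (directory names are placeholders, and a CUDA-enabled Torch
-- is assumed, as in the rest of this file):
--
--   local data     = require('summary.data')
--   local titles   = data.load_title('processed/train/title/', true)
--   local articles = data.load_article('processed/train/article/')
--   local train    = data.init(titles, articles)
--   while not train:is_done() do
--      local input, target = train:next_batch(64)
--      -- input = {article words, positions, title context}; target = next word
--   end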
14 | local util = require('summary/util') 15 | 16 | local data = {} 17 | 18 | function data.add_opts(cmd) 19 | cmd:option('-articleDir', '', 20 | 'Directory containing article training matrices.') 21 | cmd:option('-titleDir', '', 22 | 'Directory containing title training matrices.') 23 | cmd:option('-validArticleDir', '', 24 | 'Directory containing article matricess for validation.') 25 | cmd:option('-validTitleDir', '', 26 | 'Directory containing title matrices for validation.') 27 | end 28 | 29 | function data.load(article_dir, title_dir) 30 | return data.init() 31 | end 32 | 33 | function data.init(title_data, article_data) 34 | local new_data = {} 35 | setmetatable(new_data, { __index = data }) 36 | new_data.title_data = title_data 37 | new_data.article_data = article_data 38 | new_data:reset() 39 | return new_data 40 | end 41 | 42 | function data:reset() 43 | self.bucket_order = {} 44 | for length, _ in pairs(self.title_data.target) do 45 | table.insert(self.bucket_order, length) 46 | end 47 | util.shuffleTable(self.bucket_order) 48 | self.bucket_index = 0 49 | self:load_next_bucket() 50 | end 51 | 52 | function data:load_next_bucket() 53 | self.done_bucket = false 54 | self.bucket_index = self.bucket_index + 1 55 | self.bucket = self.bucket_order[self.bucket_index] 56 | self.bucket_size = self.title_data.target[self.bucket]:size(1) 57 | self.pos = 1 58 | self.aux_ptrs = self.title_data.sentences[self.bucket]:float():long() 59 | self.positions = torch.range(1, self.bucket):view(1, self.bucket) 60 | :expand(1000, self.bucket):contiguous():cuda() + (200 * self.bucket) 61 | end 62 | 63 | function data:is_done() 64 | return self.bucket_index >= #self.bucket_order - 1 and 65 | self.done_bucket 66 | end 67 | 68 | function data:next_batch(max_size) 69 | local diff = self.bucket_size - self.pos 70 | if self.done_bucket or diff == 0 or diff == 1 then 71 | self:load_next_bucket() 72 | end 73 | local offset 74 | if self.pos + max_size > self.bucket_size then 75 | offset = self.bucket_size - self.pos 76 | self.done_bucket = true 77 | else 78 | offset = max_size 79 | end 80 | local positions = self.positions:narrow(1, 1, offset) 81 | 82 | local aux_rows = self.article_data.words[self.bucket]: 83 | index(1, self.aux_ptrs:narrow(1, self.pos, offset)) 84 | local context = self.title_data.ngram[self.bucket] 85 | :narrow(1, self.pos, offset) 86 | local target = self.title_data.target[self.bucket] 87 | :narrow(1, self.pos, offset) 88 | self.pos = self.pos + offset 89 | return {aux_rows, positions, context}, target 90 | end 91 | 92 | function data.make_input(article, context, K) 93 | local bucket = article:size(1) 94 | local aux_sentence = article:view(bucket, 1) 95 | :expand(article:size(1), K):t():contiguous():cuda() 96 | local positions = torch.range(1, bucket):view(bucket, 1) 97 | :expand(bucket, K):t():contiguous():cuda() + (200 * bucket) 98 | return {aux_sentence, positions, context} 99 | end 100 | 101 | function data.load_title_dict(dname) 102 | return torch.load(dname .. 'dict') 103 | end 104 | 105 | function data.load_title(dname, shuffle, use_dict) 106 | local ngram = torch.load(dname .. 'ngram.mat.torch') 107 | local words = torch.load(dname .. 'word.mat.torch') 108 | local dict = use_dict or torch.load(dname .. 
'dict') 109 | local target_full = {} 110 | local sentences_full = {} 111 | local pos_full = {} 112 | for length, mat in pairs(ngram) do 113 | if shuffle ~= nil then 114 | local perm = torch.randperm(ngram[length]:size(1)):long() 115 | ngram[length] = ngram[length]:index(1, perm):float():cuda() 116 | words[length] = words[length]:index(1, perm) 117 | else 118 | ngram[length] = ngram[length]:float():cuda() 119 | end 120 | assert(ngram[length]:size(1) == words[length]:size(1)) 121 | target_full[length] = words[length][{{}, 1}]:contiguous():float():cuda() 122 | sentences_full[length] = 123 | words[length][{{}, 2}]:contiguous():float():cuda() 124 | pos_full[length] = words[length][{{}, 3}] 125 | 126 | end 127 | local title_data = {ngram = ngram, 128 | target = target_full, 129 | sentences = sentences_full, 130 | pos = pos_full, 131 | dict = dict} 132 | return title_data 133 | end 134 | 135 | function data.load_article(dname, use_dict) 136 | local input_words = torch.load(dname .. 'word.mat.torch') 137 | -- local offsets = torch.load(dname .. 'offset.mat.torch') 138 | 139 | local dict = use_dict or torch.load(dname .. 'dict') 140 | for length, mat in pairs(input_words) do 141 | input_words[length] = mat 142 | input_words[length] = input_words[length]:float():cuda() 143 | end 144 | local article_data = {words = input_words, dict = dict} 145 | return article_data 146 | end 147 | 148 | return data 149 | -------------------------------------------------------------------------------- /summary/encoder.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- require('fbcunn') 14 | 15 | local encoder = {} 16 | 17 | function encoder.add_opts(cmd) 18 | cmd:option('-encoderModel', 'bow', "The encoder model to use.") 19 | cmd:option('-bowDim', 50, "Article embedding size.") 20 | cmd:option('-attenPool', 5, "Attention model pooling size.") 21 | cmd:option('-hiddenUnits', 1000, "Conv net encoder hidden units.") 22 | cmd:option('-kernelWidth', 5, "Conv net encoder kernel width.") 23 | end 24 | 25 | 26 | function encoder.build(opt, data) 27 | torch.setdefaulttensortype("torch.CudaTensor") 28 | local model = nil 29 | if opt.encoderModel == "none" then 30 | model = encoder.build_blank_model(opt, data) 31 | elseif opt.encoderModel == "bow" then 32 | model = encoder.build_bow_model(opt, data) 33 | elseif opt.encoderModel == "attenbow" then 34 | model = encoder.build_attnbow_model(opt, data) 35 | elseif opt.encoderModel == "conv" then 36 | model = encoder.build_conv_model(opt, data) 37 | end 38 | torch.setdefaulttensortype("torch.DoubleTensor") 39 | return model 40 | end 41 | 42 | 43 | function encoder.build_blank_model(opt, data) 44 | -- Ignores the article layer entirely (acts like LM). 
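-- The encoder output is multiplied by zero below, so with `-encoderModel none`
-- the conditional model reduces to a plain feed-forward language model over
-- the title context alone.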
45 | local lookup = nn.Identity()() 46 | local ignore1 = nn.Identity()() 47 | local ignore2 = nn.Identity()() 48 | local start = nn.SelectTable(3)({lookup, ignore1, ignore2}) 49 | 50 | local mout = nn.MulConstant(0)(start) 51 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout}) 52 | encoder_mlp:cuda() 53 | return encoder_mlp 54 | end 55 | 56 | 57 | function encoder.build_bow_model(opt, data) 58 | print("Encoder model: Bag-of-Words") 59 | 60 | -- BOW with mean on article. 61 | local lookup = nn.LookupTable( 62 | #data.article_data.dict.index_to_symbol, 63 | opt.bowDim)() 64 | 65 | -- Ignore the context. 66 | local ignore1 = nn.Identity()() 67 | local ignore2 = nn.Identity()() 68 | 69 | -- Ignores the context and position input. 70 | local start = nn.SelectTable(1)({lookup, ignore1, ignore2}) 71 | local mout = nn.Linear(opt.bowDim, opt.bowDim)( 72 | nn.Mean(3)(nn.Transpose({2, 3})(start))) 73 | 74 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout}) 75 | encoder_mlp:cuda() 76 | 77 | return encoder_mlp 78 | end 79 | 80 | 81 | function encoder.build_conv_model(opt, data) 82 | -- Three layer thin convolutional architecture. 83 | print("Encoder model: Conv") 84 | local V2 = #data.article_data.dict.index_to_symbol 85 | local nhid = opt.hiddenUnits 86 | 87 | -- Article embedding. 88 | local article_lookup = nn.LookupTable(V2, nhid)() 89 | 90 | -- Ignore the context. 91 | local ignore1 = nn.Identity()() 92 | local ignore2 = nn.Identity()() 93 | local start = nn.SelectTable(1)({article_lookup, ignore1, ignore2}) 94 | local kwidth = opt.kernelWidth 95 | local model = nn.Sequential() 96 | model:add(nn.View(1, -1, nhid):setNumInputDims(2)) 97 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 98 | model:add(cudnn.SpatialMaxPooling(1, 2, 1, 2)) 99 | model:add(nn.Threshold()) 100 | model:add(nn.Transpose({2,4})) 101 | 102 | -- layer 2 103 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 104 | model:add(nn.Threshold()) 105 | model:add(nn.Transpose({2,4})) 106 | 107 | -- layer 3 108 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 109 | model:add(nn.View(nhid, -1):setNumInputDims(3)) 110 | model:add(nn.Max(3)) 111 | local done = nn.View(opt.hiddenUnits)(model(start)) 112 | 113 | local mout = nn.Linear(opt.hiddenUnits, opt.embeddingDim)(done) 114 | 115 | local encoder_mlp = nn.gModule({article_lookup, ignore1, ignore2}, {mout}) 116 | encoder_mlp.lookup = article_lookup.data.module 117 | encoder_mlp:cuda() 118 | return encoder_mlp 119 | end 120 | 121 | 122 | function encoder.build_attnbow_model(opt, data) 123 | print("Encoder model: BoW + Attention") 124 | 125 | local D2 = opt.bowDim 126 | local N = opt.window 127 | local V = #data.title_data.dict.index_to_symbol 128 | local V2 = #data.article_data.dict.index_to_symbol 129 | 130 | -- Article Embedding. 131 | local article_lookup = nn.LookupTable(V2, D2)() 132 | 133 | -- Title Embedding. 134 | local title_lookup = nn.LookupTable(V, D2)() 135 | 136 | -- Size Lookup 137 | local size_lookup = nn.Identity()() 138 | 139 | -- Ignore size lookup to make NNGraph happy. 140 | local article_context = nn.SelectTable(1)({article_lookup, size_lookup}) 141 | 142 | -- Pool article 143 | local pad = (opt.attenPool - 1) / 2 144 | local article_match = article_context 145 | 146 | -- Title context embedding. 147 | local title_context = nn.View(D2, 1)( 148 | nn.Linear(N * D2, D2)(nn.View(N * D2)(title_lookup))) 149 | 150 | -- Attention layer. Distribution over article. 
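-- Informally: every article position j is scored by the dot product of its
-- embedding with a learned projection of the N-word title context; a softmax
-- over these scores gives the attention distribution p, and the encoder
-- output is the p-weighted sum of article embeddings locally smoothed over a
-- window of `attenPool` words, followed by a final linear map.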
151 | local dot_article_context = nn.MM()({article_match, 152 | title_context}) 153 | 154 | -- Compute the attention distribution. 155 | local non_linearity = nn.SoftMax() 156 | local attention = non_linearity(nn.Sum(3)(dot_article_context)) 157 | 158 | local process_article = 159 | nn.Sum(2)(nn.SpatialSubSampling(1, 1, opt.attenPool)( 160 | nn.SpatialZeroPadding(0, 0, pad, pad)( 161 | nn.View(1, -1, D2):setNumInputDims(2)(article_context)))) 162 | 163 | -- Apply attention to the subsampled article. 164 | local mout = nn.Linear(D2, D2)( 165 | nn.Sum(3)(nn.MM(true, false)( 166 | {process_article, 167 | nn.View(-1, 1):setNumInputDims(1)(attention)}))) 168 | 169 | -- Apply attention 170 | local encoder_mlp = nn.gModule({article_lookup, size_lookup, title_lookup}, 171 | {mout}) 172 | 173 | encoder_mlp:cuda() 174 | encoder_mlp.lookup = article_lookup.data.module 175 | encoder_mlp.title_lookup = title_lookup.data.module 176 | return encoder_mlp 177 | end 178 | 179 | return encoder 180 | -------------------------------------------------------------------------------- /summary/features.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | local util = require('summary.util') 14 | 15 | local features = {} 16 | 17 | function features.addOpts(cmd) 18 | cmd:option('-lmWeight', 1.0, "Feature weight for the neural model.") 19 | cmd:option('-unigramBonus', 0.0, "Feature weight for unigram extraction.") 20 | cmd:option('-bigramBonus', 0.0, "Feature weight for bigram extraction.") 21 | cmd:option('-trigramBonus', 0.0, "Feature weight for trigram extraction.") 22 | cmd:option('-lengthBonus', 0.0, "Feature weight for length.") 23 | cmd:option('-unorderBonus', 0.0, "Feature weight for out-of-order.") 24 | end 25 | 26 | -- Feature positions. 27 | local NNLM = 1 28 | local UNI = 2 29 | local BI = 3 30 | local TRI = 4 31 | local OO = 5 32 | local LEN = 6 33 | 34 | local kFeat = 6 35 | 36 | function features.init(opt, article_to_title) 37 | local new_features = {} 38 | setmetatable(new_features, { __index = features }) 39 | new_features.opt = opt 40 | new_features.num_features = kFeat 41 | new_features.article_to_title = article_to_title 42 | return new_features 43 | end 44 | 45 | -- Helper: Are words in article. 46 | function features:has_ngram(words) 47 | return util.has(self.ngrams[#words], words) 48 | end 49 | 50 | -- Augment the feature count based on the new word. 51 | function features:compute(f_new, hyp, out_score, y_i1, i) 52 | local W = self.opt.window 53 | 54 | -- LM Features. 
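-- (The NNLM slot accumulates the model log-probability; the UNI/BI/TRI/OO
-- slots count unigram, bigram, trigram and out-of-order matches against the
-- source article, and LEN counts the summary length. These are the raw counts
-- rescored at decode time by -lmWeight, -unigramBonus, -bigramBonus,
-- -trigramBonus, -unorderBonus and -lengthBonus, the weights that ZMERT tunes
-- via tuning/params.txt.)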
55 | f_new[NNLM] = f_new[NNLM] + out_score 56 | 57 | if self:has_ngram({y_i1}) then 58 | f_new[UNI] = f_new[UNI] + 1 59 | end 60 | 61 | if self:has_ngram({hyp[i+W], y_i1}) then 62 | f_new[BI] = f_new[BI] + 1 63 | end 64 | 65 | if self:has_ngram({hyp[i+W-1], hyp[i+W], y_i1}) then 66 | f_new[TRI] = f_new[TRI] + 1 67 | end 68 | 69 | if self.ooordered_ngram[hyp[i+W]] ~= nil and 70 | self.ooordered_ngram[hyp[i+W]][y_i1] ~= nil then 71 | f_new[OO] = f_new[OO] + 1 72 | end 73 | 74 | -- Length 75 | f_new[LEN] = f_new[LEN] + 1 76 | end 77 | 78 | -- Augment the score based on the extractive feature values. 79 | function features:add_features(out, beam) 80 | local W = self.opt.window 81 | for k = 1, beam:size(1) do 82 | 83 | -- Exact unigram matches. 84 | for s, _ in pairs(self.ngrams[1]) do 85 | out[k][s] = out[k][s] + self.opt.unigramBonus 86 | end 87 | 88 | -- Exact bigram matches. 89 | if self.ngrams[2][beam[k][W]] ~= nil then 90 | for s, _ in pairs(self.ngrams[2][beam[k][W]]) do 91 | out[k][s] = out[k][s] + self.opt.bigramBonus 92 | end 93 | end 94 | 95 | -- Exact trigram matches. 96 | if self.ngrams[3][beam[k][W-1]] ~= nil and 97 | self.ngrams[3][beam[k][W-1]][beam[k][W]] then 98 | for s, _ in pairs(self.ngrams[3][beam[k][W-1]][beam[k][W]]) do 99 | out[k][s] = out[k][s] + self.opt.trigramBonus 100 | end 101 | end 102 | 103 | if self.ooordered_ngram[beam[k][W]] ~= nil then 104 | for s, _ in pairs(self.ooordered_ngram[beam[k][W]]) do 105 | out[k][s] = out[k][s] + self.opt.unorderBonus 106 | end 107 | end 108 | end 109 | out:add(self.opt.lengthBonus) 110 | end 111 | 112 | -- Precompute extractive table based on the input article. 113 | function features:match_words(START, article) 114 | self.ooordered_ngram = {} 115 | local ordered_ngram = {} 116 | self.ngrams = {{}, {}, {}} 117 | local hist = {START, START, START, START} 118 | 119 | for j = 1, article:size(1) do 120 | local tw = self.article_to_title[article[j]] 121 | 122 | -- Does the current word exist in title dict. 123 | if tw ~= nil then 124 | for j2 = 1, j do 125 | local tw2 = self.article_to_title[article[j2]] 126 | if tw2 ~= nil then 127 | util.add(ordered_ngram, {tw2, tw}) 128 | if not util.has(ordered_ngram, {tw, tw2}) then 129 | util.add(self.ooordered_ngram, {tw, tw2}) 130 | end 131 | end 132 | end 133 | 134 | util.add(self.ngrams[1], {tw}) 135 | util.add(self.ngrams[2], {hist[3], tw}) 136 | util.add(self.ngrams[3], {hist[2], hist[3], tw}) 137 | end 138 | 139 | -- Advance window. 140 | for k = 2, 4 do 141 | hist[k-1] = hist[k] 142 | end 143 | hist[4] = tw 144 | end 145 | end 146 | 147 | return features 148 | -------------------------------------------------------------------------------- /summary/nnlm.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Ngram neural language model with auxiliary model 14 | require('nn') 15 | require('nngraph') 16 | require('fbnn') 17 | require('cunn') 18 | require('sys') 19 | local utils = require('summary.util') 20 | 21 | local nnlm = {} 22 | 23 | function nnlm.addOpts() 24 | cmd:option('-epochs', 5, "Number of epochs to train.") 25 | cmd:option('-miniBatchSize', 64, "Size of training minibatch.") 26 | cmd:option('-printEvery', 10000, "How often to print during training.") 27 | cmd:option('-modelFilename', '', "File for saving loading/model.") 28 | cmd:option('-window', 5, "Size of NNLM window.") 29 | cmd:option('-embeddingDim', 50, "Size of NNLM embeddings.") 30 | cmd:option('-hiddenSize', 100, "Size of NNLM hiddent layer.") 31 | cmd:option('-learningRate', 0.1, "SGD learning rate.") 32 | end 33 | 34 | 35 | function nnlm.create_lm(opt, dict, encoder, encoder_size, encoder_dict) 36 | local new_mlp = {} 37 | setmetatable(new_mlp, { __index = nnlm }) 38 | new_mlp.opt = opt 39 | new_mlp.dict = dict 40 | new_mlp.encoder_dict = encoder_dict 41 | new_mlp.encoder_model = encoder 42 | new_mlp.window = opt.window 43 | if encoder ~= nil then 44 | new_mlp:build_mlp(encoder, encoder_size) 45 | end 46 | return new_mlp 47 | end 48 | 49 | 50 | function nnlm:build_mlp(encoder, encoder_size) 51 | -- Set constants 52 | local D = self.opt.embeddingDim 53 | local N = self.opt.window 54 | local H = self.opt.hiddenSize 55 | local V = #self.dict.index_to_symbol 56 | local P = encoder_size 57 | print(H, P) 58 | 59 | -- Input 60 | local context_input = nn.Identity()() 61 | local encoder_input = nn.Identity()() 62 | local position_input = nn.Identity()() 63 | 64 | local lookup = nn.LookupTable(V, D)(context_input) 65 | local encoder_node = encoder({encoder_input, position_input, context_input}) 66 | 67 | -- tanh W (E y) 68 | local lm_mlp = nn.Tanh()(nn.Linear(D * N, H)(nn.View(D * N)(lookup))) 69 | 70 | -- Second layer: takes LM and encoder model. 71 | local mlp = nn.Linear(H + P, V)(nn.View(H + P)(nn.JoinTable(2)( 72 | {lm_mlp, encoder_node}))) 73 | self.soft_max = nn.LogSoftMax()(mlp) 74 | 75 | -- Input is conditional context and ngram context. 76 | self.mlp = nn.gModule({encoder_input, position_input, context_input}, 77 | {self.soft_max}) 78 | 79 | self.criterion = nn.ClassNLLCriterion() 80 | self.lookup = lookup.data.module 81 | self.mlp:cuda() 82 | self.criterion:cuda() 83 | collectgarbage() 84 | end 85 | 86 | 87 | -- Run validation 88 | function nnlm:validation(valid_data) 89 | print("[Running Validation]") 90 | 91 | local offset = 1000 92 | local loss = 0 93 | local total = 0 94 | 95 | valid_data:reset() 96 | while not valid_data:is_done() do 97 | local input, target = valid_data:next_batch(offset) 98 | local out = self.mlp:forward(input) 99 | local err = self.criterion:forward(out, target) * target:size(1) 100 | 101 | -- Augment counters. 102 | loss = loss + err 103 | total = total + target:size(1) 104 | end 105 | print(string.format("[perp: %f validation: %f total: %d]", 106 | math.exp(loss/total), 107 | loss/total, total)) 108 | return loss / total 109 | end 110 | 111 | 112 | function nnlm:renorm(data, th) 113 | local size = data:size(1) 114 | for i = 1, size do 115 | local norm = data[i]:norm() 116 | if norm > th then 117 | data[i]:div(norm/th) 118 | end 119 | end 120 | end 121 | 122 | 123 | function nnlm:renorm_tables() 124 | -- Renormalize the lookup tables. 
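-- Any embedding row whose L2 norm exceeds 1 is scaled back down to norm 1
-- (see `renorm` above); this max-norm constraint is applied once per epoch,
-- just before validation.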
125 | if self.lookup ~= nil then 126 | print(self.lookup.weight:size()) 127 | print(self.lookup.weight:type()) 128 | self:renorm(self.lookup.weight, 1) 129 | end 130 | if self.encoder_model.lookup ~= nil then 131 | self:renorm(self.encoder_model.lookup.weight, 1) 132 | if self.encoder_model.title_lookup ~= nil then 133 | self:renorm(self.encoder_model.title_lookup.weight, 1) 134 | end 135 | end 136 | if self.encoder_model.lookups ~= nil then 137 | for i = 1, #self.encoder_model.lookups do 138 | self:renorm(self.encoder_model.lookups[i].weight, 1) 139 | end 140 | end 141 | end 142 | 143 | 144 | function nnlm:run_valid(valid_data) 145 | -- Run validation. 146 | if valid_data ~= nil then 147 | local cur_valid_loss = self:validation(valid_data) 148 | -- If valid loss does not improve drop learning rate. 149 | if cur_valid_loss > self.last_valid_loss then 150 | self.opt.learningRate = self.opt.learningRate / 2 151 | end 152 | self.last_valid_loss = cur_valid_loss 153 | end 154 | 155 | -- Save the model. 156 | self:save(self.opt.modelFilename) 157 | end 158 | 159 | 160 | function nnlm:train(data, valid_data) 161 | -- Best loss seen yet. 162 | self.last_valid_loss = 1e9 163 | -- Train 164 | for epoch = 1, self.opt.epochs do 165 | data:reset() 166 | self:renorm_tables() 167 | self:run_valid(valid_data) 168 | 169 | -- Loss for the epoch. 170 | local epoch_loss = 0 171 | local batch = 1 172 | local last_batch = 1 173 | local total = 0 174 | local loss = 0 175 | 176 | sys.tic() 177 | while not data:is_done() do 178 | local input, target = data:next_batch(self.opt.miniBatchSize) 179 | if data:is_done() then break end 180 | 181 | local out = self.mlp:forward(input) 182 | local err = self.criterion:forward(out, target) * target:size(1) 183 | local deriv = self.criterion:backward(out, target) 184 | 185 | if not utils.isnan(err) then 186 | loss = loss + err 187 | epoch_loss = epoch_loss + err 188 | 189 | self.mlp:zeroGradParameters() 190 | self.mlp:backward(input, deriv) 191 | self.mlp:updateParameters(self.opt.learningRate) 192 | else 193 | print("NaN") 194 | print(input) 195 | end 196 | 197 | -- Logging 198 | if batch % self.opt.printEvery == 1 then 199 | print(string.format( 200 | "[Loss: %f Epoch: %d Position: %d Rate: %f Time: %f]", 201 | loss / ((batch - last_batch) * self.opt.miniBatchSize), 202 | epoch, 203 | batch * self.opt.miniBatchSize, 204 | self.opt.learningRate, 205 | sys.toc() 206 | )) 207 | sys.tic() 208 | last_batch = batch 209 | loss = 0 210 | end 211 | 212 | batch = batch + 1 213 | total = total + input[1]:size(1) 214 | end 215 | print(string.format("[EPOCH : %d LOSS: %f TOTAL: %d BATCHES: %d]", 216 | epoch, epoch_loss / total, total, batch)) 217 | end 218 | end 219 | 220 | 221 | function nnlm:save(fname) 222 | print("[saving mlp: " .. fname .. "]") 223 | torch.save(fname, self) 224 | return true 225 | end 226 | 227 | 228 | function nnlm:load(fname) 229 | local new_self = torch.load(fname) 230 | for k, v in pairs(new_self) do 231 | if k ~= 'opt' then 232 | self[k] = v 233 | end 234 | end 235 | return true 236 | end 237 | 238 | 239 | return nnlm 240 | -------------------------------------------------------------------------------- /summary/run.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. 
An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | require('torch') 14 | require('nn') 15 | require('sys') 16 | 17 | local nnlm = require('summary.nnlm') 18 | local encoder = require('summary.encoder') 19 | local beam = require('summary.beam_search') 20 | local utils = require('summary.util') 21 | 22 | cmd = torch.CmdLine() 23 | 24 | beam.addOpts(cmd) 25 | 26 | cutorch.setDevice(2) 27 | 28 | cmd:option('-modelFilename', '', 'Model to test.') 29 | cmd:option('-inputf', '', 'Input article files. ') 30 | cmd:option('-nbest', false, 'Write out the nbest list in ZMert format.') 31 | cmd:option('-length', 15, 'Maximum length of summary.') 32 | opt = cmd:parse(arg) 33 | 34 | -- Map the words from one dictionary to another. 35 | local function sync_dicts(dict1, dict2) 36 | local dict_map = torch.ones(#dict1.index_to_symbol):long() 37 | for i = 1, #dict1.index_to_symbol do 38 | local res = dict2.symbol_to_index[dict1.index_to_symbol[i]] 39 | dict_map[i] = res or 1 40 | end 41 | return dict_map 42 | end 43 | 44 | -- Apply digit preprocessing. 45 | local function process_word(input_word) 46 | local word = string.lower(input_word) 47 | for i = 1, word:len() do 48 | if word:sub(i, i) >= '0' and word:sub(i, i) <= '9' then 49 | word = word:sub(1, i-1) .. '#' .. word:sub(i+1) 50 | end 51 | end 52 | return word 53 | end 54 | 55 | local function main() 56 | -- Load in the dictionaries and the input files. 57 | local mlp = nnlm.create_lm(opt) 58 | mlp:load(opt.modelFilename) 59 | local adict = mlp.encoder_dict 60 | local tdict = mlp.dict 61 | 62 | local dict_map = sync_dicts(adict, tdict) 63 | local sent_file = assert(io.open(opt.inputf)) 64 | local len = opt.length 65 | local W = mlp.window 66 | opt.window = W 67 | 68 | local sent_num = 0 69 | for line in sent_file:lines() do 70 | sent_num = sent_num + 1 71 | 72 | -- Add padding. 73 | local true_line = " " .. line .. " " 74 | local words = utils.string_split(true_line) 75 | 76 | local article = torch.zeros(#words) 77 | for j = 1, #words do 78 | local word = process_word(words[j]) 79 | article[j] = adict.symbol_to_index[word] or 80 | adict.symbol_to_index[""] 81 | end 82 | 83 | -- Run beam search. 84 | local sbeam = beam.init(opt, mlp.mlp, mlp.encoder_model, 85 | dict_map, tdict) 86 | local results = sbeam:generate(article, len) 87 | 88 | if not opt.nbest then 89 | if #results == 0 then 90 | io.write("*FAIL*") 91 | else 92 | -- Print out in standard format. 93 | local len, _, output, _ = unpack(results[1]) 94 | local total = 0 95 | for j = W+2, W+len - 1 do 96 | local word = tdict.index_to_symbol[output[j]] 97 | total = total + #word + 1 98 | io.write(word, " " ) 99 | end 100 | end 101 | print("") 102 | else 103 | -- Print out an nbest list in Moses/ZMert format. 104 | for k = 1, #results do 105 | io.write(sent_num-1, " ||| ") 106 | local len, score, output, features = unpack(results[k]) 107 | for j = W+2, W+len - 1 do 108 | io.write(tdict.index_to_symbol[output[j]], " " ) 109 | end 110 | io.write(" ||| ") 111 | for f = 1, features:size(1) do 112 | io.write(features[f], " ") 113 | end 114 | io.write(" ||| ", score) 115 | print("") 116 | end 117 | end 118 | end 119 | end 120 | 121 | main() 122 | -------------------------------------------------------------------------------- /summary/train.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 
3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- The top-level training script 14 | require('torch') 15 | require('nngraph') 16 | 17 | local nnlm = require('summary.nnlm') 18 | local data = require('summary.data') 19 | local encoder = require('summary.encoder') 20 | 21 | cmd = torch.CmdLine() 22 | cmd:text() 23 | cmd:text() 24 | cmd:text('Train a summarization model.') 25 | cmd:text() 26 | 27 | data.add_opts(cmd) 28 | encoder.add_opts(cmd) 29 | nnlm.addOpts(cmd) 30 | 31 | opt = cmd:parse(arg) 32 | 33 | local function main() 34 | -- Load in the data. 35 | local tdata = data.load_title(opt.titleDir, true) 36 | local article_data = data.load_article(opt.articleDir) 37 | 38 | local valid_data = data.load_title(opt.validTitleDir, nil, tdata.dict) 39 | local valid_article_data = 40 | data.load_article(opt.validArticleDir, article_data.dict) 41 | 42 | -- Make main LM 43 | local train_data = data.init(tdata, article_data) 44 | local valid = data.init(valid_data, valid_article_data) 45 | local encoder_mlp = encoder.build(opt, train_data) 46 | local mlp = nnlm.create_lm(opt, tdata.dict, encoder_mlp, 47 | opt.bowDim, article_data.dict) 48 | 49 | mlp:train(train_data, valid) 50 | end 51 | 52 | main() 53 | -------------------------------------------------------------------------------- /summary/util.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- The utility tool box 14 | local util = {} 15 | 16 | function util.string_shortfloat(t) 17 | return string.format('%2.4g', t) 18 | end 19 | 20 | function util.shuffleTable(t) 21 | local rand = math.random 22 | local iterations = #t 23 | local j 24 | for i = iterations, 2, -1 do 25 | j = rand(i) 26 | t[i], t[j] = t[j], t[i] 27 | end 28 | end 29 | 30 | 31 | function util.string_split(s, c) 32 | if c==nil then c=' ' end 33 | local t={} 34 | while true do 35 | local f=s:find(c) 36 | if f==nil then 37 | if s:len()>0 then 38 | table.insert(t, s) 39 | end 40 | break 41 | end 42 | if f > 1 then 43 | table.insert(t, s:sub(1,f-1)) 44 | end 45 | s=s:sub(f+1,s:len()) 46 | end 47 | return t 48 | end 49 | 50 | 51 | function util.add(tab, key) 52 | local cur = tab 53 | 54 | for i = 1, #key-1 do 55 | local new_cur = cur[key[i]] 56 | if new_cur == nil then 57 | cur[key[i]] = {} 58 | new_cur = cur[key[i]] 59 | end 60 | cur = new_cur 61 | end 62 | cur[key[#key]] = true 63 | end 64 | 65 | function util.has(tab, key) 66 | local cur = tab 67 | for i = 1, #key do 68 | cur = cur[key[i]] 69 | if cur == nil then 70 | return false 71 | end 72 | end 73 | return true 74 | end 75 | 76 | function util.isnan(x) 77 | return x ~= x 78 | end 79 | 80 | return util 81 | -------------------------------------------------------------------------------- /test_model.sh: -------------------------------------------------------------------------------- 1 | export LUA_PATH="$LUA_PATH;?.lua" 2 | 3 | th summary/run.lua \ 4 | -modelFilename $2 \ 5 | -inputf $1 \ 6 | -length $3 \ 7 | -blockRepeatWords 8 | 9 | -------------------------------------------------------------------------------- /train_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WINDOW=5 4 | export OUT_DIR=$1/processed 5 | export MDL_DIR=$1/models 6 | 7 | export LUA_PATH="$LUA_PATH;$ABS/?.lua" 8 | 9 | #bash $ABS/prep_torch_data.sh $2 10 | 11 | mkdir -p $MDL_DIR 12 | 13 | th -i $ABS/summary/train.lua -titleDir $OUT_DIR/train/title/ \ 14 | -articleDir $OUT_DIR/train/article/ \ 15 | -modelFilename $MDL_DIR/$2 \ 16 | -miniBatchSize 64 \ 17 | -embeddingDim 64 \ 18 | -bowDim 200 \ 19 | -hiddenSize 64 \ 20 | -epochs 20 \ 21 | -learningRate 0.1 \ 22 | -validArticleDir $OUT_DIR/valid.filter/article/ \ 23 | -validTitleDir $OUT_DIR/valid.filter/title/ \ 24 | -window $WINDOW \ 25 | -printEvery 100 \ 26 | -encoderModel "attenbow" \ 27 | -attenPool 5 \ 28 | -------------------------------------------------------------------------------- /tuning/SDecoder_cfg.txt: -------------------------------------------------------------------------------- 1 | LM 1.0 2 | uni 4.84922778048135 3 | bi 1.2132386742991166 4 | tri -13.382831610766107 5 | ooo -0.5293249226416208 6 | length 0.0 7 | -------------------------------------------------------------------------------- /tuning/SDecoder_cmd.tpl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | 6 | d = {"src" : , 7 | "model" : , 8 | "title_len" : } 9 | 10 | for l in open("SDecoder_cfg.txt"): 11 | f, val = l.strip().split() 12 | d[f] = val 13 | 14 | cmd = "cd $ABS; th $ABS/summary/run.lua -modelFilename {model} " + \ 15 | "-inputf {src} " + \ 16 | "-length {title_len} -blockRepeatWords -recombine " + \ 17 | "-beamSize 50 " + \ 18 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \ 19 | 
"-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} " + \ 20 | "-nbest > $ABS/tuning/nbest.out" 21 | 22 | os.system(cmd.format(d)) 23 | -------------------------------------------------------------------------------- /tuning/SDecoder_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import os 14 | import sys 15 | #@lint-avoid-python-3-compatibility-imports 16 | 17 | d = {"src": sys.argv[1], 18 | "model": sys.argv[2], 19 | "title_len": 14} 20 | 21 | for l in open("tuning/blank.params"): 22 | f, val = l.strip().split() 23 | d[f] = val 24 | 25 | cmd = "cd $ABS; $CUTH $ABS/summary/run.lua -modelFilename {model} " + \ 26 | "-inputf {src} -recombine " + \ 27 | "-length {title_len} -blockRepeatWords " + \ 28 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \ 29 | "-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} " 30 | 31 | os.system(cmd.format(**d)) 32 | -------------------------------------------------------------------------------- /tuning/ZMERT_cfg.txt: -------------------------------------------------------------------------------- 1 | ### Commonly used parameters 2 | -r ref # target sentences file name (in this case, file name prefix) 3 | -rps 4 # references per sentence 4 | -p params.txt # parameter file 5 | -m BLEU 4 closest # evaluation metric and its options 6 | -ipi 20 # number of intermediate initial points 7 | -cmd ./SDecoder_cmd.py # file containing commands to run decoder 8 | -decOut nbest.out # file prodcued by decoder 9 | -dcfg SDecoder_cfg.txt # decoder config file 10 | -N 500 # size of N-best list generated each iteration 11 | -v 1 # verbosity level (0-2; higher value => 12 | -seed 12341234 # random number generator seed 13 | -------------------------------------------------------------------------------- /tuning/params.txt: -------------------------------------------------------------------------------- 1 | LM ||| 1.0 Fix 0.0 +Inf -1 +1 2 | uni ||| 0.0 Opt -Inf +Inf -1 +1 3 | bi ||| 0.0 Opt -Inf +Inf -1 +1 4 | tri ||| 0.0 Opt -Inf +Inf -1 +1 5 | ooo ||| 0.0 Opt -Inf 0 -1 +1 6 | length ||| 0.0 Fix -Inf +Inf -1 +1 7 | normalization = none 8 | --------------------------------------------------------------------------------