├── CONTRIBUTING.md
├── DUC
│   ├── eval.sh
│   ├── make_DUC.py
│   ├── make_rouge.py
│   └── setup.sh
├── LICENSE
├── PATENTS
├── README.md
├── construct_data.sh
├── dataset
│   ├── filter.py
│   ├── make_dict.py
│   ├── process_agiga.py
│   ├── pull.py
│   ├── small_train.splits
│   ├── test.splits
│   ├── train.splits
│   └── valid.splits
├── prep_torch_data.sh
├── summary
│   ├── beam_search.lua
│   ├── build.lua
│   ├── build_dict.lua
│   ├── data.lua
│   ├── encoder.lua
│   ├── features.lua
│   ├── nnlm.lua
│   ├── run.lua
│   ├── train.lua
│   └── util.lua
├── test_model.sh
├── train_model.sh
└── tuning
    ├── SDecoder_cfg.txt
    ├── SDecoder_cmd.tpl
    ├── SDecoder_test.py
    ├── ZMERT_cfg.txt
    └── params.txt
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Neural Attention Model for Abstractive Summarization software
2 | We want to make contributing to this project as easy and transparent as
3 | possible.
4 |
5 | ## Our Development Process
6 | ... (in particular how this is synced with internal changes to the project)
7 |
8 | ## Pull Requests
9 | We actively welcome your pull requests.
10 |
11 | 1. Fork the repo and create your branch from `master`.
12 | 2. If you've added code that should be tested, add tests.
13 | 3. If you've changed APIs, update the documentation.
14 | 4. Ensure the test suite passes.
15 | 5. Make sure your code lints.
16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
17 |
18 | ## Contributor License Agreement ("CLA")
19 | In order to accept your pull request, we need you to submit a CLA. You only need
20 | to do this once to work on any of Facebook's open source projects.
21 |
22 | Complete your CLA here:
23 |
24 | ## Issues
25 | We use GitHub issues to track public bugs. Please ensure your description is
26 | clear and has sufficient instructions to be able to reproduce the issue.
27 |
28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
29 | disclosure of security bugs. In those cases, please go through the process
30 | outlined on that page and do not file a public issue.
31 |
32 | ## Coding Style
33 | * 2 spaces for indentation rather than tabs
34 | * 80 character line length
35 | * ...
36 |
37 | ## License
38 | By contributing to Neural Attention Model for Abstractive Summarization, you agree that your contributions will be licensed
39 | under its BSD license.
40 |
--------------------------------------------------------------------------------
/DUC/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
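# Usage: ./DUC/eval.sh <clean_duc_dir>   (e.g. DUC_data/clean_2003/, see README)
# Assumes $ABS points at the repository root and $ROUGE at the ROUGE-1.5.5 directory.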
3 | cd $1
4 | rm -fr $1/tmp_GOLD
5 | rm -fr $1/tmp_SYSTEM
6 | rm -fr $1/tmp_OUTPUT
7 | mkdir -p $1/tmp_GOLD
8 | mkdir -p $1/tmp_SYSTEM
9 |
10 | python $ABS/DUC/make_rouge.py --base $1 --gold tmp_GOLD --system tmp_SYSTEM --input input.txt
11 | perl $ABS/DUC/prepare4rouge-simple.pl tmp_SYSTEM tmp_GOLD tmp_OUTPUT
12 |
13 | cd tmp_OUTPUT
14 | export PERL5LIB=$ROUGE
15 |
16 | echo "FULL LENGTH"
17 | perl $ROUGE/ROUGE-1.5.5.pl -m -n 2 -w 1.2 -e $ROUGE -a settings.xml
18 |
19 |
20 | echo "LIMITED LENGTH"
21 | perl $ROUGE/ROUGE-1.5.5.pl -m -b 75 -n 2 -w 1.2 -e $ROUGE -a settings.xml
22 |
--------------------------------------------------------------------------------
/DUC/make_DUC.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
13 | """Construct the DUC test set. """
14 |
15 | import sys
16 | import argparse
17 | import glob
18 | import re
19 | import nltk.data
20 | from nltk.tokenize.treebank import TreebankWordTokenizer
21 | #@lint-avoid-python-3-compatibility-imports
22 |
23 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
24 | tokenizer = TreebankWordTokenizer()
25 | def main(arguments):
26 |
27 | parser = argparse.ArgumentParser(description=__doc__,
28 | formatter_class=
29 | argparse.RawDescriptionHelpFormatter)
30 | parser.add_argument('--sum_docs', help="Article directory.", type=str)
31 | parser.add_argument('--year', help="DUC year to process.", type=str)
32 | parser.add_argument('--result_docs', help="Reference directory.", type=str)
33 | parser.add_argument('--ref_dir',
34 | help="Directory to output the references.", type=str)
35 | parser.add_argument('--sys_dir',
36 |                         help="Directory to output the system summaries.", type=str)
37 | parser.add_argument('--article_file',
38 |                         help="File to output the article sentences.", type=str)
39 | args = parser.parse_args(arguments)
40 |
41 | refs = [open("{0}/task1_ref{1}.txt".format(args.ref_dir, i), "w")
42 | for i in range(4)]
43 | article = open(args.article_file, "w")
44 | prefix = open(args.sys_dir + "/task1_prefix.txt", "w")
45 | if args.year == "2003":
46 | files = glob.glob("{0}/*/*".format(args.sum_docs))
47 | else:
48 | files = glob.glob("{0}/*/*".format(args.sum_docs))
49 | files.sort()
50 | for f in files:
51 | docset = f.split("/")[-2][:-1].upper()
52 | name = f.split("/")[-1].upper()
53 |
54 | # Find references.
55 | if args.year == "2003":
56 | matches = list(glob.glob("{0}/{1}*.10.*{2}*".format(
57 | args.result_docs, docset, name)))
58 | else:
59 | matches = list(glob.glob("{0}/{1}*{2}*".format(
60 | args.result_docs, docset, name)))
61 | matches.sort()
62 | assert len(matches) == 4, matches
63 | for i, m in enumerate(matches):
64 | print >>refs[i], open(m).read().strip()
65 |
66 | # Make input.
67 | mode = 0
68 | text = ""
69 | for l in open(f):
70 |             if l.strip() in ["<P>", "</P>", ""]:
71 |                 continue
72 |             if mode == 1 and l.strip() != "</TEXT>":
73 |                 text += l.strip() + " "
74 |             if l.strip() == "<TEXT>":
75 | mode = 1
76 | text = " ".join([w for w in text.split() if w[0] != "&"])
77 |
78 | sents = sent_detector.tokenize(text)
79 | if len(sents) == 0:
80 | print >>article
81 | print >>prefix
82 | continue
83 | first = sents[0]
84 |
85 | # If the sentence is too short, add the second as well.
86 | if len(sents[0]) < 130 and len(sents) > 1:
87 | first = first.strip()[:-1] + " , " + sents[1]
88 |
89 | first = " ".join(tokenizer.tokenize(first.lower()))
90 | if ")" in first or ("_" in first and args.year == "2003"):
91 | first = re.split(" ((--)|-|_) ", first, 1)[-1]
92 | first = first.replace("(", "-lrb-") \
93 | .replace(")", "-rrb-").replace("_", ",")
94 | print >>article, first
95 | print >>prefix, first[:75]
96 | if __name__ == '__main__':
97 | sys.exit(main(sys.argv[1:]))
98 |
--------------------------------------------------------------------------------
/DUC/make_rouge.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
13 | """Prep ROUGE eval. """
14 |
15 | import sys
16 | import glob
17 | import os
18 | import argparse
19 | import itertools
20 | #@lint-avoid-python-3-compatibility-imports
21 |
22 | parser = argparse.ArgumentParser(description=__doc__,
23 | formatter_class=
24 | argparse.RawDescriptionHelpFormatter)
25 | parser.add_argument('--base', help="Base directory.", type=str)
26 | parser.add_argument('--gold', help="Gold output directory.", type=str)
27 | parser.add_argument('--system', help="System output directory.", type=str)
28 | parser.add_argument('--input', help="Input text.", type=str)
29 |
30 | args = parser.parse_args(sys.argv[1:])
31 |
32 | for f in glob.glob("{0}/references/*".format(args.base)):
33 | task, ref = f.split("/")[-1].split("_")
34 | ref = int(ref.split(".")[0][-1])
35 |
36 | for i, l in enumerate(open(f)):
37 | os.system("mkdir -p %s/%s%04d"%(args.gold, task, i))
38 | with open("%s/%s%04d/%s%04d.%04d.gold" % (args.gold, task, i, task, i, ref), "w") as out:
39 | print >>out, l.strip()
40 |
41 |
42 | for f in glob.glob("{0}/system/*".format(args.base)):
43 | task, ref = f.split("/")[-1].split("_", 1)
44 | #if ref.startswith("ducsystem"): continue
45 | system = ref.split(".")[0]
46 | os.system("mkdir -p %s/%s"%(args.system, system))
47 | for i, (l, input_line) in enumerate(itertools.izip(open(f), open(args.input))):
48 | words = []
49 | numbers = dict([(len(w), w) for w in input_line.strip().split() if w[0].isdigit()])
50 | for w in l.strip().split():
51 | # Replace # with numbers from the input.
52 | if w[0] == "#" and len(w) in numbers:
53 | words.append(numbers[len(w)])
54 |             elif w == "<unk>":
55 | continue
56 | else:
57 | words.append(w)
58 |
59 | with open("%s/%s/%s%04d.%s.system" % (args.system, system, task, i, system),"w") as out:
60 | if words:
61 | print >>out, " ".join(words)
62 | else:
63 | print >>out, "fail"
64 |
--------------------------------------------------------------------------------
/DUC/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
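# Usage: ./DUC/setup.sh DUC_data/
# DUC_data/ must contain the four DUC tarballs listed in the README; requires $ABS to be set.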
3 | echo "Step 1: Extracting DUC files"
4 | cd $1
5 | tar xvf DUC2003_Summarization_Documents.tgz
6 | tar xvf DUC2004_Summarization_Documents.tgz
7 | tar xvf duc2004_results.tgz
8 | tar xvf detagged.duc2003.abstracts.tar.gz
9 |
10 | cd duc2004_results/ROUGE/; tar xvf duc2004.task1.ROUGE.models.tar.gz
11 | cd $1
12 | cd DUC2003_Summarization_Documents/duc2003_testdata/task1/; tar xvf task1.docs.tar.gz
13 |
14 |
15 | echo "Step 2: Make reference files."
16 | cd $1
17 | mkdir $1/clean_2004/
18 | mkdir $1/clean_2004/references
19 | mkdir $1/clean_2004/system
20 | python $ABS/DUC/make_DUC.py --result_docs duc2004_results/ROUGE/eval/models/1/ \
21 | --sum_docs DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs/ \
22 | --ref_dir clean_2004/references --year 2004 --article_file clean_2004/input.txt \
23 | --sys_dir clean_2004/system
24 |
25 | mkdir $1/clean_2003/
26 | mkdir $1/clean_2003/references
27 | mkdir $1/clean_2003/system
28 | python $ABS/DUC/make_DUC.py --result_docs detagged.duc2003.abstracts/models/ \
29 | --sum_docs DUC2003_Summarization_Documents/duc2003_testdata/task1/docs.without.headlines/ \
30 | --ref_dir clean_2003/references --year 2003 --article_file clean_2003/input.txt \
31 | --sys_dir clean_2003/system
32 |
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD License
2 |
3 | For Neural Attention Model for Abstractive Summarization software
4 |
5 | Copyright (c) 2015-present, Facebook, Inc. All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without modification,
8 | are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 |
17 | * Neither the name Facebook nor the names of its contributors may be used to
18 | endorse or promote products derived from this software without specific
19 | prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/PATENTS:
--------------------------------------------------------------------------------
1 | Additional Grant of Patent Rights Version 2
2 |
3 | "Software" means the Neural Attention Model for Abstractive Summarization software distributed by Facebook, Inc.
4 |
5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software
6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable
7 | (subject to the termination provision below) license under any Necessary
8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise
9 | transfer the Software. For avoidance of doubt, no license is granted under
10 | Facebook’s rights in any patent claims that are infringed by (i) modifications
11 | to the Software made by you or any third party or (ii) the Software in
12 | combination with any software or other technology.
13 |
14 | The license granted hereunder will terminate, automatically and without notice,
15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate
16 | directly or indirectly, or take a direct financial interest in, any Patent
17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate
18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or
19 | in part from any software, technology, product or service of Facebook or any of
20 | its subsidiaries or corporate affiliates, or (iii) against any party relating
21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its
22 | subsidiaries or corporate affiliates files a lawsuit alleging patent
23 | infringement against you in the first instance, and you respond by filing a
24 | patent infringement counterclaim in that lawsuit against that party that is
25 | unrelated to the Software, the license granted hereunder will not terminate
26 | under section (i) of this paragraph due to such counterclaim.
27 |
28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is
29 | necessarily infringed by the Software standing alone.
30 |
31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect,
32 | or contributory infringement or inducement to infringe any patent, including a
33 | cross-claim or counterclaim.
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Attention-Based Summarization
2 |
3 | This project contains the Abs. neural abstractive summarization system from the paper
4 |
5 | A Neural Attention Model for Abstractive Summarization.
6 | Alexander M. Rush, Sumit Chopra, Jason Weston.
7 |
8 | The release includes code for:
9 |
10 | * Extracting the summarization data set
11 | * Training the neural summarization model
12 | * Constructing evaluation sets with ROUGE
13 | * Tuning extractive features
14 |
15 | ## Setup
16 |
17 | To run the system, you will need to have [Torch7](http://torch.ch/)
18 | and [fbcunn](https://github.com/facebook/fbcunn) (Facebook's deep
19 | learning library) installed. You will also need Python 2.7, NLTK, and
20 | GNU Parallel to run the data processing scripts. Additionally the
21 | code currently requires a CUDA GPU for training and decoding.
22 |
23 | Finally the scripts require that you set the $ABS environment variable.
24 |
25 | > export ABS=$PWD
26 | > export LUA_PATH="$LUA_PATH;$ABS/?.lua"
27 |
28 | ## Constructing the Data Set
29 |
30 | The model is trained to perform title generation from the first line
31 | of newspaper articles. Since the system is completely data-driven it
32 | requires a large set of aligned input-title pairs for training.
33 |
34 | To provide these pairs we use the [Annotated Gigaword
35 | corpus](https://catalog.ldc.upenn.edu/LDC2012T21) as our main data
36 | set. The corpus is available on LDC, but it requires membership. Once
37 | the annotated gigaword is obtained, you can simply run the provided
38 | script to extract the data set in text format.
39 |
40 | ### Generating the data
41 |
42 | To construct the data set, run the following script to produce `working_dir/`,
43 | where `working_dir/` is the path to the directory where you want to store the
44 | processed data. The script `construct_data.sh` makes use of the `parallel`
45 | utility, so please make sure that it is in your path.
46 | WARNING: This may take a couple of hours to run.
47 |
48 | > ./construct_data.sh agiga/ working_dir/
49 |
50 | ### Format of the data files
51 |
52 | The above command builds aligned files of the form split.type.txt where split
53 | is train/valid/test and type is title/article.
54 |
55 | The output of the script is several aligned plain-text files.
56 | Each has one title or article per line.
57 |
58 | > head train.title.txt
59 | australian current account deficit narrows sharply
60 | at least two dead in southern philippines blast
61 | australian stocks close down #.# percent
62 | envoy urges north korea to restart nuclear disablement
63 | skorea announces tax cuts to stimulate economy
64 |
65 | These files can be used to train the ABS system or as input to other baseline models.
66 |
67 | ## Training the Model
68 |
69 | Once the data set has been constructed, we provide a simple script to train
70 | the model.
71 |
72 | > ./train_model.sh working_dir/ model.th
73 |
74 |
75 | The training process consists of two stages. First we convert the text
76 | files into generic input-title matrices and then we train a
77 | conditional NNLM on this representation.
78 |
79 | Once the model has been fully trained (this may require 3-4 days),
80 | you can use the test script to produce summaries of any plain text file.
81 |
82 | > ./test_model.sh working_dir/valid.article.filter.txt model.th length_of_summary
83 |
84 |
85 | ### Training options
86 |
87 | These scripts utilize the Torch code available in `$ABS/summary/`.
88 |
89 | There are two main Torch entry points: one for training the model
90 | from data matrices and the other for evaluating the model on plain text.
91 |
92 | > th summary/train.lua -help
93 |
94 | Train a summarization model.
95 |
96 | -articleDir Directory containing article training matrices. []
97 | -titleDir Directory containing title training matrices. []
98 | -validArticleDir Directory containing article matrices for validation. []
99 | -validTitleDir Directory containing title matrices for validation. []
100 | -auxModel The encoder model to use. [bow]
101 | -bowDim Article embedding size. [50]
102 | -attenPool Attention model pooling size. [5]
103 | -hiddenUnits Conv net encoder hidden units. [1000]
104 | -kernelWidth Conv net encoder kernel width. [5]
105 | -epochs Number of epochs to train. [5]
106 | -miniBatchSize Size of training minibatch. [64]
107 | -printEvery How often to print during training. [1000]
108 | -modelFilename    File for saving/loading the model. []
109 | -window Size of NNLM window. [5]
110 | -embeddingDim Size of NNLM embeddings. [50]
111 | -hiddenSize Size of NNLM hidden layer. [100]
112 | -learningRate SGD learning rate. [0.1]
113 |
114 |
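For reference, a minimal direct invocation might look like the following
(the directory names here are only placeholders; `train_model.sh` supplies the
real paths produced by the data preparation step):

    > # placeholder paths -- adjust to your working_dir layout
    > th summary/train.lua -articleDir working_dir/train.article/ -titleDir working_dir/train.title/ -auxModel bow -modelFilename model.th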
115 |
116 | ### Testing options
117 |
118 |
119 | The run script is used for beam-search decoding with a trained
120 | model. See the paper for a description of the extractive
121 | features used at decoding time.
122 |
123 | > th summary/run.lua -help
124 |
125 | -blockRepeatWords Disallow generating a repeated word. [false]
126 | -allowUNK Allow generating <unk>. [false]
127 | -fixedLength Produce exactly -length words. [true]
128 | -lmWeight Weight for main model. [1]
129 | -beamSize Size of the beam. [100]
130 | -extractive Force fully extractive summary. [false]
131 | -lmWeight Feature weight for the neural model. [1]
132 | -unigramBonus Feature weight for unigram extraction. [0]
133 | -bigramBonus Feature weight for bigram extraction. [0]
134 | -trigramBonus Feature weight for trigram extraction. [0]
135 | -lengthBonus Feature weight for length. [0]
136 | -unorderBonus Feature weight for out-of-order extraction. [0]
137 | -modelFilename Model to test. []
138 | -inputf Input article files. []
139 | -nbest Write out the nbest list in ZMert format. [false]
140 | -length Maximum length of summary. [5]
141 |
142 |
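For example, decoding a plain-text file with a trained model might look like
this (file names are placeholders):

    > # placeholder paths -- adjust to your own files
    > th summary/run.lua -modelFilename model.th -inputf working_dir/valid.article.filter.txt -length 15 -beamSize 100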
143 |
144 | ## Evaluation Data Sets
145 |
146 | We evaluate the ABS model using the shared task from the Document Understanding Conference (DUC).
147 |
148 | This release also includes code for interacting with the DUC shared
149 | task on headline generation. The scripts for processing and evaluating
150 | on this data set are in the DUC/ directory.
151 |
152 | The [DUC data set](http://duc.nist.gov/duc2004/tasks.html) is
153 | available online; unfortunately, you must manually fill out a form to
154 | request the data from NIST. Send the request to
155 | [Angela Ellis](mailto:angela.ellis@nist.gov).
156 |
157 | ### Processing DUC
158 |
159 | After receiving credentials you should obtain a series of
160 | tar files containing the data used as part of this shared task.
161 |
162 | 1. Make a directory DUC_data/ which should contain the following files:
163 |
164 |
165 | >DUC2003\_Summarization\_Documents.tgz
166 | >DUC2004\_Summarization\_Documents.tgz
167 | >duc2004\_results.tgz
168 | >detagged.duc2003.abstracts.tar.gz
169 |
170 | 2. Run the setup script (this requires Python and NLTK for tokenization)
171 |
172 |
173 | > ./DUC/setup.sh DUC_data/
174 |
175 |
176 | After running the script, there should be two directories:
177 |
178 | DUC_data/clean_2003/
179 | DUC_data/clean_2004/
180 |
181 |
182 | Each contains a file input.txt where each line is a tokenized first line of an article.
183 |
184 |
185 | > head DUC_data/clean_2003/input.txt
186 | schizophrenia patients whose medication could n't stop the imaginary voices in their heads gained some relief after researchers repeatedly sent a magnetic field into a small area of their brains .
187 | scientists trying to fathom the mystery of schizophrenia say they have found the strongest evidence to date that the disabling psychiatric disorder is caused by gene abnormalities , according to a researcher at two state universities .
188 | a yale school of medicine study is expanding upon what scientists know about the link between schizophrenia and nicotine addiction .
189 | exploring chaos in a search for order , scientists who study the reality-shattering mental disease schizophrenia are becoming fascinated by the chemical environment of areas of the brain where perception is regulated .
190 |
191 | Each directory also contains a set of references:
192 |
193 |
194 | > head DUC_data/clean_2003/references/task1_ref0.txt
195 | Magnetic treatment may ease or lessen occurrence of schizophrenic voices.
196 | Evidence shows schizophrenia caused by gene abnormalities of Chromosome 1.
197 | Researchers examining evidence of link between schizophrenia and nicotine addiction.
198 | Scientists focusing on chemical environment of brain to understand schizophrenia.
199 | Schizophrenia study shows disparity between what's known and what's provided to patients.
200 |
201 | System output should be added to the system/ directory as task1_{name}.txt. For instance, the setup script includes a baseline PREFIX system:
202 |
203 |
204 |         DUC_data/clean_2003/system/task1_prefix.txt
205 |
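For example, assuming `test_model.sh` writes one summary per input line to
standard output, a new system file (here named `task1_abs.txt`, an arbitrary
name) could be produced with:

    > ./test_model.sh DUC_data/clean_2003/input.txt model.th 14 > DUC_data/clean_2003/system/task1_abs.txt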
206 |
207 | ### ROUGE for Eval
208 |
209 | To evaluate the summaries you will need the [ROUGE eval system](http://research.microsoft.com/~cyl/download/ROUGE-1.5.5.tgz).
210 |
211 | The ROUGE script requires output in a very complex HTML form.
212 | To simplify this process we include a script to convert the
213 | simple output to one that ROUGE can handle.
214 |
215 | Export the ROUGE directory `export ROUGE={path_to_rouge}` and then run the eval scripts
216 |
217 |
218 | > ./DUC/eval.sh DUC_data/clean_2003/
219 | FULL LENGTH
220 | ---------------------------------------------
221 | prefix ROUGE-1 Average_R: 0.17831 (95%-conf.int. 0.16916 - 0.18736)
222 | prefix ROUGE-1 Average_P: 0.15445 (95%-conf.int. 0.14683 - 0.16220)
223 | prefix ROUGE-1 Average_F: 0.16482 (95%-conf.int. 0.15662 - 0.17318)
224 | ---------------------------------------------
225 | prefix ROUGE-2 Average_R: 0.04936 (95%-conf.int. 0.04420 - 0.05452)
226 | prefix ROUGE-2 Average_P: 0.04257 (95%-conf.int. 0.03794 - 0.04710)
227 | prefix ROUGE-2 Average_F: 0.04550 (95%-conf.int. 0.04060 - 0.05026)
228 |
229 |
230 | ## Tuning Feature Weights
231 |
232 | For our system ABS+ we additionally tune extractive features on the DUC
233 | summarization data. The final features we obtained are distributed with the
234 | system as `tuning/params.best.txt`.
235 |
236 | The MERT tuning code itself is located in the `tuning/` directory. Our setup
237 | uses [ZMert](http://cs.jhu.edu/~ozaidan/zmert/) for this process.
238 |
239 | It should be straightforward to tune the system on any development
240 | summarization data. Take the following steps to run tuning on the
241 | DUC-2003 data set described above.
242 |
243 | First, copy the reference files to the tuning directory. For instance, to tune on DUC-2003:
244 |
245 | ln -s DUC_data/clean_2003/references/task1_ref0.txt tuning/ref.0
246 | ln -s DUC_data/clean_2003/references/task1_ref1.txt tuning/ref.1
247 | ln -s DUC_data/clean_2003/references/task1_ref2.txt tuning/ref.2
248 | ln -s DUC_data/clean_2003/references/task1_ref3.txt tuning/ref.3
249 |
250 | Next, copy the SDecoder template with `cp SDecoder_cmd.tpl SDecoder_cmd.py`
251 | and modify `SDecoder_cmd.py` to point to the model and input text.
252 |
253 | {"model" : "model.th",
254 |      "src" : "DUC_data/clean_2003/input.txt",
255 | "title_len" : 14}
256 |
257 |
258 | Now you should be able to run Z-MERT and let it do its thing.
259 |
260 | > cd tuning/; java -cp zmert/lib/zmert.jar ZMERT ZMERT_cfg.txt
261 |
262 | When Z-MERT has finished, you can run on new data using the command:
263 |
264 | > python SDecoder_test.py input.txt model.th
265 |
--------------------------------------------------------------------------------
/construct_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
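# Usage: ./construct_data.sh <agiga_dir> <working_dir>
# Requires $ABS to be set (see README) and GNU parallel on the path.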
3 | export AGIGA=$1
4 | export WORK=$2
5 | export THREADS=30
6 | export SCRIPTS=$ABS/dataset
7 | export SPLITS=$ABS/dataset
8 | export UNK=5
9 |
10 | echo "Step 1: Construct the title-article pairs from gigaword"
11 | mkdir -p $WORK
12 | find $AGIGA/???/*.xml.gz | parallel --gnu --progress -j $THREADS python2.7 $SCRIPTS/process_agiga.py \{\} $WORK
13 |
14 |
15 | echo "Step 2: Compile the data into train/dev/test."
16 | cd $WORK
17 | cat $SPLITS/train.splits | xargs cat > train.data.txt
18 | cat $SPLITS/valid.splits | xargs cat > valid.data.txt
19 | cat $SPLITS/test.splits | xargs cat > test.data.txt
20 |
21 |
22 | echo "Step 3: Basic filtering on train/dev."
23 | python2.7 $SCRIPTS/filter.py train.data.txt > train.data.filter.txt
24 | python2.7 $SCRIPTS/filter.py valid.data.txt > valid.data.filter.txt
25 |
26 |
27 | echo "Step 4: Compile dictionary."
28 | python2.7 $SCRIPTS/make_dict.py $WORK/train.data.filter.txt $WORK/train $UNK
29 |
30 |
31 | echo "Step 5: Construct title-article files."
32 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/train.data.filter.txt > $WORK/train.title.txt
33 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/train.data.filter.txt > $WORK/train.article.txt
34 |
35 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.txt > $WORK/valid.title.txt
36 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.txt > $WORK/valid.article.txt
37 |
38 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.filter.txt > $WORK/valid.title.filter.txt
39 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.filter.txt > $WORK/valid.article.filter.txt
40 |
41 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/test.data.txt > $WORK/test.title.txt
42 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/test.data.txt > $WORK/test.article.txt
43 |
44 |
45 | echo "Step 6: Constructing torch data files."
46 | bash $ABS/prep_torch_data.sh $WORK
47 |
--------------------------------------------------------------------------------
/dataset/filter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
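"""Filter noisy title-article pairs from the raw Gigaword dump.

Usage (see construct_data.sh): python filter.py train.data.txt > train.data.filter.txt
"""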
13 | import sys
14 | #@lint-avoid-python-3-compatibility-imports
15 |
16 | def get_words(parse):
17 | return [w.strip(")")
18 | for w in parse.split()
19 | if w[-1] == ')']
20 |
21 | for l in open(sys.argv[1]):
22 | splits = l.strip().split("\t")
23 | if len(splits) != 4:
24 | continue
25 | title_parse, article_parse, title, article = splits
26 | title_words = title.split()
27 | article_words = article.split()
28 |
29 | # No blanks.
30 | if any((word == "" for word in title_words)):
31 | continue
32 |
33 | if any((word == "" for word in article_words)):
34 | continue
35 |
36 | if not any((word == "." for word in article_words)):
37 | continue
38 |
39 | # Spurious words to blacklist.
40 | # First set is words that never appear in input and output
41 | # Second set is punctuation and non-title words.
42 | bad_words = ['update#', 'update', 'recasts', 'undated', 'grafs', 'corrects',
43 | 'retransmitting', 'updates', 'dateline', 'writethru',
44 | 'recaps', 'inserts', 'incorporates', 'adv##',
45 | 'ld-writethru', 'djlfx', 'edits', 'byline',
46 | 'repetition', 'background', 'thruout', 'quotes',
47 | 'attention', 'ny###', 'overline', 'embargoed', 'ap', 'gmt',
48 | 'adds', 'embargo',
49 | 'urgent', '?', ' i ', ' : ', ' - ', ' by ', '-lrb-', '-rrb-']
50 | if any((bad in title.lower()
51 | for bad in bad_words)):
52 | continue
53 |
54 | # Reasonable lengths
55 | if not (10 < len(article_words) < 100 and
56 | 3 < len(title_words) < 50):
57 | continue
58 |
59 | # Some word match.
60 | matches = len(set([w.lower() for w in title_words if len(w) > 3]) &
61 | set([w.lower() for w in article_words if len(w) > 3]))
62 | if matches < 1:
63 | continue
64 |
65 | # Okay, print.
66 | print(l.strip())
67 |
--------------------------------------------------------------------------------
/dataset/make_dict.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
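"""Build the article and title dictionaries from the filtered training data.

Usage (see construct_data.sh): python make_dict.py <train.data.filter.txt> <output prefix> <min count>
"""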
13 | import sys
14 | from collections import Counter
15 | #@lint-avoid-python-3-compatibility-imports
16 |
17 | title_words = Counter()
18 | article_words = Counter()
19 | limit = int(sys.argv[3])
20 |
21 | for l in open(sys.argv[1]):
22 | splits = l.strip().split("\t")
23 | if len(splits) != 4:
24 | continue
25 | title_parse, article_parse, title, article = l.strip().split("\t")
26 | title_words.update(title.lower().split())
27 | article_words.update(article.lower().split())
28 |
29 | with open(sys.argv[2] + ".article.dict", "w") as f:
30 |     print >>f, "<s>", 1e5
31 |     print >>f, "</s>", 1e5
32 |     print >>f, "<unk>", 1e5
33 | for word, count in article_words.most_common():
34 | if count < limit:
35 | break
36 | print >>f, word, count
37 |
38 | with open(sys.argv[2] + ".title.dict", "w") as f:
39 |     print >>f, "<s>", 1e5
40 |     print >>f, "</s>", 1e5
41 |     print >>f, "<unk>", 1e5
42 | for word, count in title_words.most_common():
43 | if count < limit:
44 | break
45 | print >>f, word, count
46 |
--------------------------------------------------------------------------------
/dataset/process_agiga.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
13 | #!/usr/bin/env python
14 |
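"""Extract title/article parse pairs from a single annotated Gigaword .xml.gz file.

Usage (see construct_data.sh): python process_agiga.py <path/to/file.xml.gz> <output dir>
"""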
15 | import sys
16 | import os
17 | import re
18 | import gzip
19 | #@lint-avoid-python-3-compatibility-imports
20 |
21 | # Make directory for output if it doesn't exist
22 |
23 | try:
24 | os.mkdir(sys.argv[2] + "/" + sys.argv[1].split("/")[-2])
25 | except OSError:
26 | pass
27 |
28 | # Strip off .gz ending
29 | end = "/".join(sys.argv[1].split("/")[-2:])[:-len(".xml.gz")] + ".txt"
30 |
31 | out = open(sys.argv[2] + end, "w")
32 |
33 | # Parse and print titles and articles
34 | NONE, HEAD, NEXT, TEXT = 0, 1, 2, 3
35 | MODE = NONE
36 | title_parse = ""
37 | article_parse = []
38 |
39 | # FIX: Some parses are mis-parenthesized.
40 | def fix_paren(parse):
41 | if len(parse) < 2:
42 | return parse
43 | if parse[0] == "(" and parse[1] == " ":
44 | return parse[2:-1]
45 | return parse
46 |
47 | def get_words(parse):
48 | words = []
49 | for w in parse.split():
50 | if w[-1] == ')':
51 | words.append(w.strip(")"))
52 | if words[-1] == ".":
53 | break
54 | return words
55 |
56 | def remove_digits(parse):
57 | return re.sub(r'\d', '#', parse)
58 |
59 | for l in gzip.open(sys.argv[1]):
60 | if MODE == HEAD:
61 | title_parse = remove_digits(fix_paren(l.strip()))
62 | MODE = NEXT
63 |
64 | if MODE == TEXT:
65 | article_parse.append(remove_digits(fix_paren(l.strip())))
66 |
67 |     if MODE == NONE and l.strip() == "<HEADLINE>":
68 | MODE = HEAD
69 |
70 |     if MODE == NEXT and l.strip() == "<TEXT>":
71 | MODE = TEXT
72 |
73 |     if MODE == TEXT and l.strip() == "</TEXT>":
74 | articles = []
75 | # Annotated gigaword has a poor sentence segmenter.
76 | # Ensure there is a least a period.
77 |
78 | for i in range(len(article_parse)):
79 | articles.append(article_parse[i])
80 | if "(. .)" in article_parse[i]:
81 | break
82 |
83 | article_parse = "(TOP " + " ".join(articles) + ")"
84 |
85 | # title_parse \t article_parse \t title \t article
86 | print >>out, "\t".join([title_parse, article_parse,
87 | " ".join(get_words(title_parse)),
88 | " ".join(get_words(article_parse))])
89 | article_parse = []
90 | MODE = NONE
91 |
--------------------------------------------------------------------------------
/dataset/pull.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
13 | """
14 | Pull out elements of the title-article file.
15 | """
16 | import sys
17 | #@lint-avoid-python-3-compatibility-imports
18 |
19 | words_dict = set([l.split()[0]
20 | for l in open(sys.argv[2])])
21 |
22 | for l in sys.stdin:
23 | splits = l.strip().split("\t")
24 | if len(splits) != 4:
25 | continue
26 | title_parse, article_parse, title, article = l.strip().split("\t")
27 | if sys.argv[1] == "src":
28 | print(article)
29 | elif sys.argv[1] == "trg":
30 | print(title)
31 | elif sys.argv[1] == "src_lc":
32 |         words = [w if w in words_dict else "<unk>"
33 | for w in article.lower().split()]
34 | print(" ".join(words))
35 | elif sys.argv[1] == "trg_lc":
36 | t = title.lower()
37 |         words = [w if w in words_dict else "<unk>"
38 | for w in t.split()
39 | if w not in ['"', "'", "''", "!", "=", "-",
40 | "--", ",", "?", ".",
41 |                      "``", "`", "-rrb-", "-lrb-", "\\/"]]
42 | print(" ".join(words))
43 | elif sys.argv[1] == "srctree":
44 | print(article_parse)
45 | elif sys.argv[1] == "interleave":
46 | # Format needed for T3
47 | print(article_parse)
48 | print(title_parse)
49 |
--------------------------------------------------------------------------------
/dataset/small_train.splits:
--------------------------------------------------------------------------------
1 | AFP/afp_eng_201004.txt
2 | AFP/afp_eng_200212.txt
3 | AFP/afp_eng_200401.txt
4 | AFP/afp_eng_199508.txt
5 | AFP/afp_eng_200610.txt
6 | AFP/afp_eng_201007.txt
7 | APW/apw_eng_200105.txt
8 | APW/apw_eng_200408.txt
9 | APW/apw_eng_201001.txt
10 | APW/apw_eng_200906.txt
11 | APW/apw_eng_200606.txt
12 | APW/apw_eng_200211.txt
13 | APW/apw_eng_200512.txt
14 | APW/apw_eng_200505.txt
15 | CNA/cna_eng_199910.txt
16 | CNA/cna_eng_199905.txt
17 | CNA/cna_eng_200905.txt
18 | CNA/cna_eng_200101.txt
19 | CNA/cna_eng_200105.txt
20 | CNA/cna_eng_200201.txt
21 | LTW/ltw_eng_199806.txt
22 | LTW/ltw_eng_200702.txt
23 | LTW/ltw_eng_200607.txt
24 | LTW/ltw_eng_200708.txt
25 | LTW/ltw_eng_200501.txt
26 | NYT/nyt_eng_200807.txt
27 | NYT/nyt_eng_200612.txt
28 | NYT/nyt_eng_199608.txt
29 | NYT/nyt_eng_200106.txt
30 | NYT/nyt_eng_200311.txt
31 | NYT/nyt_eng_200702.txt
32 | NYT/nyt_eng_201007.txt
33 | NYT/nyt_eng_200212.txt
34 | XIN/xin_eng_199506.txt
35 | XIN/xin_eng_200311.txt
36 | XIN/xin_eng_199703.txt
37 | XIN/xin_eng_200305.txt
38 | XIN/xin_eng_199808.txt
39 | XIN/xin_eng_199609.txt
40 | XIN/xin_eng_200109.txt
41 | XIN/xin_eng_200706.txt
42 |
--------------------------------------------------------------------------------
/dataset/test.splits:
--------------------------------------------------------------------------------
1 | AFP/afp_eng_199511.txt
2 | AFP/afp_eng_200606.txt
3 | AFP/afp_eng_199703.txt
4 | AFP/afp_eng_200811.txt
5 | AFP/afp_eng_199604.txt
6 | AFP/afp_eng_200704.txt
7 | AFP/afp_eng_200701.txt
8 | APW/apw_eng_200412.txt
9 | APW/apw_eng_200908.txt
10 | APW/apw_eng_199605.txt
11 | APW/apw_eng_200305.txt
12 | APW/apw_eng_200506.txt
13 | APW/apw_eng_199608.txt
14 | APW/apw_eng_199808.txt
15 | APW/apw_eng_200708.txt
16 | APW/apw_eng_199707.txt
17 | CNA/cna_eng_200910.txt
18 | CNA/cna_eng_200103.txt
19 | CNA/cna_eng_200308.txt
20 | CNA/cna_eng_200904.txt
21 | CNA/cna_eng_201012.txt
22 | CNA/cna_eng_201007.txt
23 | CNA/cna_eng_200112.txt
24 | LTW/ltw_eng_200605.txt
25 | LTW/ltw_eng_200608.txt
26 | LTW/ltw_eng_200312.txt
27 | LTW/ltw_eng_200906.txt
28 | LTW/ltw_eng_200606.txt
29 | LTW/ltw_eng_200805.txt
30 | NYT/nyt_eng_201005.txt
31 | NYT/nyt_eng_200305.txt
32 | NYT/nyt_eng_200505.txt
33 | NYT/nyt_eng_199612.txt
34 | NYT/nyt_eng_199910.txt
35 | NYT/nyt_eng_199809.txt
36 | NYT/nyt_eng_201004.txt
37 | NYT/nyt_eng_200808.txt
38 | NYT/nyt_eng_200601.txt
39 | XIN/xin_eng_199704.txt
40 | XIN/xin_eng_200310.txt
41 | XIN/xin_eng_200711.txt
42 | XIN/xin_eng_200804.txt
43 | XIN/xin_eng_200902.txt
44 | XIN/xin_eng_200106.txt
45 | XIN/xin_eng_199802.txt
46 | XIN/xin_eng_200411.txt
47 | XIN/xin_eng_200511.txt
48 |
--------------------------------------------------------------------------------
/dataset/train.splits:
--------------------------------------------------------------------------------
1 | AFP/afp_eng_200809.txt
2 | AFP/afp_eng_199412.txt
3 | AFP/afp_eng_200311.txt
4 | AFP/afp_eng_199512.txt
5 | AFP/afp_eng_200203.txt
6 | AFP/afp_eng_200204.txt
7 | AFP/afp_eng_200608.txt
8 | AFP/afp_eng_200509.txt
9 | AFP/afp_eng_200410.txt
10 | AFP/afp_eng_200405.txt
11 | AFP/afp_eng_200211.txt
12 | AFP/afp_eng_200205.txt
13 | AFP/afp_eng_199405.txt
14 | AFP/afp_eng_199510.txt
15 | AFP/afp_eng_199611.txt
16 | AFP/afp_eng_199612.txt
17 | AFP/afp_eng_200907.txt
18 | AFP/afp_eng_200412.txt
19 | AFP/afp_eng_201002.txt
20 | AFP/afp_eng_200910.txt
21 | AFP/afp_eng_199504.txt
22 | AFP/afp_eng_200207.txt
23 | AFP/afp_eng_199501.txt
24 | AFP/afp_eng_200812.txt
25 | AFP/afp_eng_200307.txt
26 | AFP/afp_eng_199608.txt
27 | AFP/afp_eng_200303.txt
28 | AFP/afp_eng_200304.txt
29 | AFP/afp_eng_199409.txt
30 | AFP/afp_eng_200202.txt
31 | AFP/afp_eng_199610.txt
32 | AFP/afp_eng_199503.txt
33 | AFP/afp_eng_200904.txt
34 | AFP/afp_eng_200212.txt
35 | AFP/afp_eng_201010.txt
36 | AFP/afp_eng_200901.txt
37 | AFP/afp_eng_200702.txt
38 | AFP/afp_eng_199609.txt
39 | AFP/afp_eng_200806.txt
40 | AFP/afp_eng_200805.txt
41 | AFP/afp_eng_200408.txt
42 | AFP/afp_eng_200611.txt
43 | AFP/afp_eng_201012.txt
44 | AFP/afp_eng_200501.txt
45 | AFP/afp_eng_200706.txt
46 | AFP/afp_eng_200505.txt
47 | AFP/afp_eng_199602.txt
48 | AFP/afp_eng_199601.txt
49 | AFP/afp_eng_200607.txt
50 | AFP/afp_eng_200404.txt
51 | AFP/afp_eng_200406.txt
52 | AFP/afp_eng_200912.txt
53 | AFP/afp_eng_200306.txt
54 | AFP/afp_eng_200312.txt
55 | AFP/afp_eng_199506.txt
56 | AFP/afp_eng_199701.txt
57 | AFP/afp_eng_199505.txt
58 | AFP/afp_eng_199606.txt
59 | AFP/afp_eng_200512.txt
60 | AFP/afp_eng_200711.txt
61 | AFP/afp_eng_200603.txt
62 | AFP/afp_eng_200504.txt
63 | AFP/afp_eng_200310.txt
64 | AFP/afp_eng_200209.txt
65 | AFP/afp_eng_199411.txt
66 | AFP/afp_eng_199509.txt
67 | AFP/afp_eng_200903.txt
68 | AFP/afp_eng_200707.txt
69 | AFP/afp_eng_200705.txt
70 | AFP/afp_eng_199603.txt
71 | AFP/afp_eng_200112.txt
72 | AFP/afp_eng_200502.txt
73 | AFP/afp_eng_200508.txt
74 | AFP/afp_eng_200403.txt
75 | AFP/afp_eng_199705.txt
76 | AFP/afp_eng_200908.txt
77 | AFP/afp_eng_200206.txt
78 | AFP/afp_eng_200906.txt
79 | AFP/afp_eng_199507.txt
80 | AFP/afp_eng_201001.txt
81 | AFP/afp_eng_199407.txt
82 | AFP/afp_eng_201004.txt
83 | AFP/afp_eng_200208.txt
84 | AFP/afp_eng_200902.txt
85 | AFP/afp_eng_200710.txt
86 | AFP/afp_eng_200503.txt
87 | AFP/afp_eng_200905.txt
88 | AFP/afp_eng_200712.txt
89 | AFP/afp_eng_200402.txt
90 | AFP/afp_eng_200807.txt
91 | AFP/afp_eng_200804.txt
92 | AFP/afp_eng_201006.txt
93 | AFP/afp_eng_200511.txt
94 | AFP/afp_eng_200802.txt
95 | AFP/afp_eng_201008.txt
96 | AFP/afp_eng_200309.txt
97 | AFP/afp_eng_200301.txt
98 | AFP/afp_eng_200612.txt
99 | AFP/afp_eng_199704.txt
100 | AFP/afp_eng_200604.txt
101 | AFP/afp_eng_199410.txt
102 | AFP/afp_eng_200911.txt
103 | AFP/afp_eng_200510.txt
104 | AFP/afp_eng_200803.txt
105 | AFP/afp_eng_201009.txt
106 | AFP/afp_eng_200810.txt
107 | AFP/afp_eng_200610.txt
108 | AFP/afp_eng_200507.txt
109 | AFP/afp_eng_200708.txt
110 | AFP/afp_eng_200201.txt
111 | AFP/afp_eng_200801.txt
112 | AFP/afp_eng_200407.txt
113 | AFP/afp_eng_200305.txt
114 | AFP/afp_eng_199408.txt
115 | AFP/afp_eng_200210.txt
116 | AFP/afp_eng_199607.txt
117 | AFP/afp_eng_201003.txt
118 | AFP/afp_eng_200605.txt
119 | AFP/afp_eng_201011.txt
120 | AFP/afp_eng_201007.txt
121 | AFP/afp_eng_200401.txt
122 | AFP/afp_eng_200602.txt
123 | AFP/afp_eng_201005.txt
124 | AFP/afp_eng_200709.txt
125 | AFP/afp_eng_200302.txt
126 | AFP/afp_eng_200909.txt
127 | AFP/afp_eng_200609.txt
128 | AFP/afp_eng_200808.txt
129 | AFP/afp_eng_200411.txt
130 | AFP/afp_eng_199508.txt
131 | AFP/afp_eng_199605.txt
132 | AFP/afp_eng_200409.txt
133 | APW/apw_eng_201001.txt
134 | APW/apw_eng_199501.txt
135 | APW/apw_eng_200307.txt
136 | APW/apw_eng_200902.txt
137 | APW/apw_eng_200303.txt
138 | APW/apw_eng_200304.txt
139 | APW/apw_eng_200503.txt
140 | APW/apw_eng_200905.txt
141 | APW/apw_eng_200111.txt
142 | APW/apw_eng_200301.txt
143 | APW/apw_eng_199712.txt
144 | APW/apw_eng_199612.txt
145 | APW/apw_eng_200011.txt
146 | APW/apw_eng_199503.txt
147 | APW/apw_eng_200106.txt
148 | APW/apw_eng_200802.txt
149 | APW/apw_eng_200007.txt
150 | APW/apw_eng_199905.txt
151 | APW/apw_eng_201009.txt
152 | APW/apw_eng_200109.txt
153 | APW/apw_eng_200612.txt
154 | APW/apw_eng_200702.txt
155 | APW/apw_eng_199609.txt
156 | APW/apw_eng_199909.txt
157 | APW/apw_eng_199702.txt
158 | APW/apw_eng_200805.txt
159 | APW/apw_eng_199902.txt
160 | APW/apw_eng_201011.txt
161 | APW/apw_eng_200107.txt
162 | APW/apw_eng_200611.txt
163 | APW/apw_eng_200904.txt
164 | APW/apw_eng_200006.txt
165 | APW/apw_eng_200505.txt
166 | APW/apw_eng_200810.txt
167 | APW/apw_eng_199801.txt
168 | APW/apw_eng_200808.txt
169 | APW/apw_eng_200607.txt
170 | APW/apw_eng_200404.txt
171 | APW/apw_eng_199803.txt
172 | APW/apw_eng_199611.txt
173 | APW/apw_eng_200406.txt
174 | APW/apw_eng_200211.txt
175 | APW/apw_eng_199911.txt
176 | APW/apw_eng_200912.txt
177 | APW/apw_eng_200809.txt
178 | APW/apw_eng_199710.txt
179 | APW/apw_eng_199907.txt
180 | APW/apw_eng_199607.txt
181 | APW/apw_eng_199506.txt
182 | APW/apw_eng_200605.txt
183 | APW/apw_eng_199502.txt
184 | APW/apw_eng_199505.txt
185 | APW/apw_eng_200811.txt
186 | APW/apw_eng_200401.txt
187 | APW/apw_eng_200602.txt
188 | APW/apw_eng_200512.txt
189 | APW/apw_eng_200711.txt
190 | APW/apw_eng_200909.txt
191 | APW/apw_eng_200201.txt
192 | APW/apw_eng_200202.txt
193 | APW/apw_eng_200103.txt
194 | APW/apw_eng_199604.txt
195 | APW/apw_eng_199508.txt
196 | APW/apw_eng_199711.txt
197 | APW/apw_eng_200310.txt
198 | APW/apw_eng_200209.txt
199 | APW/apw_eng_199809.txt
200 | APW/apw_eng_199411.txt
201 | APW/apw_eng_200003.txt
202 | APW/apw_eng_200903.txt
203 | APW/apw_eng_199903.txt
204 | APW/apw_eng_199512.txt
205 | APW/apw_eng_200104.txt
206 | APW/apw_eng_201006.txt
207 | APW/apw_eng_200005.txt
208 | APW/apw_eng_200405.txt
209 | APW/apw_eng_199906.txt
210 | APW/apw_eng_199904.txt
211 | APW/apw_eng_199510.txt
212 | APW/apw_eng_200112.txt
213 | APW/apw_eng_200508.txt
214 | APW/apw_eng_200108.txt
215 | APW/apw_eng_200403.txt
216 | APW/apw_eng_201010.txt
217 | APW/apw_eng_200906.txt
218 | APW/apw_eng_201002.txt
219 | APW/apw_eng_200910.txt
220 | APW/apw_eng_199806.txt
221 | APW/apw_eng_200806.txt
222 | APW/apw_eng_199504.txt
223 | APW/apw_eng_200207.txt
224 | APW/apw_eng_201004.txt
225 | APW/apw_eng_200208.txt
226 | APW/apw_eng_199709.txt
227 | APW/apw_eng_200812.txt
228 | APW/apw_eng_200710.txt
229 | APW/apw_eng_200410.txt
230 | APW/apw_eng_200712.txt
231 | APW/apw_eng_200001.txt
232 | APW/apw_eng_201012.txt
233 | APW/apw_eng_200402.txt
234 | APW/apw_eng_200804.txt
235 | APW/apw_eng_199610.txt
236 | APW/apw_eng_200009.txt
237 | APW/apw_eng_200511.txt
238 | APW/apw_eng_199602.txt
239 | APW/apw_eng_199601.txt
240 | APW/apw_eng_200901.txt
241 | APW/apw_eng_199704.txt
242 | APW/apw_eng_200308.txt
243 | APW/apw_eng_200604.txt
244 | APW/apw_eng_200701.txt
245 | APW/apw_eng_200704.txt
246 | APW/apw_eng_199603.txt
247 | APW/apw_eng_200408.txt
248 | APW/apw_eng_200911.txt
249 | APW/apw_eng_199511.txt
250 | APW/apw_eng_200510.txt
251 | APW/apw_eng_200803.txt
252 | APW/apw_eng_199802.txt
253 | APW/apw_eng_200501.txt
254 | APW/apw_eng_200706.txt
255 | APW/apw_eng_200610.txt
256 | APW/apw_eng_199804.txt
257 | APW/apw_eng_200507.txt
258 | APW/apw_eng_200801.txt
259 | APW/apw_eng_199908.txt
260 | APW/apw_eng_201007.txt
261 | APW/apw_eng_200601.txt
262 | APW/apw_eng_200306.txt
263 | APW/apw_eng_200407.txt
264 | APW/apw_eng_200212.txt
265 | APW/apw_eng_199910.txt
266 | APW/apw_eng_200004.txt
267 | APW/apw_eng_200312.txt
268 | APW/apw_eng_201003.txt
269 | APW/apw_eng_199701.txt
270 | APW/apw_eng_200008.txt
271 | APW/apw_eng_200012.txt
272 | APW/apw_eng_201005.txt
273 | APW/apw_eng_200709.txt
274 | APW/apw_eng_200105.txt
275 | APW/apw_eng_200302.txt
276 | APW/apw_eng_200101.txt
277 | APW/apw_eng_200609.txt
278 | APW/apw_eng_200603.txt
279 | APW/apw_eng_199901.txt
280 | APW/apw_eng_200002.txt
281 | APW/apw_eng_200504.txt
282 | APW/apw_eng_200606.txt
283 | APW/apw_eng_200409.txt
284 | APW/apw_eng_199509.txt
285 | APW/apw_eng_199412.txt
286 | APW/apw_eng_200311.txt
287 | APW/apw_eng_200203.txt
288 | APW/apw_eng_200703.txt
289 | APW/apw_eng_200707.txt
290 | APW/apw_eng_200509.txt
291 | APW/apw_eng_200102.txt
292 | APW/apw_eng_200705.txt
293 | APW/apw_eng_201008.txt
294 | APW/apw_eng_200807.txt
295 | APW/apw_eng_200502.txt
296 | APW/apw_eng_200110.txt
297 | APW/apw_eng_200010.txt
298 | APW/apw_eng_199705.txt
299 | APW/apw_eng_199706.txt
300 | APW/apw_eng_200206.txt
301 | APW/apw_eng_199703.txt
302 | APW/apw_eng_199805.txt
303 | APW/apw_eng_200411.txt
304 | APW/apw_eng_199507.txt
305 | CNA/cna_eng_200608.txt
306 | CNA/cna_eng_200906.txt
307 | CNA/cna_eng_200110.txt
308 | CNA/cna_eng_199712.txt
309 | CNA/cna_eng_200609.txt
310 | CNA/cna_eng_199903.txt
311 | CNA/cna_eng_200111.txt
312 | CNA/cna_eng_200712.txt
313 | CNA/cna_eng_200808.txt
314 | CNA/cna_eng_200006.txt
315 | CNA/cna_eng_199803.txt
316 | CNA/cna_eng_200811.txt
317 | CNA/cna_eng_200004.txt
318 | CNA/cna_eng_199906.txt
319 | CNA/cna_eng_200009.txt
320 | CNA/cna_eng_200401.txt
321 | CNA/cna_eng_200602.txt
322 | CNA/cna_eng_200802.txt
323 | CNA/cna_eng_200108.txt
324 | CNA/cna_eng_200501.txt
325 | CNA/cna_eng_200106.txt
326 | CNA/cna_eng_200203.txt
327 | CNA/cna_eng_200903.txt
328 | CNA/cna_eng_200812.txt
329 | CNA/cna_eng_200911.txt
330 | CNA/cna_eng_200505.txt
331 | CNA/cna_eng_199710.txt
332 | CNA/cna_eng_200806.txt
333 | CNA/cna_eng_200311.txt
334 | CNA/cna_eng_200507.txt
335 | CNA/cna_eng_200809.txt
336 | CNA/cna_eng_200010.txt
337 | CNA/cna_eng_200312.txt
338 | CNA/cna_eng_199802.txt
339 | CNA/cna_eng_200807.txt
340 | CNA/cna_eng_199908.txt
341 | CNA/cna_eng_200202.txt
342 | CNA/cna_eng_201002.txt
343 | CNA/cna_eng_200512.txt
344 | CNA/cna_eng_200309.txt
345 | CNA/cna_eng_200607.txt
346 | CNA/cna_eng_199711.txt
347 | CNA/cna_eng_199809.txt
348 | CNA/cna_eng_200805.txt
349 | CNA/cna_eng_200610.txt
350 | CNA/cna_eng_200109.txt
351 | CNA/cna_eng_200007.txt
352 | CNA/cna_eng_200703.txt
353 | CNA/cna_eng_200201.txt
354 | CNA/cna_eng_199904.txt
355 | CNA/cna_eng_199806.txt
356 | CNA/cna_eng_200410.txt
357 | CNA/cna_eng_200001.txt
358 | CNA/cna_eng_200709.txt
359 | CNA/cna_eng_200408.txt
360 | CNA/cna_eng_200711.txt
361 | CNA/cna_eng_200101.txt
362 | CNA/cna_eng_201003.txt
363 | CNA/cna_eng_199805.txt
364 | CNA/cna_eng_200012.txt
365 | CNA/cna_eng_199804.txt
366 | CNA/cna_eng_200907.txt
367 | CNA/cna_eng_200502.txt
368 | CNA/cna_eng_200603.txt
369 | CNA/cna_eng_199911.txt
370 | CNA/cna_eng_200902.txt
371 | CNA/cna_eng_200605.txt
372 | CNA/cna_eng_200107.txt
373 | CNA/cna_eng_200611.txt
374 | CNA/cna_eng_201008.txt
375 | CNA/cna_eng_200409.txt
376 | CNA/cna_eng_200412.txt
377 | CNA/cna_eng_200503.txt
378 | CNA/cna_eng_200005.txt
379 | CNA/cna_eng_200905.txt
380 | CNA/cna_eng_200105.txt
381 | CNA/cna_eng_199905.txt
382 | CNA/cna_eng_200511.txt
383 | CNA/cna_eng_199902.txt
384 | CNA/cna_eng_200704.txt
385 | CNA/cna_eng_200901.txt
386 | CNA/cna_eng_199808.txt
387 | CNA/cna_eng_201009.txt
388 | CNA/cna_eng_200810.txt
389 | CNA/cna_eng_201011.txt
390 | CNA/cna_eng_200708.txt
391 | CNA/cna_eng_200402.txt
392 | CNA/cna_eng_200604.txt
393 | CNA/cna_eng_201006.txt
394 | CNA/cna_eng_200008.txt
395 | CNA/cna_eng_201001.txt
396 | CNA/cna_eng_200509.txt
397 | CNA/cna_eng_200510.txt
398 | CNA/cna_eng_200405.txt
399 | CNA/cna_eng_200801.txt
400 | CNA/cna_eng_199912.txt
401 | CNA/cna_eng_200104.txt
402 | CNA/cna_eng_200307.txt
403 | CNA/cna_eng_201010.txt
404 | CNA/cna_eng_200506.txt
405 | CNA/cna_eng_200612.txt
406 | CNA/cna_eng_200706.txt
407 | CNA/cna_eng_200701.txt
408 | CNA/cna_eng_200804.txt
409 | CNA/cna_eng_199709.txt
410 | CNA/cna_eng_200411.txt
411 | CNA/cna_eng_199901.txt
412 | CNA/cna_eng_200002.txt
413 | CNA/cna_eng_200508.txt
414 | CNA/cna_eng_200310.txt
415 | CNA/cna_eng_200908.txt
416 | CNA/cna_eng_199907.txt
417 | CNA/cna_eng_200606.txt
418 | CNA/cna_eng_200601.txt
419 | CNA/cna_eng_200702.txt
420 | CNA/cna_eng_200909.txt
421 | CNA/cna_eng_199807.txt
422 | CNA/cna_eng_199909.txt
423 | CNA/cna_eng_200404.txt
424 | CNA/cna_eng_200403.txt
425 | CNA/cna_eng_200406.txt
426 | CNA/cna_eng_200707.txt
427 | CNA/cna_eng_199910.txt
428 | CNA/cna_eng_200705.txt
429 | CNA/cna_eng_200011.txt
430 | CNA/cna_eng_201004.txt
431 | CNA/cna_eng_199801.txt
432 | LTW/ltw_eng_200405.txt
433 | LTW/ltw_eng_199710.txt
434 | LTW/ltw_eng_200311.txt
435 | LTW/ltw_eng_200507.txt
436 | LTW/ltw_eng_200809.txt
437 | LTW/ltw_eng_199801.txt
438 | LTW/ltw_eng_199406.txt
439 | LTW/ltw_eng_200506.txt
440 | LTW/ltw_eng_199704.txt
441 | LTW/ltw_eng_199508.txt
442 | LTW/ltw_eng_200409.txt
443 | LTW/ltw_eng_200412.txt
444 | LTW/ltw_eng_200710.txt
445 | LTW/ltw_eng_200904.txt
446 | LTW/ltw_eng_199603.txt
447 | LTW/ltw_eng_199512.txt
448 | LTW/ltw_eng_200411.txt
449 | LTW/ltw_eng_200603.txt
450 | LTW/ltw_eng_200810.txt
451 | LTW/ltw_eng_200401.txt
452 | LTW/ltw_eng_200410.txt
453 | LTW/ltw_eng_199411.txt
454 | LTW/ltw_eng_200404.txt
455 | LTW/ltw_eng_199705.txt
456 | LTW/ltw_eng_200510.txt
457 | LTW/ltw_eng_199804.txt
458 | LTW/ltw_eng_200705.txt
459 | LTW/ltw_eng_200812.txt
460 | LTW/ltw_eng_200911.txt
461 | LTW/ltw_eng_200502.txt
462 | LTW/ltw_eng_199501.txt
463 | LTW/ltw_eng_199506.txt
464 | LTW/ltw_eng_200611.txt
465 | LTW/ltw_eng_200804.txt
466 | LTW/ltw_eng_199701.txt
467 | LTW/ltw_eng_199711.txt
468 | LTW/ltw_eng_199601.txt
469 | LTW/ltw_eng_199606.txt
470 | LTW/ltw_eng_200704.txt
471 | LTW/ltw_eng_199702.txt
472 | LTW/ltw_eng_200703.txt
473 | LTW/ltw_eng_200308.txt
474 | LTW/ltw_eng_200602.txt
475 | LTW/ltw_eng_199703.txt
476 | LTW/ltw_eng_200708.txt
477 | LTW/ltw_eng_200604.txt
478 | LTW/ltw_eng_200711.txt
479 | LTW/ltw_eng_200909.txt
480 | LTW/ltw_eng_200509.txt
481 | LTW/ltw_eng_200406.txt
482 | LTW/ltw_eng_199612.txt
483 | LTW/ltw_eng_199608.txt
484 | LTW/ltw_eng_200505.txt
485 | LTW/ltw_eng_200912.txt
486 | LTW/ltw_eng_199412.txt
487 | LTW/ltw_eng_200709.txt
488 | LTW/ltw_eng_200910.txt
489 | LTW/ltw_eng_200612.txt
490 | LTW/ltw_eng_199405.txt
491 | LTW/ltw_eng_199510.txt
492 | LTW/ltw_eng_199407.txt
493 | LTW/ltw_eng_200803.txt
494 | LTW/ltw_eng_200607.txt
495 | LTW/ltw_eng_199712.txt
496 | LTW/ltw_eng_199611.txt
497 | LTW/ltw_eng_200609.txt
498 | LTW/ltw_eng_200503.txt
499 | LTW/ltw_eng_199605.txt
500 | LTW/ltw_eng_199709.txt
501 | LTW/ltw_eng_200808.txt
502 | LTW/ltw_eng_200907.txt
503 | LTW/ltw_eng_200902.txt
504 | LTW/ltw_eng_199707.txt
505 | LTW/ltw_eng_200811.txt
506 | LTW/ltw_eng_199409.txt
507 | LTW/ltw_eng_199410.txt
508 | LTW/ltw_eng_200908.txt
509 | LTW/ltw_eng_199609.txt
510 | LTW/ltw_eng_199408.txt
511 | LTW/ltw_eng_200601.txt
512 | LTW/ltw_eng_200402.txt
513 | LTW/ltw_eng_200501.txt
514 | LTW/ltw_eng_199504.txt
515 | LTW/ltw_eng_199805.txt
516 | LTW/ltw_eng_199511.txt
517 | LTW/ltw_eng_199505.txt
518 | LTW/ltw_eng_199610.txt
519 | LTW/ltw_eng_200801.txt
520 | LTW/ltw_eng_200806.txt
521 | LTW/ltw_eng_199802.txt
522 | LTW/ltw_eng_200807.txt
523 | LTW/ltw_eng_199507.txt
524 | LTW/ltw_eng_200309.txt
525 | LTW/ltw_eng_200706.txt
526 | LTW/ltw_eng_200701.txt
527 | LTW/ltw_eng_199708.txt
528 | LTW/ltw_eng_199502.txt
529 | LTW/ltw_eng_200712.txt
530 | LTW/ltw_eng_200511.txt
531 | LTW/ltw_eng_200610.txt
532 | LTW/ltw_eng_200905.txt
533 | LTW/ltw_eng_200901.txt
534 | LTW/ltw_eng_200903.txt
535 | LTW/ltw_eng_199806.txt
536 | LTW/ltw_eng_200508.txt
537 | LTW/ltw_eng_200802.txt
538 | LTW/ltw_eng_200702.txt
539 | LTW/ltw_eng_200408.txt
540 | LTW/ltw_eng_199604.txt
541 | LTW/ltw_eng_200403.txt
542 | LTW/ltw_eng_199607.txt
543 | LTW/ltw_eng_199602.txt
544 | LTW/ltw_eng_200504.txt
545 | LTW/ltw_eng_200707.txt
546 | LTW/ltw_eng_199706.txt
547 | NYT/nyt_eng_200110.txt
548 | NYT/nyt_eng_200904.txt
549 | NYT/nyt_eng_200903.txt
550 | NYT/nyt_eng_200707.txt
551 | NYT/nyt_eng_199505.txt
552 | NYT/nyt_eng_200703.txt
553 | NYT/nyt_eng_200704.txt
554 | NYT/nyt_eng_200103.txt
555 | NYT/nyt_eng_199701.txt
556 | NYT/nyt_eng_199502.txt
557 | NYT/nyt_eng_200511.txt
558 | NYT/nyt_eng_200701.txt
559 | NYT/nyt_eng_200602.txt
560 | NYT/nyt_eng_200902.txt
561 | NYT/nyt_eng_200411.txt
562 | NYT/nyt_eng_199411.txt
563 | NYT/nyt_eng_200506.txt
564 | NYT/nyt_eng_201007.txt
565 | NYT/nyt_eng_199711.txt
566 | NYT/nyt_eng_200407.txt
567 | NYT/nyt_eng_200612.txt
568 | NYT/nyt_eng_200709.txt
569 | NYT/nyt_eng_199806.txt
570 | NYT/nyt_eng_201009.txt
571 | NYT/nyt_eng_200509.txt
572 | NYT/nyt_eng_200212.txt
573 | NYT/nyt_eng_200302.txt
574 | NYT/nyt_eng_200909.txt
575 | NYT/nyt_eng_200804.txt
576 | NYT/nyt_eng_200803.txt
577 | NYT/nyt_eng_200812.txt
578 | NYT/nyt_eng_200507.txt
579 | NYT/nyt_eng_200211.txt
580 | NYT/nyt_eng_199705.txt
581 | NYT/nyt_eng_200905.txt
582 | NYT/nyt_eng_200911.txt
583 | NYT/nyt_eng_200907.txt
584 | NYT/nyt_eng_200105.txt
585 | NYT/nyt_eng_199608.txt
586 | NYT/nyt_eng_199808.txt
587 | NYT/nyt_eng_200207.txt
588 | NYT/nyt_eng_200004.txt
589 | NYT/nyt_eng_199703.txt
590 | NYT/nyt_eng_200006.txt
591 | NYT/nyt_eng_199905.txt
592 | NYT/nyt_eng_201006.txt
593 | NYT/nyt_eng_199802.txt
594 | NYT/nyt_eng_199903.txt
595 | NYT/nyt_eng_200705.txt
596 | NYT/nyt_eng_201012.txt
597 | NYT/nyt_eng_200610.txt
598 | NYT/nyt_eng_199801.txt
599 | NYT/nyt_eng_199410.txt
600 | NYT/nyt_eng_200001.txt
601 | NYT/nyt_eng_200202.txt
602 | NYT/nyt_eng_199412.txt
603 | NYT/nyt_eng_199702.txt
604 | NYT/nyt_eng_200112.txt
605 | NYT/nyt_eng_200311.txt
606 | NYT/nyt_eng_199611.txt
607 | NYT/nyt_eng_199912.txt
608 | NYT/nyt_eng_200011.txt
609 | NYT/nyt_eng_200002.txt
610 | NYT/nyt_eng_200710.txt
611 | NYT/nyt_eng_200609.txt
612 | NYT/nyt_eng_201002.txt
613 | NYT/nyt_eng_200403.txt
614 | NYT/nyt_eng_199504.txt
615 | NYT/nyt_eng_200809.txt
616 | NYT/nyt_eng_200504.txt
617 | NYT/nyt_eng_199708.txt
618 | NYT/nyt_eng_201001.txt
619 | NYT/nyt_eng_199610.txt
620 | NYT/nyt_eng_200405.txt
621 | NYT/nyt_eng_200005.txt
622 | NYT/nyt_eng_200611.txt
623 | NYT/nyt_eng_200605.txt
624 | NYT/nyt_eng_199907.txt
625 | NYT/nyt_eng_199601.txt
626 | NYT/nyt_eng_200512.txt
627 | NYT/nyt_eng_199510.txt
628 | NYT/nyt_eng_199901.txt
629 | NYT/nyt_eng_199607.txt
630 | NYT/nyt_eng_200508.txt
631 | NYT/nyt_eng_200908.txt
632 | NYT/nyt_eng_200810.txt
633 | NYT/nyt_eng_199902.txt
634 | NYT/nyt_eng_199501.txt
635 | NYT/nyt_eng_199707.txt
636 | NYT/nyt_eng_200607.txt
637 | NYT/nyt_eng_200608.txt
638 | NYT/nyt_eng_199804.txt
639 | NYT/nyt_eng_200109.txt
640 | NYT/nyt_eng_199908.txt
641 | NYT/nyt_eng_200805.txt
642 | NYT/nyt_eng_200310.txt
643 | NYT/nyt_eng_200502.txt
644 | NYT/nyt_eng_199606.txt
645 | NYT/nyt_eng_200312.txt
646 | NYT/nyt_eng_200401.txt
647 | NYT/nyt_eng_199409.txt
648 | NYT/nyt_eng_199909.txt
649 | NYT/nyt_eng_200409.txt
650 | NYT/nyt_eng_199509.txt
651 | NYT/nyt_eng_199503.txt
652 | NYT/nyt_eng_199604.txt
653 | NYT/nyt_eng_200901.txt
654 | NYT/nyt_eng_199506.txt
655 | NYT/nyt_eng_200708.txt
656 | NYT/nyt_eng_200204.txt
657 | NYT/nyt_eng_200301.txt
658 | NYT/nyt_eng_200304.txt
659 | NYT/nyt_eng_200910.txt
660 | NYT/nyt_eng_200008.txt
661 | NYT/nyt_eng_199407.txt
662 | NYT/nyt_eng_199508.txt
663 | NYT/nyt_eng_199609.txt
664 | NYT/nyt_eng_199710.txt
665 | NYT/nyt_eng_200101.txt
666 | NYT/nyt_eng_199602.txt
667 | NYT/nyt_eng_200210.txt
668 | NYT/nyt_eng_200107.txt
669 | NYT/nyt_eng_200108.txt
670 | NYT/nyt_eng_200308.txt
671 | NYT/nyt_eng_200801.txt
672 | NYT/nyt_eng_199712.txt
673 | NYT/nyt_eng_200802.txt
674 | NYT/nyt_eng_200912.txt
675 | NYT/nyt_eng_200807.txt
676 | NYT/nyt_eng_200201.txt
677 | NYT/nyt_eng_200706.txt
678 | NYT/nyt_eng_200007.txt
679 | NYT/nyt_eng_200404.txt
680 | NYT/nyt_eng_199803.txt
681 | NYT/nyt_eng_200712.txt
682 | NYT/nyt_eng_199408.txt
683 | NYT/nyt_eng_200408.txt
684 | NYT/nyt_eng_199603.txt
685 | NYT/nyt_eng_200412.txt
686 | NYT/nyt_eng_200106.txt
687 | NYT/nyt_eng_200309.txt
688 | NYT/nyt_eng_201010.txt
689 | NYT/nyt_eng_200811.txt
690 | NYT/nyt_eng_200702.txt
691 | NYT/nyt_eng_200501.txt
692 | NYT/nyt_eng_200209.txt
693 | NYT/nyt_eng_200906.txt
694 | NYT/nyt_eng_200402.txt
695 | NYT/nyt_eng_200104.txt
696 | NYT/nyt_eng_199911.txt
697 | NYT/nyt_eng_200206.txt
698 | NYT/nyt_eng_199805.txt
699 | NYT/nyt_eng_200009.txt
700 | NYT/nyt_eng_200711.txt
701 | NYT/nyt_eng_200806.txt
702 | NYT/nyt_eng_200603.txt
703 | NYT/nyt_eng_201003.txt
704 | NYT/nyt_eng_200604.txt
705 | NYT/nyt_eng_200303.txt
706 | NYT/nyt_eng_200208.txt
707 | NYT/nyt_eng_199511.txt
708 | NYT/nyt_eng_200010.txt
709 | NYT/nyt_eng_199605.txt
710 | NYT/nyt_eng_200102.txt
711 | NYT/nyt_eng_199904.txt
712 | NYT/nyt_eng_199807.txt
713 | NYT/nyt_eng_200510.txt
714 | NYT/nyt_eng_199507.txt
715 | NYT/nyt_eng_200410.txt
716 | NYT/nyt_eng_199906.txt
717 | NYT/nyt_eng_199706.txt
718 | NYT/nyt_eng_200012.txt
719 | NYT/nyt_eng_200111.txt
720 | NYT/nyt_eng_201008.txt
721 | NYT/nyt_eng_200606.txt
722 | NYT/nyt_eng_200503.txt
723 | WPB/wpb_eng_201012.txt
724 | WPB/wpb_eng_201007.txt
725 | WPB/wpb_eng_201008.txt
726 | WPB/wpb_eng_201003.txt
727 | WPB/wpb_eng_201004.txt
728 | WPB/wpb_eng_201010.txt
729 | WPB/wpb_eng_201001.txt
730 | WPB/wpb_eng_201006.txt
731 | WPB/wpb_eng_201009.txt
732 | WPB/wpb_eng_201002.txt
733 | WPB/wpb_eng_201005.txt
734 | WPB/wpb_eng_201011.txt
735 | XIN/xin_eng_199708.txt
736 | XIN/xin_eng_200303.txt
737 | XIN/xin_eng_199701.txt
738 | XIN/xin_eng_200305.txt
739 | XIN/xin_eng_200208.txt
740 | XIN/xin_eng_200203.txt
741 | XIN/xin_eng_199807.txt
742 | XIN/xin_eng_199912.txt
743 | XIN/xin_eng_200302.txt
744 | XIN/xin_eng_201010.txt
745 | XIN/xin_eng_200612.txt
746 | XIN/xin_eng_199706.txt
747 | XIN/xin_eng_200104.txt
748 | XIN/xin_eng_200912.txt
749 | XIN/xin_eng_200412.txt
750 | XIN/xin_eng_201005.txt
751 | XIN/xin_eng_200507.txt
752 | XIN/xin_eng_199609.txt
753 | XIN/xin_eng_199910.txt
754 | XIN/xin_eng_200506.txt
755 | XIN/xin_eng_200404.txt
756 | XIN/xin_eng_200712.txt
757 | XIN/xin_eng_200401.txt
758 | XIN/xin_eng_200110.txt
759 | XIN/xin_eng_199502.txt
760 | XIN/xin_eng_200312.txt
761 | XIN/xin_eng_200005.txt
762 | XIN/xin_eng_200602.txt
763 | XIN/xin_eng_200002.txt
764 | XIN/xin_eng_199907.txt
765 | XIN/xin_eng_199608.txt
766 | XIN/xin_eng_199711.txt
767 | XIN/xin_eng_200207.txt
768 | XIN/xin_eng_201006.txt
769 | XIN/xin_eng_200710.txt
770 | XIN/xin_eng_199506.txt
771 | XIN/xin_eng_200201.txt
772 | XIN/xin_eng_200706.txt
773 | XIN/xin_eng_200909.txt
774 | XIN/xin_eng_199504.txt
775 | XIN/xin_eng_200705.txt
776 | XIN/xin_eng_200806.txt
777 | XIN/xin_eng_201003.txt
778 | XIN/xin_eng_200604.txt
779 | XIN/xin_eng_200109.txt
780 | XIN/xin_eng_199606.txt
781 | XIN/xin_eng_200410.txt
782 | XIN/xin_eng_200905.txt
783 | XIN/xin_eng_200101.txt
784 | XIN/xin_eng_199909.txt
785 | XIN/xin_eng_200105.txt
786 | XIN/xin_eng_200102.txt
787 | XIN/xin_eng_199503.txt
788 | XIN/xin_eng_200408.txt
789 | XIN/xin_eng_200107.txt
790 | XIN/xin_eng_200004.txt
791 | XIN/xin_eng_199604.txt
792 | XIN/xin_eng_199610.txt
793 | XIN/xin_eng_200606.txt
794 | XIN/xin_eng_200409.txt
795 | XIN/xin_eng_200403.txt
796 | XIN/xin_eng_200301.txt
797 | XIN/xin_eng_200608.txt
798 | XIN/xin_eng_200903.txt
799 | XIN/xin_eng_199801.txt
800 | XIN/xin_eng_199508.txt
801 | XIN/xin_eng_200502.txt
802 | XIN/xin_eng_200701.txt
803 | XIN/xin_eng_199705.txt
804 | XIN/xin_eng_199702.txt
805 | XIN/xin_eng_200111.txt
806 | XIN/xin_eng_201012.txt
807 | XIN/xin_eng_199808.txt
808 | XIN/xin_eng_199507.txt
809 | XIN/xin_eng_200509.txt
810 | XIN/xin_eng_199911.txt
811 | XIN/xin_eng_200802.txt
812 | XIN/xin_eng_200901.txt
813 | XIN/xin_eng_201009.txt
814 | XIN/xin_eng_199501.txt
815 | XIN/xin_eng_199805.txt
816 | XIN/xin_eng_200007.txt
817 | XIN/xin_eng_200309.txt
818 | XIN/xin_eng_199804.txt
819 | XIN/xin_eng_200209.txt
820 | XIN/xin_eng_200205.txt
821 | XIN/xin_eng_201001.txt
822 | XIN/xin_eng_201002.txt
823 | XIN/xin_eng_200103.txt
824 | XIN/xin_eng_199511.txt
825 | XIN/xin_eng_200210.txt
826 | XIN/xin_eng_200611.txt
827 | XIN/xin_eng_199601.txt
828 | XIN/xin_eng_199605.txt
829 | XIN/xin_eng_199602.txt
830 | XIN/xin_eng_201008.txt
831 | XIN/xin_eng_199607.txt
832 | XIN/xin_eng_199906.txt
833 | XIN/xin_eng_200508.txt
834 | XIN/xin_eng_199902.txt
835 | XIN/xin_eng_199806.txt
836 | XIN/xin_eng_200609.txt
837 | XIN/xin_eng_200009.txt
838 | XIN/xin_eng_200211.txt
839 | XIN/xin_eng_200603.txt
840 | XIN/xin_eng_199803.txt
841 | XIN/xin_eng_201004.txt
842 | XIN/xin_eng_200703.txt
843 | XIN/xin_eng_200704.txt
844 | XIN/xin_eng_200405.txt
845 | XIN/xin_eng_200010.txt
846 | XIN/xin_eng_200911.txt
847 | XIN/xin_eng_201011.txt
848 | XIN/xin_eng_199612.txt
849 | XIN/xin_eng_200501.txt
850 | XIN/xin_eng_199509.txt
851 | XIN/xin_eng_201007.txt
852 | XIN/xin_eng_200503.txt
853 | XIN/xin_eng_200003.txt
854 | XIN/xin_eng_200908.txt
855 | XIN/xin_eng_200601.txt
856 | XIN/xin_eng_200402.txt
857 | XIN/xin_eng_200012.txt
858 | XIN/xin_eng_200808.txt
859 | XIN/xin_eng_199707.txt
860 | XIN/xin_eng_199903.txt
861 | XIN/xin_eng_200803.txt
862 | XIN/xin_eng_200512.txt
863 | XIN/xin_eng_200904.txt
864 | XIN/xin_eng_200008.txt
865 | XIN/xin_eng_199505.txt
866 | XIN/xin_eng_200805.txt
867 | XIN/xin_eng_200307.txt
868 | XIN/xin_eng_199603.txt
869 | XIN/xin_eng_200001.txt
870 | XIN/xin_eng_200907.txt
871 | XIN/xin_eng_200311.txt
872 | XIN/xin_eng_200510.txt
873 | XIN/xin_eng_200906.txt
874 | XIN/xin_eng_200006.txt
875 | XIN/xin_eng_199905.txt
876 | XIN/xin_eng_199809.txt
877 | XIN/xin_eng_199512.txt
878 | XIN/xin_eng_199709.txt
879 | XIN/xin_eng_200809.txt
880 | XIN/xin_eng_200304.txt
881 | XIN/xin_eng_200308.txt
882 | XIN/xin_eng_200812.txt
883 | XIN/xin_eng_200504.txt
884 | XIN/xin_eng_200707.txt
885 | XIN/xin_eng_200810.txt
886 | XIN/xin_eng_200202.txt
887 | XIN/xin_eng_199710.txt
888 | XIN/xin_eng_200607.txt
889 | XIN/xin_eng_200605.txt
890 | XIN/xin_eng_200811.txt
891 | XIN/xin_eng_200108.txt
892 | XIN/xin_eng_200011.txt
893 | XIN/xin_eng_200708.txt
894 | XIN/xin_eng_199703.txt
895 | XIN/xin_eng_200801.txt
896 | XIN/xin_eng_200505.txt
897 | XIN/xin_eng_200709.txt
898 | XIN/xin_eng_199712.txt
899 | XIN/xin_eng_200807.txt
900 | XIN/xin_eng_200206.txt
901 | XIN/xin_eng_200204.txt
902 | XIN/xin_eng_200610.txt
903 | XIN/xin_eng_200910.txt
904 | XIN/xin_eng_199611.txt
905 |
--------------------------------------------------------------------------------
/dataset/valid.splits:
--------------------------------------------------------------------------------
1 | AFP/afp_eng_200601.txt
2 | AFP/afp_eng_199702.txt
3 | AFP/afp_eng_200506.txt
4 | AFP/afp_eng_200308.txt
5 | AFP/afp_eng_200703.txt
6 | AFP/afp_eng_199502.txt
7 | AFP/afp_eng_199406.txt
8 | APW/apw_eng_199606.txt
9 | APW/apw_eng_199807.txt
10 | APW/apw_eng_200204.txt
11 | APW/apw_eng_199708.txt
12 | APW/apw_eng_200907.txt
13 | APW/apw_eng_200309.txt
14 | APW/apw_eng_200205.txt
15 | APW/apw_eng_200210.txt
16 | APW/apw_eng_200608.txt
17 | CNA/cna_eng_200710.txt
18 | CNA/cna_eng_200912.txt
19 | CNA/cna_eng_200102.txt
20 | CNA/cna_eng_200803.txt
21 | CNA/cna_eng_200504.txt
22 | CNA/cna_eng_200003.txt
23 | CNA/cna_eng_201005.txt
24 | LTW/ltw_eng_200310.txt
25 | LTW/ltw_eng_200512.txt
26 | LTW/ltw_eng_199503.txt
27 | LTW/ltw_eng_200407.txt
28 | LTW/ltw_eng_199803.txt
29 | LTW/ltw_eng_199509.txt
30 | NYT/nyt_eng_200205.txt
31 | NYT/nyt_eng_200203.txt
32 | NYT/nyt_eng_199512.txt
33 | NYT/nyt_eng_200307.txt
34 | NYT/nyt_eng_199709.txt
35 | NYT/nyt_eng_199704.txt
36 | NYT/nyt_eng_201011.txt
37 | NYT/nyt_eng_200003.txt
38 | NYT/nyt_eng_200306.txt
39 | XIN/xin_eng_199901.txt
40 | XIN/xin_eng_200702.txt
41 | XIN/xin_eng_200407.txt
42 | XIN/xin_eng_199904.txt
43 | XIN/xin_eng_200406.txt
44 | XIN/xin_eng_200306.txt
45 | XIN/xin_eng_199510.txt
46 | XIN/xin_eng_199908.txt
47 | XIN/xin_eng_200212.txt
48 |
--------------------------------------------------------------------------------
/prep_torch_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | COUNT=5
4 | WINDOW=5
5 |
6 | DATA_DIR=$1
7 | OUT_DIR=$1/processed
8 | SCRIPTS=$ABS/summary
9 |
10 | export LUA_PATH="$LUA_PATH;$ABS/?.lua"
11 |
12 | mkdir -p $OUT_DIR
13 |
14 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.article.dict -outf $OUT_DIR/train.article.dict.torch
15 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.title.dict -outf $OUT_DIR/train.title.dict.torch
16 |
17 | echo "-- Creating data directories."
18 | mkdir -p $OUT_DIR/train/title
19 | mkdir -p $OUT_DIR/train/article
20 |
21 | mkdir -p $OUT_DIR/valid.filter/title
22 | mkdir -p $OUT_DIR/valid.filter/article
23 |
24 | cp $OUT_DIR/train.title.dict.torch $OUT_DIR/train/title/dict
25 | cp $OUT_DIR/train.article.dict.torch $OUT_DIR/train/article/dict
26 |
27 |
28 | echo "-- Build the matrices"
29 |
30 | # Share the dictionary.
31 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/valid.title.filter.txt -outTitleDirectory $OUT_DIR/valid.filter/title/ -inArticleFile $DATA_DIR/valid.article.filter.txt -outArticleDirectory $OUT_DIR/valid.filter/article/ -window $WINDOW
32 |
33 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/train.title.txt -outTitleDirectory $OUT_DIR/train/title/ -inArticleFile $DATA_DIR/train.article.txt -outArticleDirectory $OUT_DIR/train/article/ -window $WINDOW
34 |
--------------------------------------------------------------------------------
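As written, the script expects $ABS to point at the repository root and $1 (DATA_DIR) to already contain the text-side outputs of the dataset construction step: train.article.dict, train.title.dict, train.article.txt, train.title.txt, valid.article.filter.txt and valid.title.filter.txt. It writes the torch-serialized dictionaries and the bucketed word/offset/ngram matrices under $1/processed/{train,valid.filter}/{article,title}/, which is the layout train_model.sh later reads from.
--------------------------------------------------------------------------------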
/summary/beam_search.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- A beam search decoder
14 | local data = require('summary.data')
15 | local features = require('summary.features')
16 | local util = require('summary.util')
17 |
18 | local beam = {}
19 | local INF = 1e9
20 |
21 | function beam.addOpts(cmd)
22 |    cmd:option('-allowUNK', false, "Allow generating <unk>.")
23 | cmd:option('-fixedLength', true, "Produce exactly -length words.")
24 | cmd:option('-blockRepeatWords', false, "Disallow generating a word twice.")
25 | cmd:option('-lmWeight', 1.0, "Weight for main model.")
26 | cmd:option('-beamSize', 100, "Size of the beam.")
27 | cmd:option('-extractive', false, "Force fully extractive summary.")
28 | cmd:option('-abstractive', false, "Force fully abstractive summary.")
29 |    cmd:option('-recombine', false, "Use hypothesis recombination.")
30 | features.addOpts(cmd)
31 | end
32 |
33 | function beam.init(opt, mlp, aux_model, article_to_title, dict)
34 | local new_beam = {}
35 | setmetatable(new_beam, { __index = beam })
36 | new_beam.opt = opt
37 | new_beam.K = opt.beamSize
38 | new_beam.mlp = mlp
39 | new_beam.aux_model = aux_model
40 | new_beam.article_to_title = article_to_title
41 | new_beam.dict = dict
42 |
43 | -- Special Symbols.
44 |    new_beam.UNK = dict.symbol_to_index["<unk>"]
45 |    new_beam.START = dict.symbol_to_index["<s>"]
46 |    new_beam.END = dict.symbol_to_index["</s>"]
47 |
48 | return new_beam
49 | end
50 |
51 | -- Helper: convert flat index to matrix.
52 | local function flat_to_rc(v, indices, flat_index)
53 | local row = math.floor((flat_index - 1) / v:size(2)) + 1
54 | return row, indices[row][(flat_index - 1) % v:size(2) + 1]
55 | end
56 |
57 | -- Helper: find kmax of vector.
58 | local function find_k_max(pool, mat)
59 | local v = pool:forward(mat:t()):t()
60 | local orig_indices = pool.indices:t():add(1)
61 | return v:contiguous(), orig_indices
62 | end
63 |
64 | -- Use beam search to generate a summary of
65 | -- the article of length <= len.
66 | function beam:generate(article, len)
67 | local n = len
68 | local K = self.K
69 | local W = self.opt.window
70 |
71 | -- Initialize the extractive features.
72 | local feat_gen = features.init(self.opt, self.article_to_title)
73 | feat_gen:match_words(self.START, article)
74 | local F = feat_gen.num_features
75 | local FINAL_VAL = 1000
76 |
77 |    -- Initialize the charts.
78 | -- scores[i][k] is the log prob of the k'th hyp of i words.
79 |    -- hyps[i][k] contains the words in the k'th hyp at
80 |    --           word i (left padded with W <s> tokens).
81 | -- feats[i][k][f] contains the feature count of
82 | -- the f features for the k'th hyp at word i.
83 | local result = {}
84 | local scores = torch.zeros(n+1, K):float()
85 | local hyps = torch.zeros(n+1, K, W+n+1):long()
86 | local feats = torch.zeros(n+1, K, F):float()
87 | hyps:fill(self.START)
88 |
89 |    -- Initialize the used-word set.
90 | -- words_used[i][k] is a set of the words used in the i,k hyp.
91 | local words_used = {}
92 | if self.opt.blockRepeatWords then
93 | for i = 1, n + 1 do
94 | words_used[i] = {}
95 | for k = 1, K do
96 | words_used[i][k] = {}
97 | end
98 | end
99 | end
100 |
101 | -- Find k-max columns of a matrix.
102 | -- Use 2*k in case some are invalid.
103 | local pool = nn.TemporalKMaxPooling(2*K)
104 |
105 | -- Main loop of beam search.
106 | for i = 1, n do
107 | local cur_beam = hyps[i]:narrow(2, i+1, W)
108 | local cur_K = K
109 |
110 | -- (1) Score all next words for each context in the beam.
111 | -- log p(y_{i+1} | y_c, x) for all y_c
112 | local input = data.make_input(article, cur_beam, cur_K)
113 | local model_scores = self.mlp:forward(input)
114 |
115 | local out = model_scores:clone():double()
116 | out:mul(self.opt.lmWeight)
117 |
118 | -- If length limit is reached, next word must be end.
119 | local finalized = (i == n) and self.opt.fixedLength
120 | if finalized then
121 | out[{{}, self.END}]:add(FINAL_VAL)
122 | else
123 | -- Apply hard constraints.
124 | out[{{}, self.START}] = -INF
125 | if not self.opt.allowUNK then
126 | out[{{}, self.UNK}] = -INF
127 | end
128 | if self.opt.fixedLength then
129 | out[{{}, self.END}] = -INF
130 | end
131 |
132 | -- Add additional extractive features.
133 | feat_gen:add_features(out, cur_beam)
134 | end
135 |
136 | -- Only take first row when starting out.
137 | if i == 1 then
138 | cur_K = 1
139 | out = out:narrow(1, 1, 1)
140 | model_scores = model_scores:narrow(1, 1, 1)
141 | end
142 |
143 | -- Prob of summary is log p + log p(y_{i+1} | y_c, x)
144 | for k = 1, cur_K do
145 | out[k]:add(scores[i][k])
146 | end
147 |
148 | -- (2) Retain the K-best words for each hypothesis using GPU.
149 | -- This leaves a KxK matrix which we flatten to a K^2 vector.
150 | local max_scores, mat_indices = find_k_max(pool, out:cuda())
151 | local flat = max_scores:view(max_scores:size(1)
152 | * max_scores:size(2)):float()
153 |
154 |       -- (3) Construct the next hypotheses by taking the next k-best.
155 | local seen_ngram = {}
156 | for k = 1, K do
157 | for _ = 1, 100 do
158 |
159 | -- (3a) Pull the score, index, rank, and word of the
160 | -- current best in the table, and then zero it out.
161 | local score, index = flat:max(1)
162 | if finalized then
163 | score[1] = score[1] - FINAL_VAL
164 | end
165 | scores[i+1][k] = score[1]
166 | local prev_k, y_i1 = flat_to_rc(max_scores, mat_indices, index[1])
167 | flat[index[1]] = -INF
168 |
169 | -- (3b) Is this a valid next word?
170 | local blocked = (self.opt.blockRepeatWords and
171 | words_used[i][prev_k][y_i1])
172 |
173 | blocked = blocked or
174 | (self.opt.extractive and not feat_gen:has_ngram({y_i1}))
175 | blocked = blocked or
176 | (self.opt.abstractive and feat_gen:has_ngram({y_i1}))
177 |
178 | -- Hypothesis recombination.
179 | local new_context = {}
180 | if self.opt.recombine then
181 | for j = i+2, i+W do
182 | table.insert(new_context, hyps[i][prev_k][j])
183 | end
184 | table.insert(new_context, y_i1)
185 | blocked = blocked or util.has(seen_ngram, new_context)
186 | end
187 |
188 | -- (3c) Add the word, its score, and its features to the
189 | -- beam.
190 | if not blocked then
191 | -- Update tables with new hypothesis.
192 | for j = 1, i+W do
193 | local pword = hyps[i][prev_k][j]
194 | hyps[i+1][k][j] = pword
195 | words_used[i+1][k][pword] = true
196 | end
197 | hyps[i+1][k][i+W+1] = y_i1
198 | words_used[i+1][k][y_i1] = true
199 |
200 | -- Keep track of hypotheses seen.
201 | if self.opt.recombine then
202 | util.add(seen_ngram, new_context)
203 | end
204 |
205 | -- Keep track of features used (For MERT)
206 | feats[i+1][k]:copy(feats[i][prev_k])
207 | feat_gen:compute(feats[i+1][k], hyps[i+1][k],
208 | model_scores[prev_k][y_i1], y_i1, i)
209 |
210 | -- If we have produced an END symbol, push to stack.
211 | if y_i1 == self.END then
212 | table.insert(result, {i+1, scores[i+1][k],
213 | hyps[i+1][k]:clone(),
214 | feats[i+1][k]:clone()})
215 | scores[i+1][k] = -INF
216 | end
217 | break
218 | end
219 | end
220 | end
221 | end
222 |
223 | -- Sort by score.
224 | table.sort(result, function (a, b) return a[2] > b[2] end)
225 |
226 | -- Return the scores and hypotheses at the final stage.
227 | return result
228 | end
229 |
230 |
231 | return beam
232 |
--------------------------------------------------------------------------------
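A minimal decoding sketch, mirroring how summary/run.lua drives this module; mlp, encoder_model, article_to_title and title_dict are assumed to come from a trained nnlm checkpoint:

   local beam = require('summary.beam_search')
   -- opt must carry the flags added by beam.addOpts(cmd) plus -window.
   local sbeam = beam.init(opt, mlp, encoder_model, article_to_title, title_dict)
   -- article: 1D tensor of article word ids (already padded and mapped to the
   -- article dictionary); 15: maximum summary length in words.
   local results = sbeam:generate(article, 15)
   -- Each entry is {length, score, hypothesis, feature counts}, sorted by
   -- score, so results[1] holds the best hypothesis.
--------------------------------------------------------------------------------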
/summary/build.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- Script to build the dataset
14 | require('torch')
15 | local utils = require('summary/util')
16 |
17 | torch.setdefaulttensortype('torch.LongTensor')
18 |
19 | cmd = torch.CmdLine()
20 | cmd:text()
21 | cmd:text()
22 | cmd:text('Build torch serialized version of a summarization problem.')
23 | cmd:text()
24 |
25 | cmd:option('-window', 5, 'The ngram window to use.')
26 |
27 | cmd:option('-inTitleFile', '', 'The input title file.')
28 | cmd:option('-inTitleDictionary', '', 'The input title dictionary.')
29 | cmd:option('-outTitleDirectory', '', 'The output title directory.')
30 | cmd:option('-inArticleFile', '', 'The input article file.')
31 | cmd:option('-inArticleDictionary', '', 'The input article dictionary.')
32 | cmd:option('-outArticleDirectory', '', 'The output article directory.')
33 |
34 | opt = cmd:parse(arg)
35 |
36 | local function count(file, aligned_lengths, pad)
37 | -- Count up properties of the input file.
38 | local f = io.open(file, 'r')
39 | local counter = {
40 | nsents = 0,
41 | max_length = 0,
42 | aligned_lengths = {},
43 | line_lengths = {},
44 | bucket_words = {}}
45 | local nline = 1
46 | for l in f:lines() do
47 | local true_l = l
48 | if pad then
49 |          true_l = "<s> <s> <s> " .. l .. " </s> </s> </s>"
50 | end
51 | local line = utils.string_split(true_l, " ")
52 | counter.line_lengths[#line] = (counter.line_lengths[#line] or 0) + 1
53 | counter.nsents = counter.nsents + 1
54 | counter.aligned_lengths[nline] = #line
55 | if aligned_lengths ~= nil then
56 |          -- Add extra for implicit </s>.
57 | counter.bucket_words[aligned_lengths[nline]] =
58 | (counter.bucket_words[aligned_lengths[nline]] or 0)
59 | + #line + 1
60 | end
61 | nline = nline + 1
62 | end
63 | return counter
64 | end
65 |
66 |
67 | local function build_article_matrices(dict, file, nsents, line_lengths)
68 | -- For each length bucket, construct a #sentence x length matrix
69 | -- of word forms.
70 | local f = io.open(file, 'r')
71 |
72 | -- One matrix for each length.
73 | local mat = {}
74 |
75 | -- Number of sentences seen of this length.
76 | local of_length = {}
77 |
78 | for length, count in pairs(line_lengths) do
79 | mat[length] = torch.zeros(count, length):long()
80 | of_length[length] = 1
81 | end
82 |
83 | -- For each sentence.
84 | -- Col 1 is its length bin.
85 | -- Col 2 is its position in bin.
86 | local pos = torch.zeros(nsents, 2):long()
87 |
88 | local nsent = 1
89 | for l in f:lines() do
90 |       local true_l = "<s> <s> <s> " .. l .. " </s> </s> </s>"
91 | local line = utils.string_split(true_l, " ")
92 | local length = #line
93 | local nbin = of_length[length]
94 | for j = 1, #line do
95 | local index = dict.symbol_to_index[line[j]] or 1
96 | --assert(index ~= nil)
97 | mat[length][nbin][j] = index
98 | end
99 | pos[nsent][1] = length
100 | pos[nsent][2] = nbin
101 | of_length[length] = nbin + 1
102 | nsent = nsent + 1
103 | end
104 | return mat, pos
105 | end
106 |
107 |
108 | local function build_title_matrices(dict, file, aligned_lengths,
109 | bucket_sizes, window)
110 | -- For each article length bucket construct a num-words x 1 flat vector
111 | -- of word forms and a corresponding num-words x window matrix of
112 | -- context forms.
113 | local nsent = 1
114 | local pos = {}
115 |
116 | -- One matrix for each length.
117 | local mat = {}
118 | local ngram = {}
119 |
120 | -- Number of sentences seen of this length.
121 | local sent_of_length = {}
122 | local words_of_length = {}
123 |
124 | -- Initialize.
125 | for length, count in pairs(bucket_sizes) do
126 | mat[length] = torch.zeros(count, 3):long()
127 | sent_of_length[length] = 1
128 | words_of_length[length] = 1
129 | ngram[length] = torch.zeros(count, window):long()
130 | end
131 |
132 | -- Columns are the preceding window.
133 | local nline = 1
134 | local f = io.open(file, 'r')
135 | for l in f:lines() do
136 |       -- Add implicit </s>.
137 |       local true_l = l .. " </s>"
138 | local line = utils.string_split(true_l, " ")
139 |
140 | local last = {}
141 | -- Initialize window as START symbol.
142 | for w = 1, window do
143 |          table.insert(last, dict.symbol_to_index["<s>"])
144 | end
145 |
146 | local aligned_length = aligned_lengths[nline]
147 | for j = 1, #line do
148 | local nword = words_of_length[aligned_length]
149 | local index = dict.symbol_to_index[line[j]] or 1
150 |
151 | mat[aligned_length][nword][1] = index
152 | mat[aligned_length][nword][2] = sent_of_length[aligned_length]
153 | mat[aligned_length][nword][3] = j
154 |
155 | -- Move the window forward.
156 | for w = 1, window-1 do
157 | ngram[aligned_length][nword][w] = last[w]
158 | last[w] = last[w+1]
159 | end
160 | ngram[aligned_length][nword][window] = last[window]
161 | last[window] = index
162 | words_of_length[aligned_length] = words_of_length[aligned_length] + 1
163 | end
164 | sent_of_length[aligned_length] = sent_of_length[aligned_length] + 1
165 | nsent = nsent + 1
166 |
167 | -- Debug logging.
168 | if nsent % 100000 == 1 then
169 | print(nsent)
170 | end
171 | nline = nline + 1
172 | end
173 | return mat, pos, ngram
174 | end
175 |
176 | local function main()
177 | local counter = count(opt.inArticleFile, nil, true)
178 | local dict = torch.load(opt.inArticleDictionary)
179 |
180 | -- Construct a rectangular word matrix.
181 | local word_mat, offset_mat =
182 | build_article_matrices(dict, opt.inArticleFile,
183 | counter.nsents, counter.line_lengths)
184 | torch.save(opt.outArticleDirectory .. '/word.mat.torch', word_mat)
185 | torch.save(opt.outArticleDirectory .. '/offset.mat.torch', offset_mat)
186 |
187 | local title_counter = count(opt.inTitleFile, counter.aligned_lengths, false)
188 | local title_dict = torch.load(opt.inTitleDictionary)
189 |
190 | -- Construct a 1d word matrix.
191 | local word_mat, offset_mat, ngram_mat =
192 | build_title_matrices(title_dict,
193 | opt.inTitleFile,
194 | counter.aligned_lengths,
195 | title_counter.bucket_words,
196 | opt.window)
197 | torch.save(opt.outTitleDirectory .. '/word.mat.torch', word_mat)
198 | torch.save(opt.outTitleDirectory .. '/offset.mat.torch', offset_mat)
199 | torch.save(opt.outTitleDirectory .. '/ngram.mat.torch', ngram_mat)
200 | end
201 |
202 | main()
203 |
--------------------------------------------------------------------------------
/summary/build_dict.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- Script to build the dictionary
14 | local utils = require('summary/util')
15 |
16 | cmd = torch.CmdLine()
17 | cmd:text()
18 | cmd:text()
19 | cmd:text('Build torch serialized version of a dictionary file.')
20 | cmd:text()
21 | cmd:text('Options')
22 | cmd:option('-inf', '', 'The input dictionary text file.')
23 | cmd:option('-outf', '', 'The output serialized dictionary file.')
24 | cmd:text()
25 |
26 | opt = cmd:parse(arg)
27 |
28 | local f = io.open(opt.inf, 'r')
29 | local word_id = 0
30 | local dict = {symbol_to_index = {},
31 | index_to_symbol = {}}
32 | for l in f:lines() do
33 | word_id = word_id + 1
34 | local word = utils.string_split(l)[1]
35 | dict.symbol_to_index[word] = word_id
36 | dict.index_to_symbol[word_id] = word
37 | end
38 | torch.save(opt.outf, dict)
39 |
--------------------------------------------------------------------------------
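The input is a plain-text dictionary with one entry per line; only the first whitespace-separated token on each line is used as the word (any further columns, such as counts, are ignored), and ids are assigned in file order starting at 1. A small sketch with made-up entries:

   -- train.title.dict (input)      resulting torch table (output)
   -- <unk>  1000000                dict.symbol_to_index["<unk>"]  == 1
   -- <s>     500000                dict.index_to_symbol[2]        == "<s>"
   -- police   12345                dict.symbol_to_index["police"] == 3
--------------------------------------------------------------------------------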
/summary/data.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- Load data for summary experiments.
14 | local util = require('summary/util')
15 |
16 | local data = {}
17 |
18 | function data.add_opts(cmd)
19 | cmd:option('-articleDir', '',
20 | 'Directory containing article training matrices.')
21 | cmd:option('-titleDir', '',
22 | 'Directory containing title training matrices.')
23 | cmd:option('-validArticleDir', '',
24 |              'Directory containing article matrices for validation.')
25 | cmd:option('-validTitleDir', '',
26 | 'Directory containing title matrices for validation.')
27 | end
28 |
29 | function data.load(article_dir, title_dir)
30 | return data.init()
31 | end
32 |
33 | function data.init(title_data, article_data)
34 | local new_data = {}
35 | setmetatable(new_data, { __index = data })
36 | new_data.title_data = title_data
37 | new_data.article_data = article_data
38 | new_data:reset()
39 | return new_data
40 | end
41 |
42 | function data:reset()
43 | self.bucket_order = {}
44 | for length, _ in pairs(self.title_data.target) do
45 | table.insert(self.bucket_order, length)
46 | end
47 | util.shuffleTable(self.bucket_order)
48 | self.bucket_index = 0
49 | self:load_next_bucket()
50 | end
51 |
52 | function data:load_next_bucket()
53 | self.done_bucket = false
54 | self.bucket_index = self.bucket_index + 1
55 | self.bucket = self.bucket_order[self.bucket_index]
56 | self.bucket_size = self.title_data.target[self.bucket]:size(1)
57 | self.pos = 1
58 | self.aux_ptrs = self.title_data.sentences[self.bucket]:float():long()
59 | self.positions = torch.range(1, self.bucket):view(1, self.bucket)
60 | :expand(1000, self.bucket):contiguous():cuda() + (200 * self.bucket)
61 | end
62 |
63 | function data:is_done()
64 | return self.bucket_index >= #self.bucket_order - 1 and
65 | self.done_bucket
66 | end
67 |
68 | function data:next_batch(max_size)
69 | local diff = self.bucket_size - self.pos
70 | if self.done_bucket or diff == 0 or diff == 1 then
71 | self:load_next_bucket()
72 | end
73 | local offset
74 | if self.pos + max_size > self.bucket_size then
75 | offset = self.bucket_size - self.pos
76 | self.done_bucket = true
77 | else
78 | offset = max_size
79 | end
80 | local positions = self.positions:narrow(1, 1, offset)
81 |
82 | local aux_rows = self.article_data.words[self.bucket]:
83 | index(1, self.aux_ptrs:narrow(1, self.pos, offset))
84 | local context = self.title_data.ngram[self.bucket]
85 | :narrow(1, self.pos, offset)
86 | local target = self.title_data.target[self.bucket]
87 | :narrow(1, self.pos, offset)
88 | self.pos = self.pos + offset
89 | return {aux_rows, positions, context}, target
90 | end
91 |
92 | function data.make_input(article, context, K)
93 | local bucket = article:size(1)
94 | local aux_sentence = article:view(bucket, 1)
95 | :expand(article:size(1), K):t():contiguous():cuda()
96 | local positions = torch.range(1, bucket):view(bucket, 1)
97 | :expand(bucket, K):t():contiguous():cuda() + (200 * bucket)
98 | return {aux_sentence, positions, context}
99 | end
100 |
101 | function data.load_title_dict(dname)
102 | return torch.load(dname .. 'dict')
103 | end
104 |
105 | function data.load_title(dname, shuffle, use_dict)
106 | local ngram = torch.load(dname .. 'ngram.mat.torch')
107 | local words = torch.load(dname .. 'word.mat.torch')
108 | local dict = use_dict or torch.load(dname .. 'dict')
109 | local target_full = {}
110 | local sentences_full = {}
111 | local pos_full = {}
112 | for length, mat in pairs(ngram) do
113 | if shuffle ~= nil then
114 | local perm = torch.randperm(ngram[length]:size(1)):long()
115 | ngram[length] = ngram[length]:index(1, perm):float():cuda()
116 | words[length] = words[length]:index(1, perm)
117 | else
118 | ngram[length] = ngram[length]:float():cuda()
119 | end
120 | assert(ngram[length]:size(1) == words[length]:size(1))
121 | target_full[length] = words[length][{{}, 1}]:contiguous():float():cuda()
122 | sentences_full[length] =
123 | words[length][{{}, 2}]:contiguous():float():cuda()
124 | pos_full[length] = words[length][{{}, 3}]
125 |
126 | end
127 | local title_data = {ngram = ngram,
128 | target = target_full,
129 | sentences = sentences_full,
130 | pos = pos_full,
131 | dict = dict}
132 | return title_data
133 | end
134 |
135 | function data.load_article(dname, use_dict)
136 | local input_words = torch.load(dname .. 'word.mat.torch')
137 | -- local offsets = torch.load(dname .. 'offset.mat.torch')
138 |
139 | local dict = use_dict or torch.load(dname .. 'dict')
140 | for length, mat in pairs(input_words) do
141 | input_words[length] = mat
142 | input_words[length] = input_words[length]:float():cuda()
143 | end
144 | local article_data = {words = input_words, dict = dict}
145 | return article_data
146 | end
147 |
148 | return data
149 |
--------------------------------------------------------------------------------
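A condensed sketch of how summary/train.lua and nnlm.lua consume this module, assuming the matrix directories were produced by prep_torch_data.sh:

   local data = require('summary.data')
   local title_data   = data.load_title(opt.titleDir, true)   -- shuffled within buckets
   local article_data = data.load_article(opt.articleDir)
   local train_data   = data.init(title_data, article_data)

   train_data:reset()
   while not train_data:is_done() do
      -- input = {article rows, position ids, ngram context}; target = next-word ids.
      local input, target = train_data:next_batch(opt.miniBatchSize)
      -- forward/backward pass goes here (see nnlm:train).
   end
--------------------------------------------------------------------------------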
/summary/encoder.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- require('fbcunn')
14 |
15 | local encoder = {}
16 |
17 | function encoder.add_opts(cmd)
18 | cmd:option('-encoderModel', 'bow', "The encoder model to use.")
19 | cmd:option('-bowDim', 50, "Article embedding size.")
20 | cmd:option('-attenPool', 5, "Attention model pooling size.")
21 | cmd:option('-hiddenUnits', 1000, "Conv net encoder hidden units.")
22 | cmd:option('-kernelWidth', 5, "Conv net encoder kernel width.")
23 | end
24 |
25 |
26 | function encoder.build(opt, data)
27 | torch.setdefaulttensortype("torch.CudaTensor")
28 | local model = nil
29 | if opt.encoderModel == "none" then
30 | model = encoder.build_blank_model(opt, data)
31 | elseif opt.encoderModel == "bow" then
32 | model = encoder.build_bow_model(opt, data)
33 | elseif opt.encoderModel == "attenbow" then
34 | model = encoder.build_attnbow_model(opt, data)
35 | elseif opt.encoderModel == "conv" then
36 | model = encoder.build_conv_model(opt, data)
37 | end
38 | torch.setdefaulttensortype("torch.DoubleTensor")
39 | return model
40 | end
41 |
42 |
43 | function encoder.build_blank_model(opt, data)
44 | -- Ignores the article layer entirely (acts like LM).
45 | local lookup = nn.Identity()()
46 | local ignore1 = nn.Identity()()
47 | local ignore2 = nn.Identity()()
48 | local start = nn.SelectTable(3)({lookup, ignore1, ignore2})
49 |
50 | local mout = nn.MulConstant(0)(start)
51 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout})
52 | encoder_mlp:cuda()
53 | return encoder_mlp
54 | end
55 |
56 |
57 | function encoder.build_bow_model(opt, data)
58 | print("Encoder model: Bag-of-Words")
59 |
60 | -- BOW with mean on article.
61 | local lookup = nn.LookupTable(
62 | #data.article_data.dict.index_to_symbol,
63 | opt.bowDim)()
64 |
65 | -- Ignore the context.
66 | local ignore1 = nn.Identity()()
67 | local ignore2 = nn.Identity()()
68 |
69 | -- Ignores the context and position input.
70 | local start = nn.SelectTable(1)({lookup, ignore1, ignore2})
71 | local mout = nn.Linear(opt.bowDim, opt.bowDim)(
72 | nn.Mean(3)(nn.Transpose({2, 3})(start)))
73 |
74 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout})
75 | encoder_mlp:cuda()
76 |
77 | return encoder_mlp
78 | end
79 |
80 |
81 | function encoder.build_conv_model(opt, data)
82 | -- Three layer thin convolutional architecture.
83 | print("Encoder model: Conv")
84 | local V2 = #data.article_data.dict.index_to_symbol
85 | local nhid = opt.hiddenUnits
86 |
87 | -- Article embedding.
88 | local article_lookup = nn.LookupTable(V2, nhid)()
89 |
90 | -- Ignore the context.
91 | local ignore1 = nn.Identity()()
92 | local ignore2 = nn.Identity()()
93 | local start = nn.SelectTable(1)({article_lookup, ignore1, ignore2})
94 | local kwidth = opt.kernelWidth
95 | local model = nn.Sequential()
96 | model:add(nn.View(1, -1, nhid):setNumInputDims(2))
97 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0))
98 | model:add(cudnn.SpatialMaxPooling(1, 2, 1, 2))
99 | model:add(nn.Threshold())
100 | model:add(nn.Transpose({2,4}))
101 |
102 | -- layer 2
103 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0))
104 | model:add(nn.Threshold())
105 | model:add(nn.Transpose({2,4}))
106 |
107 | -- layer 3
108 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0))
109 | model:add(nn.View(nhid, -1):setNumInputDims(3))
110 | model:add(nn.Max(3))
111 | local done = nn.View(opt.hiddenUnits)(model(start))
112 |
113 | local mout = nn.Linear(opt.hiddenUnits, opt.embeddingDim)(done)
114 |
115 | local encoder_mlp = nn.gModule({article_lookup, ignore1, ignore2}, {mout})
116 | encoder_mlp.lookup = article_lookup.data.module
117 | encoder_mlp:cuda()
118 | return encoder_mlp
119 | end
120 |
121 |
122 | function encoder.build_attnbow_model(opt, data)
123 | print("Encoder model: BoW + Attention")
124 |
125 | local D2 = opt.bowDim
126 | local N = opt.window
127 | local V = #data.title_data.dict.index_to_symbol
128 | local V2 = #data.article_data.dict.index_to_symbol
129 |
130 | -- Article Embedding.
131 | local article_lookup = nn.LookupTable(V2, D2)()
132 |
133 | -- Title Embedding.
134 | local title_lookup = nn.LookupTable(V, D2)()
135 |
136 | -- Size Lookup
137 | local size_lookup = nn.Identity()()
138 |
139 | -- Ignore size lookup to make NNGraph happy.
140 | local article_context = nn.SelectTable(1)({article_lookup, size_lookup})
141 |
142 | -- Pool article
143 | local pad = (opt.attenPool - 1) / 2
144 | local article_match = article_context
145 |
146 | -- Title context embedding.
147 | local title_context = nn.View(D2, 1)(
148 | nn.Linear(N * D2, D2)(nn.View(N * D2)(title_lookup)))
149 |
150 | -- Attention layer. Distribution over article.
151 | local dot_article_context = nn.MM()({article_match,
152 | title_context})
153 |
154 | -- Compute the attention distribution.
155 | local non_linearity = nn.SoftMax()
156 | local attention = non_linearity(nn.Sum(3)(dot_article_context))
157 |
158 | local process_article =
159 | nn.Sum(2)(nn.SpatialSubSampling(1, 1, opt.attenPool)(
160 | nn.SpatialZeroPadding(0, 0, pad, pad)(
161 | nn.View(1, -1, D2):setNumInputDims(2)(article_context))))
162 |
163 | -- Apply attention to the subsampled article.
164 | local mout = nn.Linear(D2, D2)(
165 | nn.Sum(3)(nn.MM(true, false)(
166 | {process_article,
167 | nn.View(-1, 1):setNumInputDims(1)(attention)})))
168 |
169 | -- Apply attention
170 | local encoder_mlp = nn.gModule({article_lookup, size_lookup, title_lookup},
171 | {mout})
172 |
173 | encoder_mlp:cuda()
174 | encoder_mlp.lookup = article_lookup.data.module
175 | encoder_mlp.title_lookup = title_lookup.data.module
176 | return encoder_mlp
177 | end
178 |
179 | return encoder
180 |
--------------------------------------------------------------------------------
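All of these encoders are built by encoder.build according to -encoderModel (none, bow, attenbow or conv) and share the same nngraph calling convention: they take the triple {article word ids, position ids, title ngram context} assembled by data.lua and return an encoder representation that nnlm.lua joins with its hidden layer before the final softmax.
--------------------------------------------------------------------------------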
/summary/features.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | local util = require('summary.util')
14 |
15 | local features = {}
16 |
17 | function features.addOpts(cmd)
18 | cmd:option('-lmWeight', 1.0, "Feature weight for the neural model.")
19 | cmd:option('-unigramBonus', 0.0, "Feature weight for unigram extraction.")
20 | cmd:option('-bigramBonus', 0.0, "Feature weight for bigram extraction.")
21 | cmd:option('-trigramBonus', 0.0, "Feature weight for trigram extraction.")
22 | cmd:option('-lengthBonus', 0.0, "Feature weight for length.")
23 | cmd:option('-unorderBonus', 0.0, "Feature weight for out-of-order.")
24 | end
25 |
26 | -- Feature positions.
27 | local NNLM = 1
28 | local UNI = 2
29 | local BI = 3
30 | local TRI = 4
31 | local OO = 5
32 | local LEN = 6
33 |
34 | local kFeat = 6
35 |
36 | function features.init(opt, article_to_title)
37 | local new_features = {}
38 | setmetatable(new_features, { __index = features })
39 | new_features.opt = opt
40 | new_features.num_features = kFeat
41 | new_features.article_to_title = article_to_title
42 | return new_features
43 | end
44 |
45 | -- Helper: Are words in article.
46 | function features:has_ngram(words)
47 | return util.has(self.ngrams[#words], words)
48 | end
49 |
50 | -- Augment the feature count based on the new word.
51 | function features:compute(f_new, hyp, out_score, y_i1, i)
52 | local W = self.opt.window
53 |
54 | -- LM Features.
55 | f_new[NNLM] = f_new[NNLM] + out_score
56 |
57 | if self:has_ngram({y_i1}) then
58 | f_new[UNI] = f_new[UNI] + 1
59 | end
60 |
61 | if self:has_ngram({hyp[i+W], y_i1}) then
62 | f_new[BI] = f_new[BI] + 1
63 | end
64 |
65 | if self:has_ngram({hyp[i+W-1], hyp[i+W], y_i1}) then
66 | f_new[TRI] = f_new[TRI] + 1
67 | end
68 |
69 | if self.ooordered_ngram[hyp[i+W]] ~= nil and
70 | self.ooordered_ngram[hyp[i+W]][y_i1] ~= nil then
71 | f_new[OO] = f_new[OO] + 1
72 | end
73 |
74 | -- Length
75 | f_new[LEN] = f_new[LEN] + 1
76 | end
77 |
78 | -- Augment the score based on the extractive feature values.
79 | function features:add_features(out, beam)
80 | local W = self.opt.window
81 | for k = 1, beam:size(1) do
82 |
83 | -- Exact unigram matches.
84 | for s, _ in pairs(self.ngrams[1]) do
85 | out[k][s] = out[k][s] + self.opt.unigramBonus
86 | end
87 |
88 | -- Exact bigram matches.
89 | if self.ngrams[2][beam[k][W]] ~= nil then
90 | for s, _ in pairs(self.ngrams[2][beam[k][W]]) do
91 | out[k][s] = out[k][s] + self.opt.bigramBonus
92 | end
93 | end
94 |
95 | -- Exact trigram matches.
96 | if self.ngrams[3][beam[k][W-1]] ~= nil and
97 | self.ngrams[3][beam[k][W-1]][beam[k][W]] then
98 | for s, _ in pairs(self.ngrams[3][beam[k][W-1]][beam[k][W]]) do
99 | out[k][s] = out[k][s] + self.opt.trigramBonus
100 | end
101 | end
102 |
103 | if self.ooordered_ngram[beam[k][W]] ~= nil then
104 | for s, _ in pairs(self.ooordered_ngram[beam[k][W]]) do
105 | out[k][s] = out[k][s] + self.opt.unorderBonus
106 | end
107 | end
108 | end
109 | out:add(self.opt.lengthBonus)
110 | end
111 |
112 | -- Precompute extractive table based on the input article.
113 | function features:match_words(START, article)
114 | self.ooordered_ngram = {}
115 | local ordered_ngram = {}
116 | self.ngrams = {{}, {}, {}}
117 | local hist = {START, START, START, START}
118 |
119 | for j = 1, article:size(1) do
120 | local tw = self.article_to_title[article[j]]
121 |
122 | -- Does the current word exist in title dict.
123 | if tw ~= nil then
124 | for j2 = 1, j do
125 | local tw2 = self.article_to_title[article[j2]]
126 | if tw2 ~= nil then
127 | util.add(ordered_ngram, {tw2, tw})
128 | if not util.has(ordered_ngram, {tw, tw2}) then
129 | util.add(self.ooordered_ngram, {tw, tw2})
130 | end
131 | end
132 | end
133 |
134 | util.add(self.ngrams[1], {tw})
135 | util.add(self.ngrams[2], {hist[3], tw})
136 | util.add(self.ngrams[3], {hist[2], hist[3], tw})
137 | end
138 |
139 | -- Advance window.
140 | for k = 2, 4 do
141 | hist[k-1] = hist[k]
142 | end
143 | hist[4] = tw
144 | end
145 | end
146 |
147 | return features
148 |
--------------------------------------------------------------------------------
/summary/nnlm.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- Ngram neural language model with auxiliary model
14 | require('nn')
15 | require('nngraph')
16 | require('fbnn')
17 | require('cunn')
18 | require('sys')
19 | local utils = require('summary.util')
20 |
21 | local nnlm = {}
22 |
23 | function nnlm.addOpts()
24 | cmd:option('-epochs', 5, "Number of epochs to train.")
25 | cmd:option('-miniBatchSize', 64, "Size of training minibatch.")
26 | cmd:option('-printEvery', 10000, "How often to print during training.")
27 |    cmd:option('-modelFilename', '', "File for saving/loading the model.")
28 | cmd:option('-window', 5, "Size of NNLM window.")
29 | cmd:option('-embeddingDim', 50, "Size of NNLM embeddings.")
30 |    cmd:option('-hiddenSize', 100, "Size of NNLM hidden layer.")
31 | cmd:option('-learningRate', 0.1, "SGD learning rate.")
32 | end
33 |
34 |
35 | function nnlm.create_lm(opt, dict, encoder, encoder_size, encoder_dict)
36 | local new_mlp = {}
37 | setmetatable(new_mlp, { __index = nnlm })
38 | new_mlp.opt = opt
39 | new_mlp.dict = dict
40 | new_mlp.encoder_dict = encoder_dict
41 | new_mlp.encoder_model = encoder
42 | new_mlp.window = opt.window
43 | if encoder ~= nil then
44 | new_mlp:build_mlp(encoder, encoder_size)
45 | end
46 | return new_mlp
47 | end
48 |
49 |
50 | function nnlm:build_mlp(encoder, encoder_size)
51 | -- Set constants
52 | local D = self.opt.embeddingDim
53 | local N = self.opt.window
54 | local H = self.opt.hiddenSize
55 | local V = #self.dict.index_to_symbol
56 | local P = encoder_size
57 | print(H, P)
58 |
59 | -- Input
60 | local context_input = nn.Identity()()
61 | local encoder_input = nn.Identity()()
62 | local position_input = nn.Identity()()
63 |
64 | local lookup = nn.LookupTable(V, D)(context_input)
65 | local encoder_node = encoder({encoder_input, position_input, context_input})
66 |
67 | -- tanh W (E y)
68 | local lm_mlp = nn.Tanh()(nn.Linear(D * N, H)(nn.View(D * N)(lookup)))
69 |
70 | -- Second layer: takes LM and encoder model.
71 | local mlp = nn.Linear(H + P, V)(nn.View(H + P)(nn.JoinTable(2)(
72 | {lm_mlp, encoder_node})))
73 | self.soft_max = nn.LogSoftMax()(mlp)
74 |
75 | -- Input is conditional context and ngram context.
76 | self.mlp = nn.gModule({encoder_input, position_input, context_input},
77 | {self.soft_max})
78 |
79 | self.criterion = nn.ClassNLLCriterion()
80 | self.lookup = lookup.data.module
81 | self.mlp:cuda()
82 | self.criterion:cuda()
83 | collectgarbage()
84 | end
85 |
86 |
87 | -- Run validation
88 | function nnlm:validation(valid_data)
89 | print("[Running Validation]")
90 |
91 | local offset = 1000
92 | local loss = 0
93 | local total = 0
94 |
95 | valid_data:reset()
96 | while not valid_data:is_done() do
97 | local input, target = valid_data:next_batch(offset)
98 | local out = self.mlp:forward(input)
99 | local err = self.criterion:forward(out, target) * target:size(1)
100 |
101 | -- Augment counters.
102 | loss = loss + err
103 | total = total + target:size(1)
104 | end
105 | print(string.format("[perp: %f validation: %f total: %d]",
106 | math.exp(loss/total),
107 | loss/total, total))
108 | return loss / total
109 | end
110 |
111 |
112 | function nnlm:renorm(data, th)
113 | local size = data:size(1)
114 | for i = 1, size do
115 | local norm = data[i]:norm()
116 | if norm > th then
117 | data[i]:div(norm/th)
118 | end
119 | end
120 | end
121 |
122 |
123 | function nnlm:renorm_tables()
124 | -- Renormalize the lookup tables.
125 | if self.lookup ~= nil then
126 | print(self.lookup.weight:size())
127 | print(self.lookup.weight:type())
128 | self:renorm(self.lookup.weight, 1)
129 | end
130 | if self.encoder_model.lookup ~= nil then
131 | self:renorm(self.encoder_model.lookup.weight, 1)
132 | if self.encoder_model.title_lookup ~= nil then
133 | self:renorm(self.encoder_model.title_lookup.weight, 1)
134 | end
135 | end
136 | if self.encoder_model.lookups ~= nil then
137 | for i = 1, #self.encoder_model.lookups do
138 | self:renorm(self.encoder_model.lookups[i].weight, 1)
139 | end
140 | end
141 | end
142 |
143 |
144 | function nnlm:run_valid(valid_data)
145 | -- Run validation.
146 | if valid_data ~= nil then
147 | local cur_valid_loss = self:validation(valid_data)
148 | -- If valid loss does not improve drop learning rate.
149 | if cur_valid_loss > self.last_valid_loss then
150 | self.opt.learningRate = self.opt.learningRate / 2
151 | end
152 | self.last_valid_loss = cur_valid_loss
153 | end
154 |
155 | -- Save the model.
156 | self:save(self.opt.modelFilename)
157 | end
158 |
159 |
160 | function nnlm:train(data, valid_data)
161 | -- Best loss seen yet.
162 | self.last_valid_loss = 1e9
163 | -- Train
164 | for epoch = 1, self.opt.epochs do
165 | data:reset()
166 | self:renorm_tables()
167 | self:run_valid(valid_data)
168 |
169 | -- Loss for the epoch.
170 | local epoch_loss = 0
171 | local batch = 1
172 | local last_batch = 1
173 | local total = 0
174 | local loss = 0
175 |
176 | sys.tic()
177 | while not data:is_done() do
178 | local input, target = data:next_batch(self.opt.miniBatchSize)
179 | if data:is_done() then break end
180 |
181 | local out = self.mlp:forward(input)
182 | local err = self.criterion:forward(out, target) * target:size(1)
183 | local deriv = self.criterion:backward(out, target)
184 |
185 | if not utils.isnan(err) then
186 | loss = loss + err
187 | epoch_loss = epoch_loss + err
188 |
189 | self.mlp:zeroGradParameters()
190 | self.mlp:backward(input, deriv)
191 | self.mlp:updateParameters(self.opt.learningRate)
192 | else
193 | print("NaN")
194 | print(input)
195 | end
196 |
197 | -- Logging
198 | if batch % self.opt.printEvery == 1 then
199 | print(string.format(
200 | "[Loss: %f Epoch: %d Position: %d Rate: %f Time: %f]",
201 | loss / ((batch - last_batch) * self.opt.miniBatchSize),
202 | epoch,
203 | batch * self.opt.miniBatchSize,
204 | self.opt.learningRate,
205 | sys.toc()
206 | ))
207 | sys.tic()
208 | last_batch = batch
209 | loss = 0
210 | end
211 |
212 | batch = batch + 1
213 | total = total + input[1]:size(1)
214 | end
215 | print(string.format("[EPOCH : %d LOSS: %f TOTAL: %d BATCHES: %d]",
216 | epoch, epoch_loss / total, total, batch))
217 | end
218 | end
219 |
220 |
221 | function nnlm:save(fname)
222 | print("[saving mlp: " .. fname .. "]")
223 | torch.save(fname, self)
224 | return true
225 | end
226 |
227 |
228 | function nnlm:load(fname)
229 | local new_self = torch.load(fname)
230 | for k, v in pairs(new_self) do
231 | if k ~= 'opt' then
232 | self[k] = v
233 | end
234 | end
235 | return true
236 | end
237 |
238 |
239 | return nnlm
240 |
--------------------------------------------------------------------------------
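The resulting self.mlp takes the same input triple that data.lua produces and returns LogSoftMax scores, so a forward pass looks like this (tensors as produced by data:next_batch or data.make_input):

   local out = mlp.mlp:forward({article_rows, positions, context})
   -- out[k][w] is the log-probability of title word w given the k'th context,
   -- which is what beam_search.lua consumes as model_scores.
--------------------------------------------------------------------------------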
/summary/run.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | require('torch')
14 | require('nn')
15 | require('sys')
16 |
17 | local nnlm = require('summary.nnlm')
18 | local encoder = require('summary.encoder')
19 | local beam = require('summary.beam_search')
20 | local utils = require('summary.util')
21 |
22 | cmd = torch.CmdLine()
23 |
24 | beam.addOpts(cmd)
25 |
26 | cutorch.setDevice(2)
27 |
28 | cmd:option('-modelFilename', '', 'Model to test.')
29 | cmd:option('-inputf', '', 'Input article file.')
30 | cmd:option('-nbest', false, 'Write out the nbest list in ZMert format.')
31 | cmd:option('-length', 15, 'Maximum length of summary.')
32 | opt = cmd:parse(arg)
33 |
34 | -- Map the words from one dictionary to another.
35 | local function sync_dicts(dict1, dict2)
36 | local dict_map = torch.ones(#dict1.index_to_symbol):long()
37 | for i = 1, #dict1.index_to_symbol do
38 | local res = dict2.symbol_to_index[dict1.index_to_symbol[i]]
39 | dict_map[i] = res or 1
40 | end
41 | return dict_map
42 | end
43 |
44 | -- Apply digit preprocessing.
45 | local function process_word(input_word)
46 | local word = string.lower(input_word)
47 | for i = 1, word:len() do
48 | if word:sub(i, i) >= '0' and word:sub(i, i) <= '9' then
49 | word = word:sub(1, i-1) .. '#' .. word:sub(i+1)
50 | end
51 | end
52 | return word
53 | end
54 |
55 | local function main()
56 | -- Load in the dictionaries and the input files.
57 | local mlp = nnlm.create_lm(opt)
58 | mlp:load(opt.modelFilename)
59 | local adict = mlp.encoder_dict
60 | local tdict = mlp.dict
61 |
62 | local dict_map = sync_dicts(adict, tdict)
63 | local sent_file = assert(io.open(opt.inputf))
64 | local len = opt.length
65 | local W = mlp.window
66 | opt.window = W
67 |
68 | local sent_num = 0
69 | for line in sent_file:lines() do
70 | sent_num = sent_num + 1
71 |
72 | -- Add padding.
73 |       local true_line = "<s> <s> <s> " .. line .. " </s> </s> </s>"
74 | local words = utils.string_split(true_line)
75 |
76 | local article = torch.zeros(#words)
77 | for j = 1, #words do
78 | local word = process_word(words[j])
79 | article[j] = adict.symbol_to_index[word] or
80 |             adict.symbol_to_index["<unk>"]
81 | end
82 |
83 | -- Run beam search.
84 | local sbeam = beam.init(opt, mlp.mlp, mlp.encoder_model,
85 | dict_map, tdict)
86 | local results = sbeam:generate(article, len)
87 |
88 | if not opt.nbest then
89 | if #results == 0 then
90 | io.write("*FAIL*")
91 | else
92 | -- Print out in standard format.
93 | local len, _, output, _ = unpack(results[1])
94 | local total = 0
95 | for j = W+2, W+len - 1 do
96 | local word = tdict.index_to_symbol[output[j]]
97 | total = total + #word + 1
98 | io.write(word, " " )
99 | end
100 | end
101 | print("")
102 | else
103 | -- Print out an nbest list in Moses/ZMert format.
104 | for k = 1, #results do
105 | io.write(sent_num-1, " ||| ")
106 | local len, score, output, features = unpack(results[k])
107 | for j = W+2, W+len - 1 do
108 | io.write(tdict.index_to_symbol[output[j]], " " )
109 | end
110 | io.write(" ||| ")
111 | for f = 1, features:size(1) do
112 | io.write(features[f], " ")
113 | end
114 | io.write(" ||| ", score)
115 | print("")
116 | end
117 | end
118 | end
119 | end
120 |
121 | main()
122 |
--------------------------------------------------------------------------------
/summary/train.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- The top-level training script
14 | require('torch')
15 | require('nngraph')
16 |
17 | local nnlm = require('summary.nnlm')
18 | local data = require('summary.data')
19 | local encoder = require('summary.encoder')
20 |
21 | cmd = torch.CmdLine()
22 | cmd:text()
23 | cmd:text()
24 | cmd:text('Train a summarization model.')
25 | cmd:text()
26 |
27 | data.add_opts(cmd)
28 | encoder.add_opts(cmd)
29 | nnlm.addOpts(cmd)
30 |
31 | opt = cmd:parse(arg)
32 |
33 | local function main()
34 | -- Load in the data.
35 | local tdata = data.load_title(opt.titleDir, true)
36 | local article_data = data.load_article(opt.articleDir)
37 |
38 | local valid_data = data.load_title(opt.validTitleDir, nil, tdata.dict)
39 | local valid_article_data =
40 | data.load_article(opt.validArticleDir, article_data.dict)
41 |
42 | -- Make main LM
43 | local train_data = data.init(tdata, article_data)
44 | local valid = data.init(valid_data, valid_article_data)
45 | local encoder_mlp = encoder.build(opt, train_data)
46 | local mlp = nnlm.create_lm(opt, tdata.dict, encoder_mlp,
47 | opt.bowDim, article_data.dict)
48 |
49 | mlp:train(train_data, valid)
50 | end
51 |
52 | main()
53 |
--------------------------------------------------------------------------------
/summary/util.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright (c) 2015, Facebook, Inc.
3 | -- All rights reserved.
4 | --
5 | -- This source code is licensed under the BSD-style license found in the
6 | -- LICENSE file in the root directory of this source tree. An additional grant
7 | -- of patent rights can be found in the PATENTS file in the same directory.
8 | --
9 | -- Author: Alexander M Rush
10 | -- Sumit Chopra
11 | -- Jason Weston
12 |
13 | -- The utility tool box
14 | local util = {}
15 |
16 | function util.string_shortfloat(t)
17 | return string.format('%2.4g', t)
18 | end
19 |
20 | function util.shuffleTable(t)
21 | local rand = math.random
22 | local iterations = #t
23 | local j
24 | for i = iterations, 2, -1 do
25 | j = rand(i)
26 | t[i], t[j] = t[j], t[i]
27 | end
28 | end
29 |
30 |
31 | function util.string_split(s, c)
32 | if c==nil then c=' ' end
33 | local t={}
34 | while true do
35 | local f=s:find(c)
36 | if f==nil then
37 | if s:len()>0 then
38 | table.insert(t, s)
39 | end
40 | break
41 | end
42 | if f > 1 then
43 | table.insert(t, s:sub(1,f-1))
44 | end
45 | s=s:sub(f+1,s:len())
46 | end
47 | return t
48 | end
49 |
50 |
51 | function util.add(tab, key)
52 | local cur = tab
53 |
54 | for i = 1, #key-1 do
55 | local new_cur = cur[key[i]]
56 | if new_cur == nil then
57 | cur[key[i]] = {}
58 | new_cur = cur[key[i]]
59 | end
60 | cur = new_cur
61 | end
62 | cur[key[#key]] = true
63 | end
64 |
65 | function util.has(tab, key)
66 | local cur = tab
67 | for i = 1, #key do
68 | cur = cur[key[i]]
69 | if cur == nil then
70 | return false
71 | end
72 | end
73 | return true
74 | end
75 |
76 | function util.isnan(x)
77 | return x ~= x
78 | end
79 |
80 | return util
81 |
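util.add and util.has treat a nested Lua table as a set of key sequences: each element of the key indexes one level of nesting, and the final element is marked true, so membership of a whole token sequence can be tested in one call. A rough Python analogue, purely for illustration (the function names mirror the Lua ones; nothing here is part of the repo):

# Nested-dict "set of sequences", analogous to util.add / util.has.
def add(tab, key):
    cur = tab
    for k in key[:-1]:
        cur = cur.setdefault(k, {})   # descend, creating levels as needed
    cur[key[-1]] = True

def has(tab, key):
    cur = tab
    for k in key:
        if not isinstance(cur, dict) or k not in cur:
            return False
        cur = cur[k]
    return True

seen = {}
add(seen, ["the", "cat"])
print(has(seen, ["the", "cat"]))   # True
print(has(seen, ["the", "dog"]))   # False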
--------------------------------------------------------------------------------
/test_model.sh:
--------------------------------------------------------------------------------
1 | export LUA_PATH="$LUA_PATH;?.lua"
2 |
3 | th summary/run.lua \
4 | -modelFilename $2 \
5 | -inputf $1 \
6 | -length $3 \
7 | -blockRepeatWords
8 |
9 |
--------------------------------------------------------------------------------
/train_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export WINDOW=5
4 | export OUT_DIR=$1/processed
5 | export MDL_DIR=$1/models
6 |
7 | export LUA_PATH="$LUA_PATH;$ABS/?.lua"
8 |
9 | #bash $ABS/prep_torch_data.sh $2
10 |
11 | mkdir -p $MDL_DIR
12 |
13 | th -i $ABS/summary/train.lua -titleDir $OUT_DIR/train/title/ \
14 | -articleDir $OUT_DIR/train/article/ \
15 | -modelFilename $MDL_DIR/$2 \
16 | -miniBatchSize 64 \
17 | -embeddingDim 64 \
18 | -bowDim 200 \
19 | -hiddenSize 64 \
20 | -epochs 20 \
21 | -learningRate 0.1 \
22 | -validArticleDir $OUT_DIR/valid.filter/article/ \
23 | -validTitleDir $OUT_DIR/valid.filter/title/ \
24 | -window $WINDOW \
25 | -printEvery 100 \
26 | -encoderModel "attenbow" \
27 | -attenPool 5 \
28 |
--------------------------------------------------------------------------------
/tuning/SDecoder_cfg.txt:
--------------------------------------------------------------------------------
1 | LM 1.0
2 | uni 4.84922778048135
3 | bi 1.2132386742991166
4 | tri -13.382831610766107
5 | ooo -0.5293249226416208
6 | length 0.0
7 |
--------------------------------------------------------------------------------
/tuning/SDecoder_cmd.tpl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import os
4 | import sys
5 |
6 | d = {"src" : ,
7 | "model" : ,
8 | "title_len" : }
9 |
10 | for l in open("SDecoder_cfg.txt"):
11 | f, val = l.strip().split()
12 | d[f] = val
13 |
14 | cmd = "cd $ABS; th $ABS/summary/run.lua -modelFilename {model} " + \
15 | "-inputf {src} " + \
16 | "-length {title_len} -blockRepeatWords -recombine " + \
17 | "-beamSize 50 " + \
18 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \
19 | "-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} " + \
20 | "-nbest > $ABS/tuning/nbest.out"
21 |
22 | os.system(cmd.format(**d))
23 |
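The .tpl leaves the src, model, and title_len values blank on purpose: they are meant to be filled in when the template is instantiated as the SDecoder_cmd.py that ZMERT_cfg.txt points at (SDecoder_test.py below reads the same three values from sys.argv). A purely hypothetical filled-in header, with invented paths:

# Hypothetical example of the filled-in dictionary; the paths are made up,
# and the title length simply mirrors the value used in SDecoder_test.py.
d = {"src": "/path/to/tune.article.txt",
     "model": "/path/to/models/summary.mdl",
     "title_len": 14}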
--------------------------------------------------------------------------------
/tuning/SDecoder_test.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2015, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree. An additional grant
7 | # of patent rights can be found in the PATENTS file in the same directory.
8 | #
9 | # Author: Alexander M Rush
10 | # Sumit Chopra
11 | # Jason Weston
12 |
13 | import os
14 | import sys
15 | #@lint-avoid-python-3-compatibility-imports
16 |
17 | d = {"src": sys.argv[1],
18 | "model": sys.argv[2],
19 | "title_len": 14}
20 |
21 | for l in open("tuning/blank.params"):
22 | f, val = l.strip().split()
23 | d[f] = val
24 |
25 | cmd = "cd $ABS; $CUTH $ABS/summary/run.lua -modelFilename {model} " + \
26 | "-inputf {src} -recombine " + \
27 | "-length {title_len} -blockRepeatWords " + \
28 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \
29 | "-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} "
30 |
31 | os.system(cmd.format(**d))
32 |
--------------------------------------------------------------------------------
/tuning/ZMERT_cfg.txt:
--------------------------------------------------------------------------------
1 | ### Commonly used parameters
2 | -r ref # target sentences file name (in this case, file name prefix)
3 | -rps 4 # references per sentence
4 | -p params.txt # parameter file
5 | -m BLEU 4 closest # evaluation metric and its options
6 | -ipi 20 # number of intermediate initial points
7 | -cmd ./SDecoder_cmd.py # file containing commands to run decoder
8 | -decOut nbest.out # file produced by decoder
9 | -dcfg SDecoder_cfg.txt # decoder config file
10 | -N 500 # size of N-best list generated each iteration
11 | -v 1 # verbosity level (0-2; higher value => more verbose output)
12 | -seed 12341234 # random number generator seed
13 |
--------------------------------------------------------------------------------
/tuning/params.txt:
--------------------------------------------------------------------------------
1 | LM ||| 1.0 Fix 0.0 +Inf -1 +1
2 | uni ||| 0.0 Opt -Inf +Inf -1 +1
3 | bi ||| 0.0 Opt -Inf +Inf -1 +1
4 | tri ||| 0.0 Opt -Inf +Inf -1 +1
5 | ooo ||| 0.0 Opt -Inf 0 -1 +1
6 | length ||| 0.0 Fix -Inf +Inf -1 +1
7 | normalization = none
8 |
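Each line of params.txt declares one feature weight for Z-MERT: the name (matching a key in SDecoder_cfg.txt), then, after "|||", its starting value, whether it is optimized (Opt) or held fixed (Fix), its lower and upper bounds, and, on the usual reading of the Z-MERT format, the range from which random restart points are drawn. Z-MERT rewrites the -dcfg file (SDecoder_cfg.txt) with the current weights before each decoding pass. A small parsing sketch; the interpretation of the last four numbers is an assumption about the Z-MERT convention, not something stated in this repo.

# Sketch: read params.txt into a dict of named fields.
def read_params(path="tuning/params.txt"):
    params = {}
    for line in open(path):
        if "|||" not in line:                     # e.g. "normalization = none"
            continue
        name, rest = [p.strip() for p in line.split("|||")]
        init, mode, lo, hi, rand_lo, rand_hi = rest.split()
        params[name] = {"init": float(init), "mode": mode,
                        "bounds": (lo, hi), "random_range": (rand_lo, rand_hi)}
    return params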
--------------------------------------------------------------------------------